Merge commit 'tip/perfcounters-for-linus' into oprofile/master

author Robert Richter <robert.richter@amd.com>

Fri, 12 Jun 2009 15:58:48 +0000 (17:58 +0200)

committer Robert Richter <robert.richter@amd.com>

Fri, 12 Jun 2009 15:58:48 +0000 (17:58 +0200)
author Robert Richter <robert.richter@amd.com>
Fri, 12 Jun 2009 15:58:48 +0000 (17:58 +0200)
committer Robert Richter <robert.richter@amd.com>
Fri, 12 Jun 2009 15:58:48 +0000 (17:58 +0200)
diff --git a/Documentation/ABI/testing/sysfs-devices-cache_disable b/Documentation/ABI/testing/sysfs-devices-cache_disable

new file mode 100644 (file)

index 0000000..175bb4f
--- /dev/null
+++ b/Documentation/ABI/testing/sysfs-devices-cache_disable
@@ -0,0 +1,18 @@
+What:      /sys/devices/system/cpu/cpu*/cache/index*/cache_disable_X
+Date:      August 2008
+KernelVersion: 2.6.27
+Contact:       mark.langsdorf@amd.com
+Description:   These files exist in every cpu's cache index directories.
+               There are currently 2 cache_disable_# files in each
+               directory.  Reading from these files on a supported
+               processor will return that cache disable index value
+               for that processor and node.  Writing to one of these
+               files will cause the specificed cache index to be disabled.
+
+               Currently, only AMD Family 10h Processors support cache index
+               disable, and only for their L3 caches.  See the BIOS and
+               Kernel Developer's Guide at
+               http://www.amd.com/us-en/assets/content_type/white_papers_and_tech_docs/31116-Public-GH-BKDG_3.20_2-4-09.pdf
+               for formatting information and other details on the
+               cache index disable.
+Users:    joachim.deguara@amd.com
diff --git a/Documentation/DMA-API.txt b/Documentation/DMA-API.txt

index d9aa43d78bcc9fc92e582a62cadb18cd9b27f4f0..25fb8bcf32a276f280d44aa53a4f2712a14257f4 100644 (file)
--- a/Documentation/DMA-API.txt
+++ b/Documentation/DMA-API.txt
@@ -704,12 +704,24 @@ this directory the following files can currently be found:
                                 The current number of free dma_debug_entries
                                 in the allocator.
  
+       dma-api/driver-filter
+                               You can write a name of a driver into this file
+                               to limit the debug output to requests from that
+                               particular driver. Write an empty string to
+                               that file to disable the filter and see
+                               all errors again.
+
  If you have this code compiled into your kernel it will be enabled by default.
  If you want to boot without the bookkeeping anyway you can provide
  'dma_debug=off' as a boot parameter. This will disable DMA-API debugging.
  Notice that you can not enable it again at runtime. You have to reboot to do
  so.
  
+If you want to see debug messages only for a special device driver you can
+specify the dma_debug_driver=<drivername> parameter. This will enable the
+driver filter at boot time. The debug code will only print errors for that
+driver afterwards. This filter can be disabled or changed later using debugfs.
+
  When the code disables itself at runtime this is most likely because it ran
  out of dma_debug_entries. These entries are preallocated at boot. The number
  of preallocated entries is defined per architecture. If it is too low for you
diff --git a/Documentation/DocBook/Makefile b/Documentation/DocBook/Makefile

index b1eb661e6302a3e5aeae0a58d3cda169f46f66a2..9632444f6c6270b87894e2381f95985ed3126d7a 100644 (file)
--- a/Documentation/DocBook/Makefile
+++ b/Documentation/DocBook/Makefile
@@ -13,7 +13,8 @@ DOCBOOKS := z8530book.xml mcabook.xml device-drivers.xml \
             gadget.xml libata.xml mtdnand.xml librs.xml rapidio.xml \
             genericirq.xml s390-drivers.xml uio-howto.xml scsi.xml \
             mac80211.xml debugobjects.xml sh.xml regulator.xml \
-           alsa-driver-api.xml writing-an-alsa-driver.xml
+           alsa-driver-api.xml writing-an-alsa-driver.xml \
+           tracepoint.xml
  
  ###
  # The build process is as follows (targets):
diff --git a/Documentation/DocBook/tracepoint.tmpl b/Documentation/DocBook/tracepoint.tmpl

new file mode 100644 (file)

index 0000000..b0756d0
--- /dev/null
+++ b/Documentation/DocBook/tracepoint.tmpl
@@ -0,0 +1,89 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!DOCTYPE book PUBLIC "-//OASIS//DTD DocBook XML V4.1.2//EN"
+       "http://www.oasis-open.org/docbook/xml/4.1.2/docbookx.dtd" []>
+
+<book id="Tracepoints">
+ <bookinfo>
+  <title>The Linux Kernel Tracepoint API</title>
+
+  <authorgroup>
+   <author>
+    <firstname>Jason</firstname>
+    <surname>Baron</surname>
+    <affiliation>
+     <address>
+      <email>jbaron@redhat.com</email>
+     </address>
+    </affiliation>
+   </author>
+  </authorgroup>
+
+  <legalnotice>
+   <para>
+     This documentation is free software; you can redistribute
+     it and/or modify it under the terms of the GNU General Public
+     License as published by the Free Software Foundation; either
+     version 2 of the License, or (at your option) any later
+     version.
+   </para>
+
+   <para>
+     This program is distributed in the hope that it will be
+     useful, but WITHOUT ANY WARRANTY; without even the implied
+     warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+     See the GNU General Public License for more details.
+   </para>
+
+   <para>
+     You should have received a copy of the GNU General Public
+     License along with this program; if not, write to the Free
+     Software Foundation, Inc., 59 Temple Place, Suite 330, Boston,
+     MA 02111-1307 USA
+   </para>
+
+   <para>
+     For more details see the file COPYING in the source
+     distribution of Linux.
+   </para>
+  </legalnotice>
+ </bookinfo>
+
+ <toc></toc>
+  <chapter id="intro">
+   <title>Introduction</title>
+   <para>
+     Tracepoints are static probe points that are located in strategic points
+     throughout the kernel. 'Probes' register/unregister with tracepoints
+     via a callback mechanism. The 'probes' are strictly typed functions that
+     are passed a unique set of parameters defined by each tracepoint.
+   </para>
+
+   <para>
+     From this simple callback mechanism, 'probes' can be used to profile, debug,
+     and understand kernel behavior. There are a number of tools that provide a
+     framework for using 'probes'. These tools include Systemtap, ftrace, and
+     LTTng.
+   </para>
+
+   <para>
+     Tracepoints are defined in a number of header files via various macros. Thus,
+     the purpose of this document is to provide a clear accounting of the available
+     tracepoints. The intention is to understand not only what tracepoints are
+     available but also to understand where future tracepoints might be added.
+   </para>
+
+   <para>
+     The API presented has functions of the form:
+     <function>trace_tracepointname(function parameters)</function>. These are the
+     tracepoints callbacks that are found throughout the code. Registering and
+     unregistering probes with these callback sites is covered in the
+     <filename>Documentation/trace/*</filename> directory.
+   </para>
+  </chapter>
+
+  <chapter id="irq">
+   <title>IRQ</title>
+!Iinclude/trace/events/irq.h
+  </chapter>
+
+</book>
diff --git a/Documentation/RCU/trace.txt b/Documentation/RCU/trace.txt

index 068848240a8bdf135ba706da5c3d01a43609397a..02cced183b2d63f81428b3d6a21064397c7a7533 100644 (file)
--- a/Documentation/RCU/trace.txt
+++ b/Documentation/RCU/trace.txt
@@ -192,23 +192,24 @@ rcu/rcuhier (which displays the struct rcu_node hierarchy).
  The output of "cat rcu/rcudata" looks as follows:
  
  rcu:
-  0 c=4011 g=4012 pq=1 pqc=4011 qp=0 rpfq=1 rp=3c2a dt=23301/73 dn=2 df=1882 of=0 ri=2126 ql=2 b=10
-  1 c=4011 g=4012 pq=1 pqc=4011 qp=0 rpfq=3 rp=39a6 dt=78073/1 dn=2 df=1402 of=0 ri=1875 ql=46 b=10
-  2 c=4010 g=4010 pq=1 pqc=4010 qp=0 rpfq=-5 rp=1d12 dt=16646/0 dn=2 df=3140 of=0 ri=2080 ql=0 b=10
-  3 c=4012 g=4013 pq=1 pqc=4012 qp=1 rpfq=3 rp=2b50 dt=21159/1 dn=2 df=2230 of=0 ri=1923 ql=72 b=10
-  4 c=4012 g=4013 pq=1 pqc=4012 qp=1 rpfq=3 rp=1644 dt=5783/1 dn=2 df=3348 of=0 ri=2805 ql=7 b=10
-  5 c=4012 g=4013 pq=0 pqc=4011 qp=1 rpfq=3 rp=1aac dt=5879/1 dn=2 df=3140 of=0 ri=2066 ql=10 b=10
-  6 c=4012 g=4013 pq=1 pqc=4012 qp=1 rpfq=3 rp=ed8 dt=5847/1 dn=2 df=3797 of=0 ri=1266 ql=10 b=10
-  7 c=4012 g=4013 pq=1 pqc=4012 qp=1 rpfq=3 rp=1fa2 dt=6199/1 dn=2 df=2795 of=0 ri=2162 ql=28 b=10
+rcu:
+  0 c=17829 g=17829 pq=1 pqc=17829 qp=0 dt=10951/1 dn=0 df=1101 of=0 ri=36 ql=0 b=10
+  1 c=17829 g=17829 pq=1 pqc=17829 qp=0 dt=16117/1 dn=0 df=1015 of=0 ri=0 ql=0 b=10
+  2 c=17829 g=17829 pq=1 pqc=17829 qp=0 dt=1445/1 dn=0 df=1839 of=0 ri=0 ql=0 b=10
+  3 c=17829 g=17829 pq=1 pqc=17829 qp=0 dt=6681/1 dn=0 df=1545 of=0 ri=0 ql=0 b=10
+  4 c=17829 g=17829 pq=1 pqc=17829 qp=0 dt=1003/1 dn=0 df=1992 of=0 ri=0 ql=0 b=10
+  5 c=17829 g=17830 pq=1 pqc=17829 qp=1 dt=3887/1 dn=0 df=3331 of=0 ri=4 ql=2 b=10
+  6 c=17829 g=17829 pq=1 pqc=17829 qp=0 dt=859/1 dn=0 df=3224 of=0 ri=0 ql=0 b=10
+  7 c=17829 g=17830 pq=0 pqc=17829 qp=1 dt=3761/1 dn=0 df=1818 of=0 ri=0 ql=2 b=10
  rcu_bh:
-  0 c=-268 g=-268 pq=1 pqc=-268 qp=0 rpfq=-145 rp=21d6 dt=23301/73 dn=2 df=0 of=0 ri=0 ql=0 b=10
-  1 c=-268 g=-268 pq=1 pqc=-268 qp=1 rpfq=-170 rp=20ce dt=78073/1 dn=2 df=26 of=0 ri=5 ql=0 b=10
-  2 c=-268 g=-268 pq=1 pqc=-268 qp=1 rpfq=-83 rp=fbd dt=16646/0 dn=2 df=28 of=0 ri=4 ql=0 b=10
-  3 c=-268 g=-268 pq=1 pqc=-268 qp=0 rpfq=-105 rp=178c dt=21159/1 dn=2 df=28 of=0 ri=2 ql=0 b=10
-  4 c=-268 g=-268 pq=1 pqc=-268 qp=1 rpfq=-30 rp=b54 dt=5783/1 dn=2 df=32 of=0 ri=0 ql=0 b=10
-  5 c=-268 g=-268 pq=1 pqc=-268 qp=1 rpfq=-29 rp=df5 dt=5879/1 dn=2 df=30 of=0 ri=3 ql=0 b=10
-  6 c=-268 g=-268 pq=1 pqc=-268 qp=1 rpfq=-28 rp=788 dt=5847/1 dn=2 df=32 of=0 ri=0 ql=0 b=10
-  7 c=-268 g=-268 pq=1 pqc=-268 qp=1 rpfq=-53 rp=1098 dt=6199/1 dn=2 df=30 of=0 ri=3 ql=0 b=10
+  0 c=-275 g=-275 pq=1 pqc=-275 qp=0 dt=10951/1 dn=0 df=0 of=0 ri=0 ql=0 b=10
+  1 c=-275 g=-275 pq=1 pqc=-275 qp=0 dt=16117/1 dn=0 df=13 of=0 ri=0 ql=0 b=10
+  2 c=-275 g=-275 pq=1 pqc=-275 qp=0 dt=1445/1 dn=0 df=15 of=0 ri=0 ql=0 b=10
+  3 c=-275 g=-275 pq=1 pqc=-275 qp=0 dt=6681/1 dn=0 df=9 of=0 ri=0 ql=0 b=10
+  4 c=-275 g=-275 pq=1 pqc=-275 qp=0 dt=1003/1 dn=0 df=15 of=0 ri=0 ql=0 b=10
+  5 c=-275 g=-275 pq=1 pqc=-275 qp=0 dt=3887/1 dn=0 df=15 of=0 ri=0 ql=0 b=10
+  6 c=-275 g=-275 pq=1 pqc=-275 qp=0 dt=859/1 dn=0 df=15 of=0 ri=0 ql=0 b=10
+  7 c=-275 g=-275 pq=1 pqc=-275 qp=0 dt=3761/1 dn=0 df=15 of=0 ri=0 ql=0 b=10
  
  The first section lists the rcu_data structures for rcu, the second for
  rcu_bh.  Each section has one line per CPU, or eight for this 8-CPU system.
@@ -253,12 +254,6 @@ o  "pqc" indicates which grace period the last-observed quiescent
  o      "qp" indicates that RCU still expects a quiescent state from
         this CPU.
  
-o      "rpfq" is the number of rcu_pending() calls on this CPU required
-       to induce this CPU to invoke force_quiescent_state().
-
-o      "rp" is low-order four hex digits of the count of how many times
-       rcu_pending() has been invoked on this CPU.
-
  o      "dt" is the current value of the dyntick counter that is incremented
         when entering or leaving dynticks idle state, either by the
         scheduler or by irq.  The number after the "/" is the interrupt
@@ -305,6 +300,9 @@ o   "b" is the batch limit for this CPU.  If more than this number
         of RCU callbacks is ready to invoke, then the remainder will
         be deferred.
  
+There is also an rcu/rcudata.csv file with the same information in
+comma-separated-variable spreadsheet format.
+
  
  The output of "cat rcu/rcugp" looks as follows:
  
@@ -411,3 +409,63 @@ o  Each element of the form "1/1 0:127 ^0" represents one struct
                 For example, the first entry at the lowest level shows
                 "^0", indicating that it corresponds to bit zero in
                 the first entry at the middle level.
+
+
+The output of "cat rcu/rcu_pending" looks as follows:
+
+rcu:
+  0 np=255892 qsp=53936 cbr=0 cng=14417 gpc=10033 gps=24320 nf=6445 nn=146741
+  1 np=261224 qsp=54638 cbr=0 cng=25723 gpc=16310 gps=2849 nf=5912 nn=155792
+  2 np=237496 qsp=49664 cbr=0 cng=2762 gpc=45478 gps=1762 nf=1201 nn=136629
+  3 np=236249 qsp=48766 cbr=0 cng=286 gpc=48049 gps=1218 nf=207 nn=137723
+  4 np=221310 qsp=46850 cbr=0 cng=26 gpc=43161 gps=4634 nf=3529 nn=123110
+  5 np=237332 qsp=48449 cbr=0 cng=54 gpc=47920 gps=3252 nf=201 nn=137456
+  6 np=219995 qsp=46718 cbr=0 cng=50 gpc=42098 gps=6093 nf=4202 nn=120834
+  7 np=249893 qsp=49390 cbr=0 cng=72 gpc=38400 gps=17102 nf=41 nn=144888
+rcu_bh:
+  0 np=146741 qsp=1419 cbr=0 cng=6 gpc=0 gps=0 nf=2 nn=145314
+  1 np=155792 qsp=12597 cbr=0 cng=0 gpc=4 gps=8 nf=3 nn=143180
+  2 np=136629 qsp=18680 cbr=0 cng=0 gpc=7 gps=6 nf=0 nn=117936
+  3 np=137723 qsp=2843 cbr=0 cng=0 gpc=10 gps=7 nf=0 nn=134863
+  4 np=123110 qsp=12433 cbr=0 cng=0 gpc=4 gps=2 nf=0 nn=110671
+  5 np=137456 qsp=4210 cbr=0 cng=0 gpc=6 gps=5 nf=0 nn=133235
+  6 np=120834 qsp=9902 cbr=0 cng=0 gpc=6 gps=3 nf=2 nn=110921
+  7 np=144888 qsp=26336 cbr=0 cng=0 gpc=8 gps=2 nf=0 nn=118542
+
+As always, this is once again split into "rcu" and "rcu_bh" portions.
+The fields are as follows:
+
+o      "np" is the number of times that __rcu_pending() has been invoked
+       for the corresponding flavor of RCU.
+
+o      "qsp" is the number of times that the RCU was waiting for a
+       quiescent state from this CPU.
+
+o      "cbr" is the number of times that this CPU had RCU callbacks
+       that had passed through a grace period, and were thus ready
+       to be invoked.
+
+o      "cng" is the number of times that this CPU needed another
+       grace period while RCU was idle.
+
+o      "gpc" is the number of times that an old grace period had
+       completed, but this CPU was not yet aware of it.
+
+o      "gps" is the number of times that a new grace period had started,
+       but this CPU was not yet aware of it.
+
+o      "nf" is the number of times that this CPU suspected that the
+       current grace period had run for too long, and thus needed to
+       be forced.
+
+       Please note that "forcing" consists of sending resched IPIs
+       to holdout CPUs.  If that CPU really still is in an old RCU
+       read-side critical section, then we really do have to wait for it.
+       The assumption behing "forcing" is that the CPU is not still in
+       an old RCU read-side critical section, but has not yet responded
+       for some other reason.
+
+o      "nn" is the number of times that this CPU needed nothing.  Alert
+       readers will note that the rcu "nn" number for a given CPU very
+       closely matches the rcu_bh "np" number for that same CPU.  This
+       is due to short-circuit evaluation in rcu_pending().
diff --git a/Documentation/futex-requeue-pi.txt b/Documentation/futex-requeue-pi.txt

new file mode 100644 (file)

index 0000000..9dc1ff4
--- /dev/null
+++ b/Documentation/futex-requeue-pi.txt
@@ -0,0 +1,131 @@
+Futex Requeue PI
+----------------
+
+Requeueing of tasks from a non-PI futex to a PI futex requires
+special handling in order to ensure the underlying rt_mutex is never
+left without an owner if it has waiters; doing so would break the PI
+boosting logic [see rt-mutex-desgin.txt] For the purposes of
+brevity, this action will be referred to as "requeue_pi" throughout
+this document.  Priority inheritance is abbreviated throughout as
+"PI".
+
+Motivation
+----------
+
+Without requeue_pi, the glibc implementation of
+pthread_cond_broadcast() must resort to waking all the tasks waiting
+on a pthread_condvar and letting them try to sort out which task
+gets to run first in classic thundering-herd formation.  An ideal
+implementation would wake the highest-priority waiter, and leave the
+rest to the natural wakeup inherent in unlocking the mutex
+associated with the condvar.
+
+Consider the simplified glibc calls:
+
+/* caller must lock mutex */
+pthread_cond_wait(cond, mutex)
+{
+       lock(cond->__data.__lock);
+       unlock(mutex);
+       do {
+          unlock(cond->__data.__lock);
+          futex_wait(cond->__data.__futex);
+          lock(cond->__data.__lock);
+       } while(...)
+       unlock(cond->__data.__lock);
+       lock(mutex);
+}
+
+pthread_cond_broadcast(cond)
+{
+       lock(cond->__data.__lock);
+       unlock(cond->__data.__lock);
+       futex_requeue(cond->data.__futex, cond->mutex);
+}
+
+Once pthread_cond_broadcast() requeues the tasks, the cond->mutex
+has waiters. Note that pthread_cond_wait() attempts to lock the
+mutex only after it has returned to user space.  This will leave the
+underlying rt_mutex with waiters, and no owner, breaking the
+previously mentioned PI-boosting algorithms.
+
+In order to support PI-aware pthread_condvar's, the kernel needs to
+be able to requeue tasks to PI futexes.  This support implies that
+upon a successful futex_wait system call, the caller would return to
+user space already holding the PI futex.  The glibc implementation
+would be modified as follows:
+
+
+/* caller must lock mutex */
+pthread_cond_wait_pi(cond, mutex)
+{
+       lock(cond->__data.__lock);
+       unlock(mutex);
+       do {
+          unlock(cond->__data.__lock);
+          futex_wait_requeue_pi(cond->__data.__futex);
+          lock(cond->__data.__lock);
+       } while(...)
+       unlock(cond->__data.__lock);
+        /* the kernel acquired the the mutex for us */
+}
+
+pthread_cond_broadcast_pi(cond)
+{
+       lock(cond->__data.__lock);
+       unlock(cond->__data.__lock);
+       futex_requeue_pi(cond->data.__futex, cond->mutex);
+}
+
+The actual glibc implementation will likely test for PI and make the
+necessary changes inside the existing calls rather than creating new
+calls for the PI cases.  Similar changes are needed for
+pthread_cond_timedwait() and pthread_cond_signal().
+
+Implementation
+--------------
+
+In order to ensure the rt_mutex has an owner if it has waiters, it
+is necessary for both the requeue code, as well as the waiting code,
+to be able to acquire the rt_mutex before returning to user space.
+The requeue code cannot simply wake the waiter and leave it to
+acquire the rt_mutex as it would open a race window between the
+requeue call returning to user space and the waiter waking and
+starting to run.  This is especially true in the uncontended case.
+
+The solution involves two new rt_mutex helper routines,
+rt_mutex_start_proxy_lock() and rt_mutex_finish_proxy_lock(), which
+allow the requeue code to acquire an uncontended rt_mutex on behalf
+of the waiter and to enqueue the waiter on a contended rt_mutex.
+Two new system calls provide the kernel<->user interface to
+requeue_pi: FUTEX_WAIT_REQUEUE_PI and FUTEX_REQUEUE_CMP_PI.
+
+FUTEX_WAIT_REQUEUE_PI is called by the waiter (pthread_cond_wait()
+and pthread_cond_timedwait()) to block on the initial futex and wait
+to be requeued to a PI-aware futex.  The implementation is the
+result of a high-speed collision between futex_wait() and
+futex_lock_pi(), with some extra logic to check for the additional
+wake-up scenarios.
+
+FUTEX_REQUEUE_CMP_PI is called by the waker
+(pthread_cond_broadcast() and pthread_cond_signal()) to requeue and
+possibly wake the waiting tasks. Internally, this system call is
+still handled by futex_requeue (by passing requeue_pi=1).  Before
+requeueing, futex_requeue() attempts to acquire the requeue target
+PI futex on behalf of the top waiter.  If it can, this waiter is
+woken.  futex_requeue() then proceeds to requeue the remaining
+nr_wake+nr_requeue tasks to the PI futex, calling
+rt_mutex_start_proxy_lock() prior to each requeue to prepare the
+task as a waiter on the underlying rt_mutex.  It is possible that
+the lock can be acquired at this stage as well, if so, the next
+waiter is woken to finish the acquisition of the lock.
+
+FUTEX_REQUEUE_PI accepts nr_wake and nr_requeue as arguments, but
+their sum is all that really matters.  futex_requeue() will wake or
+requeue up to nr_wake + nr_requeue tasks.  It will wake only as many
+tasks as it can acquire the lock for, which in the majority of cases
+should be 0 as good programming practice dictates that the caller of
+either pthread_cond_broadcast() or pthread_cond_signal() acquire the
+mutex prior to making the call. FUTEX_REQUEUE_PI requires that
+nr_wake=1.  nr_requeue should be INT_MAX for broadcast and 0 for
+signal.
diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt

index 19d83e1fc017e8aafd7a6a36bd2395ca26f48ef1..4a3c2209a124b9d34f5305e66992cbc350e1df96 100644 (file)
--- a/Documentation/kernel-parameters.txt
+++ b/Documentation/kernel-parameters.txt
@@ -56,7 +56,6 @@ parameter is applicable:
         ISAPNP  ISA PnP code is enabled.
         ISDN    Appropriate ISDN support is enabled.
         JOY     Appropriate joystick support is enabled.
-       KMEMTRACE kmemtrace is enabled.
         LIBATA  Libata driver is enabled
         LP      Printer support is enabled.
         LOOP    Loopback device support is enabled.
@@ -329,11 +328,6 @@ and is between 256 and 4096 characters. It is defined in the file
                                     flushed before they will be reused, which
                                     is a lot of faster
  
-       amd_iommu_size= [HW,X86-64]
-                       Define the size of the aperture for the AMD IOMMU
-                       driver. Possible values are:
-                       '32M', '64M' (default), '128M', '256M', '512M', '1G'
-
         amijoy.map=     [HW,JOY] Amiga joystick support
                         Map of devices attached to JOY0DAT and JOY1DAT
                         Format: <a>,<b>
@@ -646,6 +640,13 @@ and is between 256 and 4096 characters. It is defined in the file
                         DMA-API debugging code disables itself because the
                         architectural default is too low.
  
+       dma_debug_driver=<driver_name>
+                       With this option the DMA-API debugging driver
+                       filter feature can be enabled at boot time. Just
+                       pass the driver to filter for as the parameter.
+                       The filter can be disabled or changed to another
+                       driver later using sysfs.
+
         dscc4.setup=    [NET]
  
         dtc3181e=       [HW,SCSI]
@@ -752,12 +753,25 @@ and is between 256 and 4096 characters. It is defined in the file
                         ia64_pal_cache_flush instead of SAL_CACHE_FLUSH.
  
         ftrace=[tracer]
-                       [ftrace] will set and start the specified tracer
+                       [FTRACE] will set and start the specified tracer
                         as early as possible in order to facilitate early
                         boot debugging.
  
         ftrace_dump_on_oops
-                       [ftrace] will dump the trace buffers on oops.
+                       [FTRACE] will dump the trace buffers on oops.
+
+       ftrace_filter=[function-list]
+                       [FTRACE] Limit the functions traced by the function
+                       tracer at boot up. function-list is a comma separated
+                       list of functions. This list can be changed at run
+                       time by the set_ftrace_filter file in the debugfs
+                       tracing directory. 
+
+       ftrace_notrace=[function-list]
+                       [FTRACE] Do not trace the functions specified in
+                       function-list. This list can be changed at run time
+                       by the set_ftrace_notrace file in the debugfs
+                       tracing directory.
  
         gamecon.map[2|3]=
                         [HW,JOY] Multisystem joystick and NES/SNES/PSX pad
@@ -1054,15 +1068,6 @@ and is between 256 and 4096 characters. It is defined in the file
                         use the HighMem zone if it exists, and the Normal
                         zone if it does not.
  
-       kmemtrace.enable=       [KNL,KMEMTRACE] Format: { yes | no }
-                               Controls whether kmemtrace is enabled
-                               at boot-time.
-
-       kmemtrace.subbufs=n     [KNL,KMEMTRACE] Overrides the number of
-                       subbufs kmemtrace's relay channel has. Set this
-                       higher than default (KMEMTRACE_N_SUBBUFS in code) if
-                       you experience buffer overruns.
-
         kgdboc=         [HW] kgdb over consoles.
                         Requires a tty driver that supports console polling.
                         (only serial suported for now)
@@ -1575,6 +1580,9 @@ and is between 256 and 4096 characters. It is defined in the file
         noinitrd        [RAM] Tells the kernel not to load any configured
                         initial RAM disk.
  
+       nointremap      [X86-64, Intel-IOMMU] Do not enable interrupt
+                       remapping.
+
         nointroute      [IA-64]
  
         nojitter        [IA64] Disables jitter checking for ITC timers.
diff --git a/Documentation/memory-barriers.txt b/Documentation/memory-barriers.txt

index f5b7127f54acb6af1d9f997a40e099b60c0b7571..7f5809eddee62eaf524103eeda485f5e553d5a01 100644 (file)
--- a/Documentation/memory-barriers.txt
+++ b/Documentation/memory-barriers.txt
@@ -31,6 +31,7 @@ Contents:
  
       - Locking functions.
       - Interrupt disabling functions.
+     - Sleep and wake-up functions.
       - Miscellaneous functions.
  
   (*) Inter-CPU locking barrier effects.
@@ -1217,6 +1218,132 @@ barriers are required in such a situation, they must be provided from some
  other means.
  
  
+SLEEP AND WAKE-UP FUNCTIONS
+---------------------------
+
+Sleeping and waking on an event flagged in global data can be viewed as an
+interaction between two pieces of data: the task state of the task waiting for
+the event and the global data used to indicate the event.  To make sure that
+these appear to happen in the right order, the primitives to begin the process
+of going to sleep, and the primitives to initiate a wake up imply certain
+barriers.
+
+Firstly, the sleeper normally follows something like this sequence of events:
+
+       for (;;) {
+               set_current_state(TASK_UNINTERRUPTIBLE);
+               if (event_indicated)
+                       break;
+               schedule();
+       }
+
+A general memory barrier is interpolated automatically by set_current_state()
+after it has altered the task state:
+
+       CPU 1
+       ===============================
+       set_current_state();
+         set_mb();
+           STORE current->state
+           <general barrier>
+       LOAD event_indicated
+
+set_current_state() may be wrapped by:
+
+       prepare_to_wait();
+       prepare_to_wait_exclusive();
+
+which therefore also imply a general memory barrier after setting the state.
+The whole sequence above is available in various canned forms, all of which
+interpolate the memory barrier in the right place:
+
+       wait_event();
+       wait_event_interruptible();
+       wait_event_interruptible_exclusive();
+       wait_event_interruptible_timeout();
+       wait_event_killable();
+       wait_event_timeout();
+       wait_on_bit();
+       wait_on_bit_lock();
+
+
+Secondly, code that performs a wake up normally follows something like this:
+
+       event_indicated = 1;
+       wake_up(&event_wait_queue);
+
+or:
+
+       event_indicated = 1;
+       wake_up_process(event_daemon);
+
+A write memory barrier is implied by wake_up() and co. if and only if they wake
+something up.  The barrier occurs before the task state is cleared, and so sits
+between the STORE to indicate the event and the STORE to set TASK_RUNNING:
+
+       CPU 1                           CPU 2
+       =============================== ===============================
+       set_current_state();            STORE event_indicated
+         set_mb();                     wake_up();
+           STORE current->state          <write barrier>
+           <general barrier>             STORE current->state
+       LOAD event_indicated
+
+The available waker functions include:
+
+       complete();
+       wake_up();
+       wake_up_all();
+       wake_up_bit();
+       wake_up_interruptible();
+       wake_up_interruptible_all();
+       wake_up_interruptible_nr();
+       wake_up_interruptible_poll();
+       wake_up_interruptible_sync();
+       wake_up_interruptible_sync_poll();
+       wake_up_locked();
+       wake_up_locked_poll();
+       wake_up_nr();
+       wake_up_poll();
+       wake_up_process();
+
+
+[!] Note that the memory barriers implied by the sleeper and the waker do _not_
+order multiple stores before the wake-up with respect to loads of those stored
+values after the sleeper has called set_current_state().  For instance, if the
+sleeper does:
+
+       set_current_state(TASK_INTERRUPTIBLE);
+       if (event_indicated)
+               break;
+       __set_current_state(TASK_RUNNING);
+       do_something(my_data);
+
+and the waker does:
+
+       my_data = value;
+       event_indicated = 1;
+       wake_up(&event_wait_queue);
+
+there's no guarantee that the change to event_indicated will be perceived by
+the sleeper as coming after the change to my_data.  In such a circumstance, the
+code on both sides must interpolate its own memory barriers between the
+separate data accesses.  Thus the above sleeper ought to do:
+
+       set_current_state(TASK_INTERRUPTIBLE);
+       if (event_indicated) {
+               smp_rmb();
+               do_something(my_data);
+       }
+
+and the waker should do:
+
+       my_data = value;
+       smp_wmb();
+       event_indicated = 1;
+       wake_up(&event_wait_queue);
+
+
  MISCELLANEOUS FUNCTIONS
  -----------------------
  
@@ -1366,7 +1493,7 @@ WHERE ARE MEMORY BARRIERS NEEDED?
  
  Under normal operation, memory operation reordering is generally not going to
  be a problem as a single-threaded linear piece of code will still appear to
-work correctly, even if it's in an SMP kernel.  There are, however, three
+work correctly, even if it's in an SMP kernel.  There are, however, four
  circumstances in which reordering definitely _could_ be a problem:
  
   (*) Interprocessor interaction.
diff --git a/Documentation/scheduler/sched-rt-group.txt b/Documentation/scheduler/sched-rt-group.txt

index 5ba4d3fc625a424b341bfa5b5853473eaf80fe6a..1df7f9cdab0576227671703c6fec1fc780ec596b 100644 (file)
--- a/Documentation/scheduler/sched-rt-group.txt
+++ b/Documentation/scheduler/sched-rt-group.txt
@@ -4,6 +4,7 @@
  CONTENTS
  ========
  
+0. WARNING
  1. Overview
    1.1 The problem
    1.2 The solution
@@ -14,6 +15,23 @@ CONTENTS
  3. Future plans
  
  
+0. WARNING
+==========
+
+ Fiddling with these settings can result in an unstable system, the knobs are
+ root only and assumes root knows what he is doing.
+
+Most notable:
+
+ * very small values in sched_rt_period_us can result in an unstable
+   system when the period is smaller than either the available hrtimer
+   resolution, or the time it takes to handle the budget refresh itself.
+
+ * very small values in sched_rt_runtime_us can result in an unstable
+   system when the runtime is so small the system has difficulty making
+   forward progress (NOTE: the migration thread and kstopmachine both
+   are real-time processes).
+
  1. Overview
  ===========
  
@@ -169,7 +187,7 @@ get their allocated time.
  
  Implementing SCHED_EDF might take a while to complete. Priority Inheritance is
  the biggest challenge as the current linux PI infrastructure is geared towards
-the limited static priority levels 0-139. With deadline scheduling you need to
+the limited static priority levels 0-99. With deadline scheduling you need to
  do deadline inheritance (since priority is inversely proportional to the
  deadline delta (deadline - now).
  
diff --git a/Documentation/trace/events.txt b/Documentation/trace/events.txt

new file mode 100644 (file)

index 0000000..f157d75
--- /dev/null
+++ b/Documentation/trace/events.txt
@@ -0,0 +1,90 @@
+                            Event Tracing
+
+               Documentation written by Theodore Ts'o
+                       Updated by Li Zefan
+
+1. Introduction
+===============
+
+Tracepoints (see Documentation/trace/tracepoints.txt) can be used
+without creating custom kernel modules to register probe functions
+using the event tracing infrastructure.
+
+Not all tracepoints can be traced using the event tracing system;
+the kernel developer must provide code snippets which define how the
+tracing information is saved into the tracing buffer, and how the
+tracing information should be printed.
+
+2. Using Event Tracing
+======================
+
+2.1 Via the 'set_event' interface
+---------------------------------
+
+The events which are available for tracing can be found in the file
+/debug/tracing/available_events.
+
+To enable a particular event, such as 'sched_wakeup', simply echo it
+to /debug/tracing/set_event. For example:
+
+       # echo sched_wakeup >> /debug/tracing/set_event
+
+[ Note: '>>' is necessary, otherwise it will firstly disable
+  all the events. ]
+
+To disable an event, echo the event name to the set_event file prefixed
+with an exclamation point:
+
+       # echo '!sched_wakeup' >> /debug/tracing/set_event
+
+To disable all events, echo an empty line to the set_event file:
+
+       # echo > /debug/tracing/set_event
+
+To enable all events, echo '*:*' or '*:' to the set_event file:
+
+       # echo *:* > /debug/tracing/set_event
+
+The events are organized into subsystems, such as ext4, irq, sched,
+etc., and a full event name looks like this: <subsystem>:<event>.  The
+subsystem name is optional, but it is displayed in the available_events
+file.  All of the events in a subsystem can be specified via the syntax
+"<subsystem>:*"; for example, to enable all irq events, you can use the
+command:
+
+       # echo 'irq:*' > /debug/tracing/set_event
+
+2.2 Via the 'enable' toggle
+---------------------------
+
+The events available are also listed in /debug/tracing/events/ hierarchy
+of directories.
+
+To enable event 'sched_wakeup':
+
+       # echo 1 > /debug/tracing/events/sched/sched_wakeup/enable
+
+To disable it:
+
+       # echo 0 > /debug/tracing/events/sched/sched_wakeup/enable
+
+To enable all events in sched subsystem:
+
+       # echo 1 > /debug/tracing/events/sched/enable
+
+To eanble all events:
+
+       # echo 1 > /debug/tracing/events/enable
+
+When reading one of these enable files, there are four results:
+
+ 0 - all events this file affects are disabled
+ 1 - all events this file affects are enabled
+ X - there is a mixture of events enabled and disabled
+ ? - this file does not affect any event
+
+3. Defining an event-enabled tracepoint
+=======================================
+
+See The example provided in samples/trace_events
+
diff --git a/Documentation/trace/ftrace.txt b/Documentation/trace/ftrace.txt

index fd9a3e69381351aeeaa6712d07c5170aa7dca1a4..2a82d8602944abca930635cff44f37c52238529d 100644 (file)
--- a/Documentation/trace/ftrace.txt
+++ b/Documentation/trace/ftrace.txt
@@ -179,7 +179,7 @@ Here is the list of current tracers that may be configured.
  
         Function call tracer to trace all kernel functions.
  
-  "function_graph_tracer"
+  "function_graph"
  
         Similar to the function tracer except that the
         function tracer probes the functions on their entry
@@ -518,9 +518,18 @@ priority with zero (0) being the highest priority and the nice
  values starting at 100 (nice -20). Below is a quick chart to map
  the kernel priority to user land priorities.
  
-  Kernel priority: 0 to 99    ==> user RT priority 99 to 0
-  Kernel priority: 100 to 139 ==> user nice -20 to 19
-  Kernel priority: 140        ==> idle task priority
+   Kernel Space                     User Space
+ ===============================================================
+   0(high) to  98(low)     user RT priority 99(high) to 1(low)
+                           with SCHED_RR or SCHED_FIFO
+ ---------------------------------------------------------------
+  99                       sched_priority is not used in scheduling
+                           decisions(it must be specified as 0)
+ ---------------------------------------------------------------
+ 100(high) to 139(low)     user nice -20(high) to 19(low)
+ ---------------------------------------------------------------
+ 140                       idle task priority
+ ---------------------------------------------------------------
  
  The task states are:
  
diff --git a/Documentation/trace/power.txt b/Documentation/trace/power.txt

new file mode 100644 (file)

index 0000000..cd805e1
--- /dev/null
+++ b/Documentation/trace/power.txt
@@ -0,0 +1,17 @@
+The power tracer collects detailed information about C-state and P-state
+transitions, instead of just looking at the high-level "average"
+information.
+
+There is a helper script found in scrips/tracing/power.pl in the kernel
+sources which can be used to parse this information and create a
+Scalable Vector Graphics (SVG) picture from the trace data.
+
+To use this tracer:
+
+       echo 0 > /sys/kernel/debug/tracing/tracing_enabled
+       echo power > /sys/kernel/debug/tracing/current_tracer
+       echo 1 > /sys/kernel/debug/tracing/tracing_enabled
+       sleep 1
+       echo 0 > /sys/kernel/debug/tracing/tracing_enabled
+       cat /sys/kernel/debug/tracing/trace | \
+               perl scripts/tracing/power.pl > out.sv
diff --git a/Documentation/x86/boot.txt b/Documentation/x86/boot.txt

index e0203662f9e9f8a1a907060437f3fcdf907e2667..8da3a795083fec448cb7e13831b0c10180ff0b6e 100644 (file)
--- a/Documentation/x86/boot.txt
+++ b/Documentation/x86/boot.txt
@@ -50,6 +50,10 @@ Protocol 2.08:       (Kernel 2.6.26) Added crc32 checksum and ELF format
  Protocol 2.09: (Kernel 2.6.26) Added a field of 64-bit physical
                 pointer to single linked list of struct setup_data.
  
+Protocol 2.10: (Kernel 2.6.31) Added a protocol for relaxed alignment
+               beyond the kernel_alignment added, new init_size and
+               pref_address fields.  Added extended boot loader IDs.
+
  **** MEMORY LAYOUT
  
  The traditional memory map for the kernel loader, used for Image or
@@ -168,12 +172,13 @@ Offset    Proto   Name            Meaning
  021C/4 2.00+   ramdisk_size    initrd size (set by boot loader)
  0220/4 2.00+   bootsect_kludge DO NOT USE - for bootsect.S use only
  0224/2 2.01+   heap_end_ptr    Free memory after setup end
-0226/2 N/A     pad1            Unused
+0226/1 2.02+(3 ext_loader_ver  Extended boot loader version
+0227/1 2.02+(3 ext_loader_type Extended boot loader ID
  0228/4 2.02+   cmd_line_ptr    32-bit pointer to the kernel command line
  022C/4 2.03+   ramdisk_max     Highest legal initrd address
  0230/4 2.05+   kernel_alignment Physical addr alignment required for kernel
  0234/1 2.05+   relocatable_kernel Whether kernel is relocatable or not
-0235/1 N/A     pad2            Unused
+0235/1 2.10+   min_alignment   Minimum alignment, as a power of two
  0236/2 N/A     pad3            Unused
  0238/4 2.06+   cmdline_size    Maximum size of the kernel command line
  023C/4 2.07+   hardware_subarch Hardware subarchitecture
@@ -182,6 +187,8 @@ Offset      Proto   Name            Meaning
  024C/4 2.08+   payload_length  Length of kernel payload
  0250/8 2.09+   setup_data      64-bit physical pointer to linked list
                                 of struct setup_data
+0258/8 2.10+   pref_address    Preferred loading address
+0260/4 2.10+   init_size       Linear memory required during initialization
  
  (1) For backwards compatibility, if the setup_sects field contains 0, the
      real value is 4.
@@ -190,6 +197,8 @@ Offset      Proto   Name            Meaning
      field are unusable, which means the size of a bzImage kernel
      cannot be determined.
  
+(3) Ignored, but safe to set, for boot protocols 2.02-2.09.
+
  If the "HdrS" (0x53726448) magic number is not found at offset 0x202,
  the boot protocol version is "old".  Loading an old kernel, the
  following parameters should be assumed:
@@ -343,18 +352,32 @@ Protocol: 2.00+
    0xTV here, where T is an identifier for the boot loader and V is
    a version number.  Otherwise, enter 0xFF here.
  
+  For boot loader IDs above T = 0xD, write T = 0xE to this field and
+  write the extended ID minus 0x10 to the ext_loader_type field.
+  Similarly, the ext_loader_ver field can be used to provide more than
+  four bits for the bootloader version.
+
+  For example, for T = 0x15, V = 0x234, write:
+
+  type_of_loader  <- 0xE4
+  ext_loader_type <- 0x05
+  ext_loader_ver  <- 0x23
+
    Assigned boot loader ids:
         0  LILO                 (0x00 reserved for pre-2.00 bootloader)
         1  Loadlin
         2  bootsect-loader      (0x20, all other values reserved)
-       3  SYSLINUX
-       4  EtherBoot
+       3  Syslinux
+       4  Etherboot/gPXE
         5  ELILO
         7  GRUB
-       8  U-BOOT
+       8  U-Boot
         9  Xen
         A  Gujin
         B  Qemu
+       C  Arcturus Networks uCbootloader
+       E  Extended             (see ext_loader_type)
+       F  Special              (0xFF = undefined)
  
    Please contact <hpa@zytor.com> if you need a bootloader ID
    value assigned.
@@ -453,6 +476,35 @@ Protocol:  2.01+
    Set this field to the offset (from the beginning of the real-mode
    code) of the end of the setup stack/heap, minus 0x0200.
  
+Field name:    ext_loader_ver
+Type:          write (optional)
+Offset/size:   0x226/1
+Protocol:      2.02+
+
+  This field is used as an extension of the version number in the
+  type_of_loader field.  The total version number is considered to be
+  (type_of_loader & 0x0f) + (ext_loader_ver << 4).
+
+  The use of this field is boot loader specific.  If not written, it
+  is zero.
+
+  Kernels prior to 2.6.31 did not recognize this field, but it is safe
+  to write for protocol version 2.02 or higher.
+
+Field name:    ext_loader_type
+Type:          write (obligatory if (type_of_loader & 0xf0) == 0xe0)
+Offset/size:   0x227/1
+Protocol:      2.02+
+
+  This field is used as an extension of the type number in
+  type_of_loader field.  If the type in type_of_loader is 0xE, then
+  the actual type is (ext_loader_type + 0x10).
+
+  This field is ignored if the type in type_of_loader is not 0xE.
+
+  Kernels prior to 2.6.31 did not recognize this field, but it is safe
+  to write for protocol version 2.02 or higher.
+
  Field name:    cmd_line_ptr
  Type:          write (obligatory)
  Offset/size:   0x228/4
@@ -482,11 +534,19 @@ Protocol: 2.03+
    0x37FFFFFF, you can start your ramdisk at 0x37FE0000.)
  
  Field name:    kernel_alignment
-Type:          read (reloc)
+Type:          read/modify (reloc)
  Offset/size:   0x230/4
-Protocol:      2.05+
+Protocol:      2.05+ (read), 2.10+ (modify)
+
+  Alignment unit required by the kernel (if relocatable_kernel is
+  true.)  A relocatable kernel that is loaded at an alignment
+  incompatible with the value in this field will be realigned during
+  kernel initialization.
  
-  Alignment unit required by the kernel (if relocatable_kernel is true.)
+  Starting with protocol version 2.10, this reflects the kernel
+  alignment preferred for optimal performance; it is possible for the
+  loader to modify this field to permit a lesser alignment.  See the
+  min_alignment and pref_address field below.
  
  Field name:    relocatable_kernel
  Type:          read (reloc)
@@ -498,6 +558,22 @@ Protocol:  2.05+
    After loading, the boot loader must set the code32_start field to
    point to the loaded code, or to a boot loader hook.
  
+Field name:    min_alignment
+Type:          read (reloc)
+Offset/size:   0x235/1
+Protocol:      2.10+
+
+  This field, if nonzero, indicates as a power of two the minimum
+  alignment required, as opposed to preferred, by the kernel to boot.
+  If a boot loader makes use of this field, it should update the
+  kernel_alignment field with the alignment unit desired; typically:
+
+       kernel_alignment = 1 << min_alignment
+
+  There may be a considerable performance cost with an excessively
+  misaligned kernel.  Therefore, a loader should typically try each
+  power-of-two alignment from kernel_alignment down to this alignment.
+
  Field name:    cmdline_size
  Type:          read
  Offset/size:   0x238/4
@@ -582,6 +658,36 @@ Protocol:  2.09+
    sure to consider the case where the linked list already contains
    entries.
  
+Field name:    pref_address
+Type:          read (reloc)
+Offset/size:   0x258/8
+Protocol:      2.10+
+
+  This field, if nonzero, represents a preferred load address for the
+  kernel.  A relocating bootloader should attempt to load at this
+  address if possible.
+
+  A non-relocatable kernel will unconditionally move itself and to run
+  at this address.
+
+Field name:    init_size
+Type:          read
+Offset/size:   0x25c/4
+
+  This field indicates the amount of linear contiguous memory starting
+  at the kernel runtime start address that the kernel needs before it
+  is capable of examining its memory map.  This is not the same thing
+  as the total amount of memory the kernel needs to boot, but it can
+  be used by a relocating boot loader to help select a safe load
+  address for the kernel.
+
+  The kernel runtime start address is determined by the following algorithm:
+
+  if (relocatable_kernel)
+       runtime_start = align_up(load_address, kernel_alignment)
+  else
+       runtime_start = pref_address
+
  
  **** THE IMAGE CHECKSUM
  
diff --git a/Documentation/x86/x86_64/boot-options.txt b/Documentation/x86/x86_64/boot-options.txt

index 34c13040a718f6febbccadc1f32bc24f4a060664..2db5893d6c97234ded88dbc8b52daaa8defc3af4 100644 (file)
--- a/Documentation/x86/x86_64/boot-options.txt
+++ b/Documentation/x86/x86_64/boot-options.txt
@@ -150,11 +150,6 @@ NUMA
                 Otherwise, the remaining system RAM is allocated to an
                 additional node.
  
-  numa=hotadd=percent
-               Only allow hotadd memory to preallocate page structures upto
-               percent of already available memory.
-               numa=hotadd=0 will disable hotadd memory.
-
  ACPI
  
    acpi=off     Don't enable ACPI
diff --git a/Documentation/x86/x86_64/mm.txt b/Documentation/x86/x86_64/mm.txt

index 29b52b14d0b43950fa79f3948b1fbe4789396fbf..d6498e3cd7133c7d88279c26d8618aa5fc3d10ad 100644 (file)
--- a/Documentation/x86/x86_64/mm.txt
+++ b/Documentation/x86/x86_64/mm.txt
@@ -6,10 +6,11 @@ Virtual memory map with 4 level page tables:
  0000000000000000 - 00007fffffffffff (=47 bits) user space, different per mm
  hole caused by [48:63] sign extension
  ffff800000000000 - ffff80ffffffffff (=40 bits) guard hole
-ffff880000000000 - ffffc0ffffffffff (=57 TB) direct mapping of all phys. memory
-ffffc10000000000 - ffffc1ffffffffff (=40 bits) hole
-ffffc20000000000 - ffffe1ffffffffff (=45 bits) vmalloc/ioremap space
-ffffe20000000000 - ffffe2ffffffffff (=40 bits) virtual memory map (1TB)
+ffff880000000000 - ffffc7ffffffffff (=64 TB) direct mapping of all phys. memory
+ffffc80000000000 - ffffc8ffffffffff (=40 bits) hole
+ffffc90000000000 - ffffe8ffffffffff (=45 bits) vmalloc/ioremap space
+ffffe90000000000 - ffffe9ffffffffff (=40 bits) hole
+ffffea0000000000 - ffffeaffffffffff (=40 bits) virtual memory map (1TB)
  ... unused hole ...
  ffffffff80000000 - ffffffffa0000000 (=512 MB)  kernel text mapping, from phys 0
  ffffffffa0000000 - fffffffffff00000 (=1536 MB) module mapping space
diff --git a/MAINTAINERS b/MAINTAINERS

index cf4abddfc8a40dcc161e851c26294480fff3076f..fd24af2ff32613ab87ffed9bd74e379c602b6191 100644 (file)
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -4392,6 +4392,16 @@ S:       Maintained
  F:     include/linux/delayacct.h
  F:     kernel/delayacct.c
  
+PERFORMANCE COUNTER SUBSYSTEM
+P:     Peter Zijlstra
+M:     a.p.zijlstra@chello.nl
+P:     Paul Mackerras
+M:     paulus@samba.org
+P:     Ingo Molnar
+M:     mingo@elte.hu
+L:     linux-kernel@vger.kernel.org
+S:     Supported
+
  PERSONALITY HANDLING
  P:     Christoph Hellwig
  M:     hch@infradead.org
diff --git a/arch/alpha/kernel/sys_dp264.c b/arch/alpha/kernel/sys_dp264.c

index 9c9d1fd4155fc5e736c98d89e68d034cd8f2b9ef..5bd5259324b7c8827adb237facd7bf120edd4ca2 100644 (file)
--- a/arch/alpha/kernel/sys_dp264.c
+++ b/arch/alpha/kernel/sys_dp264.c
@@ -176,22 +176,26 @@ cpu_set_irq_affinity(unsigned int irq, cpumask_t affinity)
         }
  }
  
-static void
+static int
  dp264_set_affinity(unsigned int irq, const struct cpumask *affinity)
  { 
         spin_lock(&dp264_irq_lock);
         cpu_set_irq_affinity(irq, *affinity);
         tsunami_update_irq_hw(cached_irq_mask);
         spin_unlock(&dp264_irq_lock);
+
+       return 0;
  }
  
-static void
+static int
  clipper_set_affinity(unsigned int irq, const struct cpumask *affinity)
  { 
         spin_lock(&dp264_irq_lock);
         cpu_set_irq_affinity(irq - 16, *affinity);
         tsunami_update_irq_hw(cached_irq_mask);
         spin_unlock(&dp264_irq_lock);
+
+       return 0;
  }
  
  static struct hw_interrupt_type dp264_irq_type = {
diff --git a/arch/alpha/kernel/sys_titan.c b/arch/alpha/kernel/sys_titan.c

index 27f840a4ad3d7ec0949ac92a55ef5c7c1a3b9280..8dd239ebdb9e2cc3489c3d128402a9da37033a29 100644 (file)
--- a/arch/alpha/kernel/sys_titan.c
+++ b/arch/alpha/kernel/sys_titan.c
@@ -157,13 +157,15 @@ titan_cpu_set_irq_affinity(unsigned int irq, cpumask_t affinity)
  
  }
  
-static void
+static int
  titan_set_irq_affinity(unsigned int irq, const struct cpumask *affinity)
  { 
         spin_lock(&titan_irq_lock);
         titan_cpu_set_irq_affinity(irq - 16, *affinity);
         titan_update_irq_hw(titan_cached_irq_mask);
         spin_unlock(&titan_irq_lock);
+
+       return 0;
  }
  
  static void
diff --git a/arch/arm/common/gic.c b/arch/arm/common/gic.c

index 3e1714c6523f06f4ab0572886edba151f21ffb8c..664c7b8b1ba87d7d5268c661808636a27b4f610c 100644 (file)
--- a/arch/arm/common/gic.c
+++ b/arch/arm/common/gic.c
@@ -109,7 +109,7 @@ static void gic_unmask_irq(unsigned int irq)
  }
  
  #ifdef CONFIG_SMP
-static void gic_set_cpu(unsigned int irq, const struct cpumask *mask_val)
+static int gic_set_cpu(unsigned int irq, const struct cpumask *mask_val)
  {
         void __iomem *reg = gic_dist_base(irq) + GIC_DIST_TARGET + (gic_irq(irq) & ~3);
         unsigned int shift = (irq % 4) * 8;
@@ -122,6 +122,8 @@ static void gic_set_cpu(unsigned int irq, const struct cpumask *mask_val)
         val |= 1 << (cpu + shift);
         writel(val, reg);
         spin_unlock(&irq_controller_lock);
+
+       return 0;
  }
  #endif
  
diff --git a/arch/cris/arch-v32/kernel/irq.c b/arch/cris/arch-v32/kernel/irq.c

index df3925cb1c7fbbcab86f8445491d0e77ebf684a9..d70b445f4a8f0fc82922e848ae4f0bbbae6a1fb5 100644 (file)
--- a/arch/cris/arch-v32/kernel/irq.c
+++ b/arch/cris/arch-v32/kernel/irq.c
@@ -325,12 +325,14 @@ static void end_crisv32_irq(unsigned int irq)
  {
  }
  
-void set_affinity_crisv32_irq(unsigned int irq, const struct cpumask *dest)
+int set_affinity_crisv32_irq(unsigned int irq, const struct cpumask *dest)
  {
         unsigned long flags;
         spin_lock_irqsave(&irq_lock, flags);
         irq_allocations[irq - FIRST_IRQ].mask = *dest;
         spin_unlock_irqrestore(&irq_lock, flags);
+
+       return 0;
  }
  
  static struct irq_chip crisv32_irq_type = {
diff --git a/arch/ia64/hp/sim/hpsim_irq.c b/arch/ia64/hp/sim/hpsim_irq.c

index cc0a3182db3c0158d246f658065a0f0f5d426a5f..acb5047ab57341d9818160f361db47c7e6e59313 100644 (file)
--- a/arch/ia64/hp/sim/hpsim_irq.c
+++ b/arch/ia64/hp/sim/hpsim_irq.c
@@ -21,9 +21,10 @@ hpsim_irq_noop (unsigned int irq)
  {
  }
  
-static void
+static int
  hpsim_set_affinity_noop(unsigned int a, const struct cpumask *b)
  {
+       return 0;
  }
  
  static struct hw_interrupt_type irq_type_hp_sim = {
diff --git a/arch/ia64/kernel/acpi.c b/arch/ia64/kernel/acpi.c

index 5510317db37b2a439711bb3fa97d6af0c7cd00bf..baec6f00f7f3feee71c5110b01f12b5c8b3f4fda 100644 (file)
--- a/arch/ia64/kernel/acpi.c
+++ b/arch/ia64/kernel/acpi.c
@@ -636,7 +636,7 @@ void __init acpi_numa_arch_fixup(void)
   * success: return IRQ number (>=0)
   * failure: return < 0
   */
-int acpi_register_gsi(u32 gsi, int triggering, int polarity)
+int acpi_register_gsi(struct device *dev, u32 gsi, int triggering, int polarity)
  {
         if (acpi_irq_model == ACPI_IRQ_MODEL_PLATFORM)
                 return gsi;
@@ -678,7 +678,8 @@ static int __init acpi_parse_fadt(struct acpi_table_header *table)
  
         fadt = (struct acpi_table_fadt *)fadt_header;
  
-       acpi_register_gsi(fadt->sci_interrupt, ACPI_LEVEL_SENSITIVE, ACPI_ACTIVE_LOW);
+       acpi_register_gsi(NULL, fadt->sci_interrupt, ACPI_LEVEL_SENSITIVE,
+                                ACPI_ACTIVE_LOW);
         return 0;
  }
  
diff --git a/arch/ia64/kernel/iosapic.c b/arch/ia64/kernel/iosapic.c

index 166e0d839fa04d753fa3709bfe62544eee32e9ec..f92cef47bf862ed8cd3943875585535e0c412763 100644 (file)
--- a/arch/ia64/kernel/iosapic.c
+++ b/arch/ia64/kernel/iosapic.c
@@ -329,7 +329,7 @@ unmask_irq (unsigned int irq)
  }
  
  
-static void
+static int
  iosapic_set_affinity(unsigned int irq, const struct cpumask *mask)
  {
  #ifdef CONFIG_SMP
@@ -343,15 +343,15 @@ iosapic_set_affinity(unsigned int irq, const struct cpumask *mask)
  
         cpu = cpumask_first_and(cpu_online_mask, mask);
         if (cpu >= nr_cpu_ids)
-               return;
+               return -1;
  
         if (irq_prepare_move(irq, cpu))
-               return;
+               return -1;
  
         dest = cpu_physical_id(cpu);
  
         if (!iosapic_intr_info[irq].count)
-               return;                 /* not an IOSAPIC interrupt */
+               return -1;                      /* not an IOSAPIC interrupt */
  
         set_irq_affinity_info(irq, dest, redir);
  
@@ -376,7 +376,9 @@ iosapic_set_affinity(unsigned int irq, const struct cpumask *mask)
                 iosapic_write(iosapic, IOSAPIC_RTE_HIGH(rte_index), high32);
                 iosapic_write(iosapic, IOSAPIC_RTE_LOW(rte_index), low32);
         }
+
  #endif
+       return 0;
  }
  
  /*
diff --git a/arch/ia64/kernel/msi_ia64.c b/arch/ia64/kernel/msi_ia64.c

index 2b15e233f7fef6b016f50367c90f9c91ba096b63..0f8ade9331badf4acaf29818dcad0dc67b31fd5e 100644 (file)
--- a/arch/ia64/kernel/msi_ia64.c
+++ b/arch/ia64/kernel/msi_ia64.c
@@ -12,7 +12,7 @@
  static struct irq_chip ia64_msi_chip;
  
  #ifdef CONFIG_SMP
-static void ia64_set_msi_irq_affinity(unsigned int irq,
+static int ia64_set_msi_irq_affinity(unsigned int irq,
                                       const cpumask_t *cpu_mask)
  {
         struct msi_msg msg;
@@ -20,10 +20,10 @@ static void ia64_set_msi_irq_affinity(unsigned int irq,
         int cpu = first_cpu(*cpu_mask);
  
         if (!cpu_online(cpu))
-               return;
+               return -1;
  
         if (irq_prepare_move(irq, cpu))
-               return;
+               return -1;
  
         read_msi_msg(irq, &msg);
  
@@ -39,6 +39,8 @@ static void ia64_set_msi_irq_affinity(unsigned int irq,
  
         write_msi_msg(irq, &msg);
         cpumask_copy(irq_desc[irq].affinity, cpumask_of(cpu));
+
+       return 0;
  }
  #endif /* CONFIG_SMP */
  
@@ -130,17 +132,17 @@ void arch_teardown_msi_irq(unsigned int irq)
  
  #ifdef CONFIG_DMAR
  #ifdef CONFIG_SMP
-static void dmar_msi_set_affinity(unsigned int irq, const struct cpumask *mask)
+static int dmar_msi_set_affinity(unsigned int irq, const struct cpumask *mask)
  {
         struct irq_cfg *cfg = irq_cfg + irq;
         struct msi_msg msg;
         int cpu = cpumask_first(mask);
  
         if (!cpu_online(cpu))
-               return;
+               return -1;
  
         if (irq_prepare_move(irq, cpu))
-               return;
+               return -1;
  
         dmar_msi_read(irq, &msg);
  
@@ -151,6 +153,8 @@ static void dmar_msi_set_affinity(unsigned int irq, const struct cpumask *mask)
  
         dmar_msi_write(irq, &msg);
         cpumask_copy(irq_desc[irq].affinity, mask);
+
+       return 0;
  }
  #endif /* CONFIG_SMP */
  
diff --git a/arch/ia64/sn/kernel/irq.c b/arch/ia64/sn/kernel/irq.c

index 66fd705e82c09ee1e1905356cf447526461c086a..764f26abac05df5e9d7659ff7254da6fde398301 100644 (file)
--- a/arch/ia64/sn/kernel/irq.c
+++ b/arch/ia64/sn/kernel/irq.c
@@ -227,7 +227,7 @@ finish_up:
         return new_irq_info;
  }
  
-static void sn_set_affinity_irq(unsigned int irq, const struct cpumask *mask)
+static int sn_set_affinity_irq(unsigned int irq, const struct cpumask *mask)
  {
         struct sn_irq_info *sn_irq_info, *sn_irq_info_safe;
         nasid_t nasid;
@@ -239,6 +239,8 @@ static void sn_set_affinity_irq(unsigned int irq, const struct cpumask *mask)
         list_for_each_entry_safe(sn_irq_info, sn_irq_info_safe,
                                  sn_irq_lh[irq], list)
                 (void)sn_retarget_vector(sn_irq_info, nasid, slice);
+
+       return 0;
  }
  
  #ifdef CONFIG_SMP
diff --git a/arch/ia64/sn/kernel/msi_sn.c b/arch/ia64/sn/kernel/msi_sn.c

index 81e428943d7374d01e0aef8530e39c7ecd892b30..fbbfb970120128a29df68c549895187ea3e3013e 100644 (file)
--- a/arch/ia64/sn/kernel/msi_sn.c
+++ b/arch/ia64/sn/kernel/msi_sn.c
@@ -151,7 +151,7 @@ int sn_setup_msi_irq(struct pci_dev *pdev, struct msi_desc *entry)
  }
  
  #ifdef CONFIG_SMP
-static void sn_set_msi_irq_affinity(unsigned int irq,
+static int sn_set_msi_irq_affinity(unsigned int irq,
                                     const struct cpumask *cpu_mask)
  {
         struct msi_msg msg;
@@ -168,7 +168,7 @@ static void sn_set_msi_irq_affinity(unsigned int irq,
         cpu = cpumask_first(cpu_mask);
         sn_irq_info = sn_msi_info[irq].sn_irq_info;
         if (sn_irq_info == NULL || sn_irq_info->irq_int_bit >= 0)
-               return;
+               return -1;
  
         /*
          * Release XIO resources for the old MSI PCI address
@@ -189,7 +189,7 @@ static void sn_set_msi_irq_affinity(unsigned int irq,
         new_irq_info = sn_retarget_vector(sn_irq_info, nasid, slice);
         sn_msi_info[irq].sn_irq_info = new_irq_info;
         if (new_irq_info == NULL)
-               return;
+               return -1;
  
         /*
          * Map the xio address into bus space
@@ -206,6 +206,8 @@ static void sn_set_msi_irq_affinity(unsigned int irq,
  
         write_msi_msg(irq, &msg);
         cpumask_copy(irq_desc[irq].affinity, cpu_mask);
+
+       return 0;
  }
  #endif /* CONFIG_SMP */
  
diff --git a/arch/mips/cavium-octeon/octeon-irq.c b/arch/mips/cavium-octeon/octeon-irq.c

index 1c19af8daa62a3f992f20ecb1a281a527bfec3cd..d3a0c8154beca80cd7cb40bc3d783fa247891941 100644 (file)
--- a/arch/mips/cavium-octeon/octeon-irq.c
+++ b/arch/mips/cavium-octeon/octeon-irq.c
@@ -177,7 +177,7 @@ static void octeon_irq_ciu0_disable(unsigned int irq)
  }
  
  #ifdef CONFIG_SMP
-static void octeon_irq_ciu0_set_affinity(unsigned int irq, const struct cpumask *dest)
+static int octeon_irq_ciu0_set_affinity(unsigned int irq, const struct cpumask *dest)
  {
         int cpu;
         int bit = irq - OCTEON_IRQ_WORKQ0;      /* Bit 0-63 of EN0 */
@@ -199,6 +199,8 @@ static void octeon_irq_ciu0_set_affinity(unsigned int irq, const struct cpumask
          */
         cvmx_read_csr(CVMX_CIU_INTX_EN0(cvmx_get_core_num() * 2));
         write_unlock(&octeon_irq_ciu0_rwlock);
+
+       return 0;
  }
  #endif
  
@@ -292,7 +294,7 @@ static void octeon_irq_ciu1_disable(unsigned int irq)
  }
  
  #ifdef CONFIG_SMP
-static void octeon_irq_ciu1_set_affinity(unsigned int irq, const struct cpumask *dest)
+static int octeon_irq_ciu1_set_affinity(unsigned int irq, const struct cpumask *dest)
  {
         int cpu;
         int bit = irq - OCTEON_IRQ_WDOG0;       /* Bit 0-63 of EN1 */
@@ -315,6 +317,8 @@ static void octeon_irq_ciu1_set_affinity(unsigned int irq, const struct cpumask
          */
         cvmx_read_csr(CVMX_CIU_INTX_EN1(cvmx_get_core_num() * 2 + 1));
         write_unlock(&octeon_irq_ciu1_rwlock);
+
+       return 0;
  }
  #endif
  
diff --git a/arch/mips/include/asm/irq.h b/arch/mips/include/asm/irq.h

index 3214ade02d105988937576c8425f444b08783b0e..4f1eed107b08217f8f2991ed17f92c424b6c437f 100644 (file)
--- a/arch/mips/include/asm/irq.h
+++ b/arch/mips/include/asm/irq.h
@@ -49,7 +49,7 @@ static inline void smtc_im_ack_irq(unsigned int irq)
  #ifdef CONFIG_MIPS_MT_SMTC_IRQAFF
  #include <linux/cpumask.h>
  
-extern void plat_set_irq_affinity(unsigned int irq,
+extern int plat_set_irq_affinity(unsigned int irq,
                                   const struct cpumask *affinity);
  extern void smtc_forward_irq(unsigned int irq);
  
diff --git a/arch/mips/kernel/irq-gic.c b/arch/mips/kernel/irq-gic.c

index 87deb8f6c45885e5c356df3ac4ddf1e58a6ac81c..3f43c2e3aa5a59ede8eab7ecdaee7762ea6afb68 100644 (file)
--- a/arch/mips/kernel/irq-gic.c
+++ b/arch/mips/kernel/irq-gic.c
@@ -155,7 +155,7 @@ static void gic_unmask_irq(unsigned int irq)
  
  static DEFINE_SPINLOCK(gic_lock);
  
-static void gic_set_affinity(unsigned int irq, const struct cpumask *cpumask)
+static int gic_set_affinity(unsigned int irq, const struct cpumask *cpumask)
  {
         cpumask_t       tmp = CPU_MASK_NONE;
         unsigned long   flags;
@@ -166,7 +166,7 @@ static void gic_set_affinity(unsigned int irq, const struct cpumask *cpumask)
  
         cpumask_and(&tmp, cpumask, cpu_online_mask);
         if (cpus_empty(tmp))
-               return;
+               return -1;
  
         /* Assumption : cpumask refers to a single CPU */
         spin_lock_irqsave(&gic_lock, flags);
@@ -190,6 +190,7 @@ static void gic_set_affinity(unsigned int irq, const struct cpumask *cpumask)
         cpumask_copy(irq_desc[irq].affinity, cpumask);
         spin_unlock_irqrestore(&gic_lock, flags);
  
+       return 0;
  }
  #endif
  
diff --git a/arch/mips/mti-malta/malta-smtc.c b/arch/mips/mti-malta/malta-smtc.c

index 5ba31888fefbbecacd123e7033ef8ddcb6c30944..499ffe5475dff4fe8254990499d13d6f5ff4a266 100644 (file)
--- a/arch/mips/mti-malta/malta-smtc.c
+++ b/arch/mips/mti-malta/malta-smtc.c
@@ -114,7 +114,7 @@ struct plat_smp_ops msmtc_smp_ops = {
   */
  
  
-void plat_set_irq_affinity(unsigned int irq, const struct cpumask *affinity)
+int plat_set_irq_affinity(unsigned int irq, const struct cpumask *affinity)
  {
         cpumask_t tmask;
         int cpu = 0;
@@ -156,5 +156,7 @@ void plat_set_irq_affinity(unsigned int irq, const struct cpumask *affinity)
  
         /* Do any generic SMTC IRQ affinity setup */
         smtc_set_irq_affinity(irq, tmask);
+
+       return 0;
  }
  #endif /* CONFIG_MIPS_MT_SMTC_IRQAFF */
diff --git a/arch/mips/sibyte/bcm1480/irq.c b/arch/mips/sibyte/bcm1480/irq.c

index c147c4b35d3fc6f9e1f48b81866eb36d7f882594..690de06bde902f38a49b8d1e1765ae3fd2251b48 100644 (file)
--- a/arch/mips/sibyte/bcm1480/irq.c
+++ b/arch/mips/sibyte/bcm1480/irq.c
@@ -50,7 +50,7 @@ static void enable_bcm1480_irq(unsigned int irq);
  static void disable_bcm1480_irq(unsigned int irq);
  static void ack_bcm1480_irq(unsigned int irq);
  #ifdef CONFIG_SMP
-static void bcm1480_set_affinity(unsigned int irq, const struct cpumask *mask);
+static int bcm1480_set_affinity(unsigned int irq, const struct cpumask *mask);
  #endif
  
  #ifdef CONFIG_PCI
@@ -109,7 +109,7 @@ void bcm1480_unmask_irq(int cpu, int irq)
  }
  
  #ifdef CONFIG_SMP
-static void bcm1480_set_affinity(unsigned int irq, const struct cpumask *mask)
+static int bcm1480_set_affinity(unsigned int irq, const struct cpumask *mask)
  {
         int i = 0, old_cpu, cpu, int_on, k;
         u64 cur_ints;
@@ -118,7 +118,7 @@ static void bcm1480_set_affinity(unsigned int irq, const struct cpumask *mask)
  
         if (cpumask_weight(mask) != 1) {
                 printk("attempted to set irq affinity for irq %d to multiple CPUs\n", irq);
-               return;
+               return -1;
         }
         i = cpumask_first(mask);
  
@@ -152,6 +152,8 @@ static void bcm1480_set_affinity(unsigned int irq, const struct cpumask *mask)
                 }
         }
         spin_unlock_irqrestore(&bcm1480_imr_lock, flags);
+
+       return 0;
  }
  #endif
  
diff --git a/arch/mips/sibyte/sb1250/irq.c b/arch/mips/sibyte/sb1250/irq.c

index 38cb998ade22053b8963377b70d6f7dad7fb6b94..409dec798863194f364eb378666c9e7471fd0227 100644 (file)
--- a/arch/mips/sibyte/sb1250/irq.c
+++ b/arch/mips/sibyte/sb1250/irq.c
@@ -50,7 +50,7 @@ static void enable_sb1250_irq(unsigned int irq);
  static void disable_sb1250_irq(unsigned int irq);
  static void ack_sb1250_irq(unsigned int irq);
  #ifdef CONFIG_SMP
-static void sb1250_set_affinity(unsigned int irq, const struct cpumask *mask);
+static int sb1250_set_affinity(unsigned int irq, const struct cpumask *mask);
  #endif
  
  #ifdef CONFIG_SIBYTE_HAS_LDT
@@ -103,7 +103,7 @@ void sb1250_unmask_irq(int cpu, int irq)
  }
  
  #ifdef CONFIG_SMP
-static void sb1250_set_affinity(unsigned int irq, const struct cpumask *mask)
+static int sb1250_set_affinity(unsigned int irq, const struct cpumask *mask)
  {
         int i = 0, old_cpu, cpu, int_on;
         u64 cur_ints;
@@ -113,7 +113,7 @@ static void sb1250_set_affinity(unsigned int irq, const struct cpumask *mask)
  
         if (cpumask_weight(mask) > 1) {
                 printk("attempted to set irq affinity for irq %d to multiple CPUs\n", irq);
-               return;
+               return -1;
         }
  
         /* Convert logical CPU to physical CPU */
@@ -143,6 +143,8 @@ static void sb1250_set_affinity(unsigned int irq, const struct cpumask *mask)
                                         R_IMR_INTERRUPT_MASK));
         }
         spin_unlock_irqrestore(&sb1250_imr_lock, flags);
+
+       return 0;
  }
  #endif
  
diff --git a/arch/parisc/kernel/irq.c b/arch/parisc/kernel/irq.c

index 4ea4229d765ccc0657148d4f2268931ea3d6b8f9..8007f1e6572986559789a2520603a77336c72214 100644 (file)
--- a/arch/parisc/kernel/irq.c
+++ b/arch/parisc/kernel/irq.c
@@ -130,15 +130,17 @@ int cpu_check_affinity(unsigned int irq, const struct cpumask *dest)
         return cpu_dest;
  }
  
-static void cpu_set_affinity_irq(unsigned int irq, const struct cpumask *dest)
+static int cpu_set_affinity_irq(unsigned int irq, const struct cpumask *dest)
  {
         int cpu_dest;
  
         cpu_dest = cpu_check_affinity(irq, dest);
         if (cpu_dest < 0)
-               return;
+               return -1;
  
         cpumask_copy(&irq_desc[irq].affinity, dest);
+
+       return 0;
  }
  #endif
  
diff --git a/arch/powerpc/include/asm/hw_irq.h b/arch/powerpc/include/asm/hw_irq.h

index b7e034b0a6ddbd4bb9d491106e08b791ef3fac22..20a44d0c9fdde3a016fb3bdab168c7d9475753da 100644 (file)
--- a/arch/powerpc/include/asm/hw_irq.h
+++ b/arch/powerpc/include/asm/hw_irq.h
@@ -131,5 +131,44 @@ static inline int irqs_disabled_flags(unsigned long flags)
   */
  struct irq_chip;
  
+#ifdef CONFIG_PERF_COUNTERS
+static inline unsigned long test_perf_counter_pending(void)
+{
+       unsigned long x;
+
+       asm volatile("lbz %0,%1(13)"
+               : "=r" (x)
+               : "i" (offsetof(struct paca_struct, perf_counter_pending)));
+       return x;
+}
+
+static inline void set_perf_counter_pending(void)
+{
+       asm volatile("stb %0,%1(13)" : :
+               "r" (1),
+               "i" (offsetof(struct paca_struct, perf_counter_pending)));
+}
+
+static inline void clear_perf_counter_pending(void)
+{
+       asm volatile("stb %0,%1(13)" : :
+               "r" (0),
+               "i" (offsetof(struct paca_struct, perf_counter_pending)));
+}
+
+extern void perf_counter_do_pending(void);
+
+#else
+
+static inline unsigned long test_perf_counter_pending(void)
+{
+       return 0;
+}
+
+static inline void set_perf_counter_pending(void) {}
+static inline void clear_perf_counter_pending(void) {}
+static inline void perf_counter_do_pending(void) {}
+#endif /* CONFIG_PERF_COUNTERS */
+
  #endif /* __KERNEL__ */
  #endif /* _ASM_POWERPC_HW_IRQ_H */
diff --git a/arch/powerpc/include/asm/paca.h b/arch/powerpc/include/asm/paca.h

index 082b3aedf145b4fe3d368f39829cc87911e9b62a..6ef055723019f3048cb14866b8eedddce179ba04 100644 (file)
--- a/arch/powerpc/include/asm/paca.h
+++ b/arch/powerpc/include/asm/paca.h
@@ -99,6 +99,7 @@ struct paca_struct {
         u8 soft_enabled;                /* irq soft-enable flag */
         u8 hard_enabled;                /* set if irqs are enabled in MSR */
         u8 io_sync;                     /* writel() needs spin_unlock sync */
+       u8 perf_counter_pending;        /* PM interrupt while soft-disabled */
  
         /* Stuff for accurate time accounting */
         u64 user_time;                  /* accumulated usermode TB ticks */
diff --git a/arch/powerpc/include/asm/perf_counter.h b/arch/powerpc/include/asm/perf_counter.h

new file mode 100644 (file)

index 0000000..cc7c887
--- /dev/null
+++ b/arch/powerpc/include/asm/perf_counter.h
@@ -0,0 +1,98 @@
+/*
+ * Performance counter support - PowerPC-specific definitions.
+ *
+ * Copyright 2008-2009 Paul Mackerras, IBM Corporation.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+#include <linux/types.h>
+
+#define MAX_HWCOUNTERS         8
+#define MAX_EVENT_ALTERNATIVES 8
+#define MAX_LIMITED_HWCOUNTERS 2
+
+/*
+ * This struct provides the constants and functions needed to
+ * describe the PMU on a particular POWER-family CPU.
+ */
+struct power_pmu {
+       int     n_counter;
+       int     max_alternatives;
+       u64     add_fields;
+       u64     test_adder;
+       int     (*compute_mmcr)(u64 events[], int n_ev,
+                               unsigned int hwc[], u64 mmcr[]);
+       int     (*get_constraint)(u64 event, u64 *mskp, u64 *valp);
+       int     (*get_alternatives)(u64 event, unsigned int flags,
+                                   u64 alt[]);
+       void    (*disable_pmc)(unsigned int pmc, u64 mmcr[]);
+       int     (*limited_pmc_event)(u64 event);
+       u32     flags;
+       int     n_generic;
+       int     *generic_events;
+       int     (*cache_events)[PERF_COUNT_HW_CACHE_MAX]
+                              [PERF_COUNT_HW_CACHE_OP_MAX]
+                              [PERF_COUNT_HW_CACHE_RESULT_MAX];
+};
+
+extern struct power_pmu *ppmu;
+
+/*
+ * Values for power_pmu.flags
+ */
+#define PPMU_LIMITED_PMC5_6    1       /* PMC5/6 have limited function */
+#define PPMU_ALT_SIPR          2       /* uses alternate posn for SIPR/HV */
+
+/*
+ * Values for flags to get_alternatives()
+ */
+#define PPMU_LIMITED_PMC_OK    1       /* can put this on a limited PMC */
+#define PPMU_LIMITED_PMC_REQD  2       /* have to put this on a limited PMC */
+#define PPMU_ONLY_COUNT_RUN    4       /* only counting in run state */
+
+struct pt_regs;
+extern unsigned long perf_misc_flags(struct pt_regs *regs);
+#define perf_misc_flags(regs)  perf_misc_flags(regs)
+
+extern unsigned long perf_instruction_pointer(struct pt_regs *regs);
+
+/*
+ * The power_pmu.get_constraint function returns a 64-bit value and
+ * a 64-bit mask that express the constraints between this event and
+ * other events.
+ *
+ * The value and mask are divided up into (non-overlapping) bitfields
+ * of three different types:
+ *
+ * Select field: this expresses the constraint that some set of bits
+ * in MMCR* needs to be set to a specific value for this event.  For a
+ * select field, the mask contains 1s in every bit of the field, and
+ * the value contains a unique value for each possible setting of the
+ * MMCR* bits.  The constraint checking code will ensure that two events
+ * that set the same field in their masks have the same value in their
+ * value dwords.
+ *
+ * Add field: this expresses the constraint that there can be at most
+ * N events in a particular class.  A field of k bits can be used for
+ * N <= 2^(k-1) - 1.  The mask has the most significant bit of the field
+ * set (and the other bits 0), and the value has only the least significant
+ * bit of the field set.  In addition, the 'add_fields' and 'test_adder'
+ * in the struct power_pmu for this processor come into play.  The
+ * add_fields value contains 1 in the LSB of the field, and the
+ * test_adder contains 2^(k-1) - 1 - N in the field.
+ *
+ * NAND field: this expresses the constraint that you may not have events
+ * in all of a set of classes.  (For example, on PPC970, you can't select
+ * events from the FPU, ISU and IDU simultaneously, although any two are
+ * possible.)  For N classes, the field is N+1 bits wide, and each class
+ * is assigned one bit from the least-significant N bits.  The mask has
+ * only the most-significant bit set, and the value has only the bit
+ * for the event's class set.  The test_adder has the least significant
+ * bit set in the field.
+ *
+ * If an event is not subject to the constraint expressed by a particular
+ * field, then it will have 0 in both the mask and value for that field.
+ */
diff --git a/arch/powerpc/include/asm/reg.h b/arch/powerpc/include/asm/reg.h

index e8018d540e87db3f3c0f5bf44b7453dc57dbe9d9..fb359b0a6937de645c8101709ac4ade7fd9df446 100644 (file)
--- a/arch/powerpc/include/asm/reg.h
+++ b/arch/powerpc/include/asm/reg.h
@@ -492,11 +492,13 @@
  #define   MMCR0_FCHV   0x00000001UL /* freeze conditions in hypervisor mode */
  #define SPRN_MMCR1     798
  #define SPRN_MMCRA     0x312
+#define   MMCRA_SDSYNC 0x80000000UL /* SDAR synced with SIAR */
  #define   MMCRA_SIHV   0x10000000UL /* state of MSR HV when SIAR set */
  #define   MMCRA_SIPR   0x08000000UL /* state of MSR PR when SIAR set */
  #define   MMCRA_SLOT   0x07000000UL /* SLOT bits (37-39) */
  #define   MMCRA_SLOT_SHIFT     24
  #define   MMCRA_SAMPLE_ENABLE 0x00000001UL /* enable sampling */
+#define   POWER6_MMCRA_SDSYNC 0x0000080000000000ULL    /* SDAR/SIAR synced */
  #define   POWER6_MMCRA_SIHV   0x0000040000000000ULL
  #define   POWER6_MMCRA_SIPR   0x0000020000000000ULL
  #define   POWER6_MMCRA_THRM    0x00000020UL
diff --git a/arch/powerpc/include/asm/systbl.h b/arch/powerpc/include/asm/systbl.h

index d98a30dfd41ca2e11c8f7212097182ab6caaeaa2..a0b92de51c7edfa594941f134de592c77eb49b3e 100644 (file)
--- a/arch/powerpc/include/asm/systbl.h
+++ b/arch/powerpc/include/asm/systbl.h
@@ -322,6 +322,6 @@ SYSCALL_SPU(epoll_create1)
  SYSCALL_SPU(dup3)
  SYSCALL_SPU(pipe2)
  SYSCALL(inotify_init1)
-SYSCALL(ni_syscall)
+SYSCALL_SPU(perf_counter_open)
  COMPAT_SYS_SPU(preadv)
  COMPAT_SYS_SPU(pwritev)
diff --git a/arch/powerpc/include/asm/unistd.h b/arch/powerpc/include/asm/unistd.h

index 3f06f8ec81c513639e96194ca7b40262e35cf0ca..4badac2d11d1a7f3c8989ffb5be20dd4cd131a57 100644 (file)
--- a/arch/powerpc/include/asm/unistd.h
+++ b/arch/powerpc/include/asm/unistd.h
@@ -341,6 +341,7 @@
  #define __NR_dup3              316
  #define __NR_pipe2             317
  #define __NR_inotify_init1     318
+#define __NR_perf_counter_open 319
  #define __NR_preadv            320
  #define __NR_pwritev           321
  
diff --git a/arch/powerpc/kernel/Makefile b/arch/powerpc/kernel/Makefile

index 71901fbda4a5649d737e7f37177da4fa46c3a666..a2c683403c2bea4a76d8d946cfb673b4526687dc 100644 (file)
--- a/arch/powerpc/kernel/Makefile
+++ b/arch/powerpc/kernel/Makefile
@@ -94,6 +94,9 @@ obj64-$(CONFIG_AUDIT)         += compat_audit.o
  
  obj-$(CONFIG_DYNAMIC_FTRACE)   += ftrace.o
  obj-$(CONFIG_FUNCTION_GRAPH_TRACER)    += ftrace.o
+obj-$(CONFIG_PERF_COUNTERS)    += perf_counter.o power4-pmu.o ppc970-pmu.o \
+                                  power5-pmu.o power5+-pmu.o power6-pmu.o \
+                                  power7-pmu.o
  
  obj-$(CONFIG_8XX_MINIMAL_FPEMU) += softemu8xx.o
  
diff --git a/arch/powerpc/kernel/asm-offsets.c b/arch/powerpc/kernel/asm-offsets.c

index 1e40bc0539464602451071c88109f5c9be2282fd..e981d1ce1914b20994be3d63d2e0d5acba86deda 100644 (file)
--- a/arch/powerpc/kernel/asm-offsets.c
+++ b/arch/powerpc/kernel/asm-offsets.c
@@ -131,6 +131,7 @@ int main(void)
         DEFINE(PACAKMSR, offsetof(struct paca_struct, kernel_msr));
         DEFINE(PACASOFTIRQEN, offsetof(struct paca_struct, soft_enabled));
         DEFINE(PACAHARDIRQEN, offsetof(struct paca_struct, hard_enabled));
+       DEFINE(PACAPERFPEND, offsetof(struct paca_struct, perf_counter_pending));
         DEFINE(PACASLBCACHE, offsetof(struct paca_struct, slb_cache));
         DEFINE(PACASLBCACHEPTR, offsetof(struct paca_struct, slb_cache_ptr));
         DEFINE(PACACONTEXTID, offsetof(struct paca_struct, context.id));
diff --git a/arch/powerpc/kernel/entry_64.S b/arch/powerpc/kernel/entry_64.S

index abfc3233047900ac675559928adc73b52466f75b..43e073477c34adeac616067a62b4302dbc66d6d6 100644 (file)
--- a/arch/powerpc/kernel/entry_64.S
+++ b/arch/powerpc/kernel/entry_64.S
@@ -526,6 +526,15 @@ ALT_FW_FTR_SECTION_END_IFCLR(FW_FEATURE_ISERIES)
  2:
         TRACE_AND_RESTORE_IRQ(r5);
  
+#ifdef CONFIG_PERF_COUNTERS
+       /* check paca->perf_counter_pending if we're enabling ints */
+       lbz     r3,PACAPERFPEND(r13)
+       and.    r3,r3,r5
+       beq     27f
+       bl      .perf_counter_do_pending
+27:
+#endif /* CONFIG_PERF_COUNTERS */
+
         /* extract EE bit and use it to restore paca->hard_enabled */
         ld      r3,_MSR(r1)
         rldicl  r4,r3,49,63             /* r0 = (r3 >> 15) & 1 */
diff --git a/arch/powerpc/kernel/irq.c b/arch/powerpc/kernel/irq.c

index 8c1a4966867e2b40de5597b056c88a68d6d06e3b..feff792ed0f9f7e5a736e293fa91c0ec2c6d2f3b 100644 (file)
--- a/arch/powerpc/kernel/irq.c
+++ b/arch/powerpc/kernel/irq.c
@@ -135,6 +135,11 @@ notrace void raw_local_irq_restore(unsigned long en)
                         iseries_handle_interrupts();
         }
  
+       if (test_perf_counter_pending()) {
+               clear_perf_counter_pending();
+               perf_counter_do_pending();
+       }
+
         /*
          * if (get_paca()->hard_enabled) return;
          * But again we need to take care that gcc gets hard_enabled directly
diff --git a/arch/powerpc/kernel/perf_counter.c b/arch/powerpc/kernel/perf_counter.c

new file mode 100644 (file)

index 0000000..bb20238
--- /dev/null
+++ b/arch/powerpc/kernel/perf_counter.c
@@ -0,0 +1,1263 @@
+/*
+ * Performance counter support - powerpc architecture code
+ *
+ * Copyright 2008-2009 Paul Mackerras, IBM Corporation.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+#include <linux/kernel.h>
+#include <linux/sched.h>
+#include <linux/perf_counter.h>
+#include <linux/percpu.h>
+#include <linux/hardirq.h>
+#include <asm/reg.h>
+#include <asm/pmc.h>
+#include <asm/machdep.h>
+#include <asm/firmware.h>
+#include <asm/ptrace.h>
+
+struct cpu_hw_counters {
+       int n_counters;
+       int n_percpu;
+       int disabled;
+       int n_added;
+       int n_limited;
+       u8  pmcs_enabled;
+       struct perf_counter *counter[MAX_HWCOUNTERS];
+       u64 events[MAX_HWCOUNTERS];
+       unsigned int flags[MAX_HWCOUNTERS];
+       u64 mmcr[3];
+       struct perf_counter *limited_counter[MAX_LIMITED_HWCOUNTERS];
+       u8  limited_hwidx[MAX_LIMITED_HWCOUNTERS];
+};
+DEFINE_PER_CPU(struct cpu_hw_counters, cpu_hw_counters);
+
+struct power_pmu *ppmu;
+
+/*
+ * Normally, to ignore kernel events we set the FCS (freeze counters
+ * in supervisor mode) bit in MMCR0, but if the kernel runs with the
+ * hypervisor bit set in the MSR, or if we are running on a processor
+ * where the hypervisor bit is forced to 1 (as on Apple G5 processors),
+ * then we need to use the FCHV bit to ignore kernel events.
+ */
+static unsigned int freeze_counters_kernel = MMCR0_FCS;
+
+static void perf_counter_interrupt(struct pt_regs *regs);
+
+void perf_counter_print_debug(void)
+{
+}
+
+/*
+ * Read one performance monitor counter (PMC).
+ */
+static unsigned long read_pmc(int idx)
+{
+       unsigned long val;
+
+       switch (idx) {
+       case 1:
+               val = mfspr(SPRN_PMC1);
+               break;
+       case 2:
+               val = mfspr(SPRN_PMC2);
+               break;
+       case 3:
+               val = mfspr(SPRN_PMC3);
+               break;
+       case 4:
+               val = mfspr(SPRN_PMC4);
+               break;
+       case 5:
+               val = mfspr(SPRN_PMC5);
+               break;
+       case 6:
+               val = mfspr(SPRN_PMC6);
+               break;
+       case 7:
+               val = mfspr(SPRN_PMC7);
+               break;
+       case 8:
+               val = mfspr(SPRN_PMC8);
+               break;
+       default:
+               printk(KERN_ERR "oops trying to read PMC%d\n", idx);
+               val = 0;
+       }
+       return val;
+}
+
+/*
+ * Write one PMC.
+ */
+static void write_pmc(int idx, unsigned long val)
+{
+       switch (idx) {
+       case 1:
+               mtspr(SPRN_PMC1, val);
+               break;
+       case 2:
+               mtspr(SPRN_PMC2, val);
+               break;
+       case 3:
+               mtspr(SPRN_PMC3, val);
+               break;
+       case 4:
+               mtspr(SPRN_PMC4, val);
+               break;
+       case 5:
+               mtspr(SPRN_PMC5, val);
+               break;
+       case 6:
+               mtspr(SPRN_PMC6, val);
+               break;
+       case 7:
+               mtspr(SPRN_PMC7, val);
+               break;
+       case 8:
+               mtspr(SPRN_PMC8, val);
+               break;
+       default:
+               printk(KERN_ERR "oops trying to write PMC%d\n", idx);
+       }
+}
+
+/*
+ * Check if a set of events can all go on the PMU at once.
+ * If they can't, this will look at alternative codes for the events
+ * and see if any combination of alternative codes is feasible.
+ * The feasible set is returned in event[].
+ */
+static int power_check_constraints(u64 event[], unsigned int cflags[],
+                                  int n_ev)
+{
+       u64 mask, value, nv;
+       u64 alternatives[MAX_HWCOUNTERS][MAX_EVENT_ALTERNATIVES];
+       u64 amasks[MAX_HWCOUNTERS][MAX_EVENT_ALTERNATIVES];
+       u64 avalues[MAX_HWCOUNTERS][MAX_EVENT_ALTERNATIVES];
+       u64 smasks[MAX_HWCOUNTERS], svalues[MAX_HWCOUNTERS];
+       int n_alt[MAX_HWCOUNTERS], choice[MAX_HWCOUNTERS];
+       int i, j;
+       u64 addf = ppmu->add_fields;
+       u64 tadd = ppmu->test_adder;
+
+       if (n_ev > ppmu->n_counter)
+               return -1;
+
+       /* First see if the events will go on as-is */
+       for (i = 0; i < n_ev; ++i) {
+               if ((cflags[i] & PPMU_LIMITED_PMC_REQD)
+                   && !ppmu->limited_pmc_event(event[i])) {
+                       ppmu->get_alternatives(event[i], cflags[i],
+                                              alternatives[i]);
+                       event[i] = alternatives[i][0];
+               }
+               if (ppmu->get_constraint(event[i], &amasks[i][0],
+                                        &avalues[i][0]))
+                       return -1;
+       }
+       value = mask = 0;
+       for (i = 0; i < n_ev; ++i) {
+               nv = (value | avalues[i][0]) + (value & avalues[i][0] & addf);
+               if ((((nv + tadd) ^ value) & mask) != 0 ||
+                   (((nv + tadd) ^ avalues[i][0]) & amasks[i][0]) != 0)
+                       break;
+               value = nv;
+               mask |= amasks[i][0];
+       }
+       if (i == n_ev)
+               return 0;       /* all OK */
+
+       /* doesn't work, gather alternatives... */
+       if (!ppmu->get_alternatives)
+               return -1;
+       for (i = 0; i < n_ev; ++i) {
+               choice[i] = 0;
+               n_alt[i] = ppmu->get_alternatives(event[i], cflags[i],
+                                                 alternatives[i]);
+               for (j = 1; j < n_alt[i]; ++j)
+                       ppmu->get_constraint(alternatives[i][j],
+                                            &amasks[i][j], &avalues[i][j]);
+       }
+
+       /* enumerate all possibilities and see if any will work */
+       i = 0;
+       j = -1;
+       value = mask = nv = 0;
+       while (i < n_ev) {
+               if (j >= 0) {
+                       /* we're backtracking, restore context */
+                       value = svalues[i];
+                       mask = smasks[i];
+                       j = choice[i];
+               }
+               /*
+                * See if any alternative k for event i,
+                * where k > j, will satisfy the constraints.
+                */
+               while (++j < n_alt[i]) {
+                       nv = (value | avalues[i][j]) +
+                               (value & avalues[i][j] & addf);
+                       if ((((nv + tadd) ^ value) & mask) == 0 &&
+                           (((nv + tadd) ^ avalues[i][j])
+                            & amasks[i][j]) == 0)
+                               break;
+               }
+               if (j >= n_alt[i]) {
+                       /*
+                        * No feasible alternative, backtrack
+                        * to event i-1 and continue enumerating its
+                        * alternatives from where we got up to.
+                        */
+                       if (--i < 0)
+                               return -1;
+               } else {
+                       /*
+                        * Found a feasible alternative for event i,
+                        * remember where we got up to with this event,
+                        * go on to the next event, and start with
+                        * the first alternative for it.
+                        */
+                       choice[i] = j;
+                       svalues[i] = value;
+                       smasks[i] = mask;
+                       value = nv;
+                       mask |= amasks[i][j];
+                       ++i;
+                       j = -1;
+               }
+       }
+
+       /* OK, we have a feasible combination, tell the caller the solution */
+       for (i = 0; i < n_ev; ++i)
+               event[i] = alternatives[i][choice[i]];
+       return 0;
+}
+
+/*
+ * Check if newly-added counters have consistent settings for
+ * exclude_{user,kernel,hv} with each other and any previously
+ * added counters.
+ */
+static int check_excludes(struct perf_counter **ctrs, unsigned int cflags[],
+                         int n_prev, int n_new)
+{
+       int eu = 0, ek = 0, eh = 0;
+       int i, n, first;
+       struct perf_counter *counter;
+
+       n = n_prev + n_new;
+       if (n <= 1)
+               return 0;
+
+       first = 1;
+       for (i = 0; i < n; ++i) {
+               if (cflags[i] & PPMU_LIMITED_PMC_OK) {
+                       cflags[i] &= ~PPMU_LIMITED_PMC_REQD;
+                       continue;
+               }
+               counter = ctrs[i];
+               if (first) {
+                       eu = counter->attr.exclude_user;
+                       ek = counter->attr.exclude_kernel;
+                       eh = counter->attr.exclude_hv;
+                       first = 0;
+               } else if (counter->attr.exclude_user != eu ||
+                          counter->attr.exclude_kernel != ek ||
+                          counter->attr.exclude_hv != eh) {
+                       return -EAGAIN;
+               }
+       }
+
+       if (eu || ek || eh)
+               for (i = 0; i < n; ++i)
+                       if (cflags[i] & PPMU_LIMITED_PMC_OK)
+                               cflags[i] |= PPMU_LIMITED_PMC_REQD;
+
+       return 0;
+}
+
+static void power_pmu_read(struct perf_counter *counter)
+{
+       long val, delta, prev;
+
+       if (!counter->hw.idx)
+               return;
+       /*
+        * Performance monitor interrupts come even when interrupts
+        * are soft-disabled, as long as interrupts are hard-enabled.
+        * Therefore we treat them like NMIs.
+        */
+       do {
+               prev = atomic64_read(&counter->hw.prev_count);
+               barrier();
+               val = read_pmc(counter->hw.idx);
+       } while (atomic64_cmpxchg(&counter->hw.prev_count, prev, val) != prev);
+
+       /* The counters are only 32 bits wide */
+       delta = (val - prev) & 0xfffffffful;
+       atomic64_add(delta, &counter->count);
+       atomic64_sub(delta, &counter->hw.period_left);
+}
+
+/*
+ * On some machines, PMC5 and PMC6 can't be written, don't respect
+ * the freeze conditions, and don't generate interrupts.  This tells
+ * us if `counter' is using such a PMC.
+ */
+static int is_limited_pmc(int pmcnum)
+{
+       return (ppmu->flags & PPMU_LIMITED_PMC5_6)
+               && (pmcnum == 5 || pmcnum == 6);
+}
+
+static void freeze_limited_counters(struct cpu_hw_counters *cpuhw,
+                                   unsigned long pmc5, unsigned long pmc6)
+{
+       struct perf_counter *counter;
+       u64 val, prev, delta;
+       int i;
+
+       for (i = 0; i < cpuhw->n_limited; ++i) {
+               counter = cpuhw->limited_counter[i];
+               if (!counter->hw.idx)
+                       continue;
+               val = (counter->hw.idx == 5) ? pmc5 : pmc6;
+               prev = atomic64_read(&counter->hw.prev_count);
+               counter->hw.idx = 0;
+               delta = (val - prev) & 0xfffffffful;
+               atomic64_add(delta, &counter->count);
+       }
+}
+
+static void thaw_limited_counters(struct cpu_hw_counters *cpuhw,
+                                 unsigned long pmc5, unsigned long pmc6)
+{
+       struct perf_counter *counter;
+       u64 val;
+       int i;
+
+       for (i = 0; i < cpuhw->n_limited; ++i) {
+               counter = cpuhw->limited_counter[i];
+               counter->hw.idx = cpuhw->limited_hwidx[i];
+               val = (counter->hw.idx == 5) ? pmc5 : pmc6;
+               atomic64_set(&counter->hw.prev_count, val);
+               perf_counter_update_userpage(counter);
+       }
+}
+
+/*
+ * Since limited counters don't respect the freeze conditions, we
+ * have to read them immediately after freezing or unfreezing the
+ * other counters.  We try to keep the values from the limited
+ * counters as consistent as possible by keeping the delay (in
+ * cycles and instructions) between freezing/unfreezing and reading
+ * the limited counters as small and consistent as possible.
+ * Therefore, if any limited counters are in use, we read them
+ * both, and always in the same order, to minimize variability,
+ * and do it inside the same asm that writes MMCR0.
+ */
+static void write_mmcr0(struct cpu_hw_counters *cpuhw, unsigned long mmcr0)
+{
+       unsigned long pmc5, pmc6;
+
+       if (!cpuhw->n_limited) {
+               mtspr(SPRN_MMCR0, mmcr0);
+               return;
+       }
+
+       /*
+        * Write MMCR0, then read PMC5 and PMC6 immediately.
+        * To ensure we don't get a performance monitor interrupt
+        * between writing MMCR0 and freezing/thawing the limited
+        * counters, we first write MMCR0 with the counter overflow
+        * interrupt enable bits turned off.
+        */
+       asm volatile("mtspr %3,%2; mfspr %0,%4; mfspr %1,%5"
+                    : "=&r" (pmc5), "=&r" (pmc6)
+                    : "r" (mmcr0 & ~(MMCR0_PMC1CE | MMCR0_PMCjCE)),
+                      "i" (SPRN_MMCR0),
+                      "i" (SPRN_PMC5), "i" (SPRN_PMC6));
+
+       if (mmcr0 & MMCR0_FC)
+               freeze_limited_counters(cpuhw, pmc5, pmc6);
+       else
+               thaw_limited_counters(cpuhw, pmc5, pmc6);
+
+       /*
+        * Write the full MMCR0 including the counter overflow interrupt
+        * enable bits, if necessary.
+        */
+       if (mmcr0 & (MMCR0_PMC1CE | MMCR0_PMCjCE))
+               mtspr(SPRN_MMCR0, mmcr0);
+}
+
+/*
+ * Disable all counters to prevent PMU interrupts and to allow
+ * counters to be added or removed.
+ */
+void hw_perf_disable(void)
+{
+       struct cpu_hw_counters *cpuhw;
+       unsigned long ret;
+       unsigned long flags;
+
+       local_irq_save(flags);
+       cpuhw = &__get_cpu_var(cpu_hw_counters);
+
+       ret = cpuhw->disabled;
+       if (!ret) {
+               cpuhw->disabled = 1;
+               cpuhw->n_added = 0;
+
+               /*
+                * Check if we ever enabled the PMU on this cpu.
+                */
+               if (!cpuhw->pmcs_enabled) {
+                       if (ppc_md.enable_pmcs)
+                               ppc_md.enable_pmcs();
+                       cpuhw->pmcs_enabled = 1;
+               }
+
+               /*
+                * Disable instruction sampling if it was enabled
+                */
+               if (cpuhw->mmcr[2] & MMCRA_SAMPLE_ENABLE) {
+                       mtspr(SPRN_MMCRA,
+                             cpuhw->mmcr[2] & ~MMCRA_SAMPLE_ENABLE);
+                       mb();
+               }
+
+               /*
+                * Set the 'freeze counters' bit.
+                * The barrier is to make sure the mtspr has been
+                * executed and the PMU has frozen the counters
+                * before we return.
+                */
+               write_mmcr0(cpuhw, mfspr(SPRN_MMCR0) | MMCR0_FC);
+               mb();
+       }
+       local_irq_restore(flags);
+}
+
+/*
+ * Re-enable all counters if disable == 0.
+ * If we were previously disabled and counters were added, then
+ * put the new config on the PMU.
+ */
+void hw_perf_enable(void)
+{
+       struct perf_counter *counter;
+       struct cpu_hw_counters *cpuhw;
+       unsigned long flags;
+       long i;
+       unsigned long val;
+       s64 left;
+       unsigned int hwc_index[MAX_HWCOUNTERS];
+       int n_lim;
+       int idx;
+
+       local_irq_save(flags);
+       cpuhw = &__get_cpu_var(cpu_hw_counters);
+       if (!cpuhw->disabled) {
+               local_irq_restore(flags);
+               return;
+       }
+       cpuhw->disabled = 0;
+
+       /*
+        * If we didn't change anything, or only removed counters,
+        * no need to recalculate MMCR* settings and reset the PMCs.
+        * Just reenable the PMU with the current MMCR* settings
+        * (possibly updated for removal of counters).
+        */
+       if (!cpuhw->n_added) {
+               mtspr(SPRN_MMCRA, cpuhw->mmcr[2] & ~MMCRA_SAMPLE_ENABLE);
+               mtspr(SPRN_MMCR1, cpuhw->mmcr[1]);
+               if (cpuhw->n_counters == 0)
+                       get_lppaca()->pmcregs_in_use = 0;
+               goto out_enable;
+       }
+
+       /*
+        * Compute MMCR* values for the new set of counters
+        */
+       if (ppmu->compute_mmcr(cpuhw->events, cpuhw->n_counters, hwc_index,
+                              cpuhw->mmcr)) {
+               /* shouldn't ever get here */
+               printk(KERN_ERR "oops compute_mmcr failed\n");
+               goto out;
+       }
+
+       /*
+        * Add in MMCR0 freeze bits corresponding to the
+        * attr.exclude_* bits for the first counter.
+        * We have already checked that all counters have the
+        * same values for these bits as the first counter.
+        */
+       counter = cpuhw->counter[0];
+       if (counter->attr.exclude_user)
+               cpuhw->mmcr[0] |= MMCR0_FCP;
+       if (counter->attr.exclude_kernel)
+               cpuhw->mmcr[0] |= freeze_counters_kernel;
+       if (counter->attr.exclude_hv)
+               cpuhw->mmcr[0] |= MMCR0_FCHV;
+
+       /*
+        * Write the new configuration to MMCR* with the freeze
+        * bit set and set the hardware counters to their initial values.
+        * Then unfreeze the counters.
+        */
+       get_lppaca()->pmcregs_in_use = 1;
+       mtspr(SPRN_MMCRA, cpuhw->mmcr[2] & ~MMCRA_SAMPLE_ENABLE);
+       mtspr(SPRN_MMCR1, cpuhw->mmcr[1]);
+       mtspr(SPRN_MMCR0, (cpuhw->mmcr[0] & ~(MMCR0_PMC1CE | MMCR0_PMCjCE))
+                               | MMCR0_FC);
+
+       /*
+        * Read off any pre-existing counters that need to move
+        * to another PMC.
+        */
+       for (i = 0; i < cpuhw->n_counters; ++i) {
+               counter = cpuhw->counter[i];
+               if (counter->hw.idx && counter->hw.idx != hwc_index[i] + 1) {
+                       power_pmu_read(counter);
+                       write_pmc(counter->hw.idx, 0);
+                       counter->hw.idx = 0;
+               }
+       }
+
+       /*
+        * Initialize the PMCs for all the new and moved counters.
+        */
+       cpuhw->n_limited = n_lim = 0;
+       for (i = 0; i < cpuhw->n_counters; ++i) {
+               counter = cpuhw->counter[i];
+               if (counter->hw.idx)
+                       continue;
+               idx = hwc_index[i] + 1;
+               if (is_limited_pmc(idx)) {
+                       cpuhw->limited_counter[n_lim] = counter;
+                       cpuhw->limited_hwidx[n_lim] = idx;
+                       ++n_lim;
+                       continue;
+               }
+               val = 0;
+               if (counter->hw.sample_period) {
+                       left = atomic64_read(&counter->hw.period_left);
+                       if (left < 0x80000000L)
+                               val = 0x80000000L - left;
+               }
+               atomic64_set(&counter->hw.prev_count, val);
+               counter->hw.idx = idx;
+               write_pmc(idx, val);
+               perf_counter_update_userpage(counter);
+       }
+       cpuhw->n_limited = n_lim;
+       cpuhw->mmcr[0] |= MMCR0_PMXE | MMCR0_FCECE;
+
+ out_enable:
+       mb();
+       write_mmcr0(cpuhw, cpuhw->mmcr[0]);
+
+       /*
+        * Enable instruction sampling if necessary
+        */
+       if (cpuhw->mmcr[2] & MMCRA_SAMPLE_ENABLE) {
+               mb();
+               mtspr(SPRN_MMCRA, cpuhw->mmcr[2]);
+       }
+
+ out:
+       local_irq_restore(flags);
+}
+
+static int collect_events(struct perf_counter *group, int max_count,
+                         struct perf_counter *ctrs[], u64 *events,
+                         unsigned int *flags)
+{
+       int n = 0;
+       struct perf_counter *counter;
+
+       if (!is_software_counter(group)) {
+               if (n >= max_count)
+                       return -1;
+               ctrs[n] = group;
+               flags[n] = group->hw.counter_base;
+               events[n++] = group->hw.config;
+       }
+       list_for_each_entry(counter, &group->sibling_list, list_entry) {
+               if (!is_software_counter(counter) &&
+                   counter->state != PERF_COUNTER_STATE_OFF) {
+                       if (n >= max_count)
+                               return -1;
+                       ctrs[n] = counter;
+                       flags[n] = counter->hw.counter_base;
+                       events[n++] = counter->hw.config;
+               }
+       }
+       return n;
+}
+
+static void counter_sched_in(struct perf_counter *counter, int cpu)
+{
+       counter->state = PERF_COUNTER_STATE_ACTIVE;
+       counter->oncpu = cpu;
+       counter->tstamp_running += counter->ctx->time - counter->tstamp_stopped;
+       if (is_software_counter(counter))
+               counter->pmu->enable(counter);
+}
+
+/*
+ * Called to enable a whole group of counters.
+ * Returns 1 if the group was enabled, or -EAGAIN if it could not be.
+ * Assumes the caller has disabled interrupts and has
+ * frozen the PMU with hw_perf_save_disable.
+ */
+int hw_perf_group_sched_in(struct perf_counter *group_leader,
+              struct perf_cpu_context *cpuctx,
+              struct perf_counter_context *ctx, int cpu)
+{
+       struct cpu_hw_counters *cpuhw;
+       long i, n, n0;
+       struct perf_counter *sub;
+
+       cpuhw = &__get_cpu_var(cpu_hw_counters);
+       n0 = cpuhw->n_counters;
+       n = collect_events(group_leader, ppmu->n_counter - n0,
+                          &cpuhw->counter[n0], &cpuhw->events[n0],
+                          &cpuhw->flags[n0]);
+       if (n < 0)
+               return -EAGAIN;
+       if (check_excludes(cpuhw->counter, cpuhw->flags, n0, n))
+               return -EAGAIN;
+       i = power_check_constraints(cpuhw->events, cpuhw->flags, n + n0);
+       if (i < 0)
+               return -EAGAIN;
+       cpuhw->n_counters = n0 + n;
+       cpuhw->n_added += n;
+
+       /*
+        * OK, this group can go on; update counter states etc.,
+        * and enable any software counters
+        */
+       for (i = n0; i < n0 + n; ++i)
+               cpuhw->counter[i]->hw.config = cpuhw->events[i];
+       cpuctx->active_oncpu += n;
+       n = 1;
+       counter_sched_in(group_leader, cpu);
+       list_for_each_entry(sub, &group_leader->sibling_list, list_entry) {
+               if (sub->state != PERF_COUNTER_STATE_OFF) {
+                       counter_sched_in(sub, cpu);
+                       ++n;
+               }
+       }
+       ctx->nr_active += n;
+
+       return 1;
+}
+
+/*
+ * Add a counter to the PMU.
+ * If all counters are not already frozen, then we disable and
+ * re-enable the PMU in order to get hw_perf_enable to do the
+ * actual work of reconfiguring the PMU.
+ */
+static int power_pmu_enable(struct perf_counter *counter)
+{
+       struct cpu_hw_counters *cpuhw;
+       unsigned long flags;
+       int n0;
+       int ret = -EAGAIN;
+
+       local_irq_save(flags);
+       perf_disable();
+
+       /*
+        * Add the counter to the list (if there is room)
+        * and check whether the total set is still feasible.
+        */
+       cpuhw = &__get_cpu_var(cpu_hw_counters);
+       n0 = cpuhw->n_counters;
+       if (n0 >= ppmu->n_counter)
+               goto out;
+       cpuhw->counter[n0] = counter;
+       cpuhw->events[n0] = counter->hw.config;
+       cpuhw->flags[n0] = counter->hw.counter_base;
+       if (check_excludes(cpuhw->counter, cpuhw->flags, n0, 1))
+               goto out;
+       if (power_check_constraints(cpuhw->events, cpuhw->flags, n0 + 1))
+               goto out;
+
+       counter->hw.config = cpuhw->events[n0];
+       ++cpuhw->n_counters;
+       ++cpuhw->n_added;
+
+       ret = 0;
+ out:
+       perf_enable();
+       local_irq_restore(flags);
+       return ret;
+}
+
+/*
+ * Remove a counter from the PMU.
+ */
+static void power_pmu_disable(struct perf_counter *counter)
+{
+       struct cpu_hw_counters *cpuhw;
+       long i;
+       unsigned long flags;
+
+       local_irq_save(flags);
+       perf_disable();
+
+       power_pmu_read(counter);
+
+       cpuhw = &__get_cpu_var(cpu_hw_counters);
+       for (i = 0; i < cpuhw->n_counters; ++i) {
+               if (counter == cpuhw->counter[i]) {
+                       while (++i < cpuhw->n_counters)
+                               cpuhw->counter[i-1] = cpuhw->counter[i];
+                       --cpuhw->n_counters;
+                       ppmu->disable_pmc(counter->hw.idx - 1, cpuhw->mmcr);
+                       if (counter->hw.idx) {
+                               write_pmc(counter->hw.idx, 0);
+                               counter->hw.idx = 0;
+                       }
+                       perf_counter_update_userpage(counter);
+                       break;
+               }
+       }
+       for (i = 0; i < cpuhw->n_limited; ++i)
+               if (counter == cpuhw->limited_counter[i])
+                       break;
+       if (i < cpuhw->n_limited) {
+               while (++i < cpuhw->n_limited) {
+                       cpuhw->limited_counter[i-1] = cpuhw->limited_counter[i];
+                       cpuhw->limited_hwidx[i-1] = cpuhw->limited_hwidx[i];
+               }
+               --cpuhw->n_limited;
+       }
+       if (cpuhw->n_counters == 0) {
+               /* disable exceptions if no counters are running */
+               cpuhw->mmcr[0] &= ~(MMCR0_PMXE | MMCR0_FCECE);
+       }
+
+       perf_enable();
+       local_irq_restore(flags);
+}
+
+/*
+ * Re-enable interrupts on a counter after they were throttled
+ * because they were coming too fast.
+ */
+static void power_pmu_unthrottle(struct perf_counter *counter)
+{
+       s64 val, left;
+       unsigned long flags;
+
+       if (!counter->hw.idx || !counter->hw.sample_period)
+               return;
+       local_irq_save(flags);
+       perf_disable();
+       power_pmu_read(counter);
+       left = counter->hw.sample_period;
+       counter->hw.last_period = left;
+       val = 0;
+       if (left < 0x80000000L)
+               val = 0x80000000L - left;
+       write_pmc(counter->hw.idx, val);
+       atomic64_set(&counter->hw.prev_count, val);
+       atomic64_set(&counter->hw.period_left, left);
+       perf_counter_update_userpage(counter);
+       perf_enable();
+       local_irq_restore(flags);
+}
+
+struct pmu power_pmu = {
+       .enable         = power_pmu_enable,
+       .disable        = power_pmu_disable,
+       .read           = power_pmu_read,
+       .unthrottle     = power_pmu_unthrottle,
+};
+
+/*
+ * Return 1 if we might be able to put counter on a limited PMC,
+ * or 0 if not.
+ * A counter can only go on a limited PMC if it counts something
+ * that a limited PMC can count, doesn't require interrupts, and
+ * doesn't exclude any processor mode.
+ */
+static int can_go_on_limited_pmc(struct perf_counter *counter, u64 ev,
+                                unsigned int flags)
+{
+       int n;
+       u64 alt[MAX_EVENT_ALTERNATIVES];
+
+       if (counter->attr.exclude_user
+           || counter->attr.exclude_kernel
+           || counter->attr.exclude_hv
+           || counter->attr.sample_period)
+               return 0;
+
+       if (ppmu->limited_pmc_event(ev))
+               return 1;
+
+       /*
+        * The requested event isn't on a limited PMC already;
+        * see if any alternative code goes on a limited PMC.
+        */
+       if (!ppmu->get_alternatives)
+               return 0;
+
+       flags |= PPMU_LIMITED_PMC_OK | PPMU_LIMITED_PMC_REQD;
+       n = ppmu->get_alternatives(ev, flags, alt);
+
+       return n > 0;
+}
+
+/*
+ * Find an alternative event that goes on a normal PMC, if possible,
+ * and return the event code, or 0 if there is no such alternative.
+ * (Note: event code 0 is "don't count" on all machines.)
+ */
+static u64 normal_pmc_alternative(u64 ev, unsigned long flags)
+{
+       u64 alt[MAX_EVENT_ALTERNATIVES];
+       int n;
+
+       flags &= ~(PPMU_LIMITED_PMC_OK | PPMU_LIMITED_PMC_REQD);
+       n = ppmu->get_alternatives(ev, flags, alt);
+       if (!n)
+               return 0;
+       return alt[0];
+}
+
+/* Number of perf_counters counting hardware events */
+static atomic_t num_counters;
+/* Used to avoid races in calling reserve/release_pmc_hardware */
+static DEFINE_MUTEX(pmc_reserve_mutex);
+
+/*
+ * Release the PMU if this is the last perf_counter.
+ */
+static void hw_perf_counter_destroy(struct perf_counter *counter)
+{
+       if (!atomic_add_unless(&num_counters, -1, 1)) {
+               mutex_lock(&pmc_reserve_mutex);
+               if (atomic_dec_return(&num_counters) == 0)
+                       release_pmc_hardware();
+               mutex_unlock(&pmc_reserve_mutex);
+       }
+}
+
+/*
+ * Translate a generic cache event config to a raw event code.
+ */
+static int hw_perf_cache_event(u64 config, u64 *eventp)
+{
+       unsigned long type, op, result;
+       int ev;
+
+       if (!ppmu->cache_events)
+               return -EINVAL;
+
+       /* unpack config */
+       type = config & 0xff;
+       op = (config >> 8) & 0xff;
+       result = (config >> 16) & 0xff;
+
+       if (type >= PERF_COUNT_HW_CACHE_MAX ||
+           op >= PERF_COUNT_HW_CACHE_OP_MAX ||
+           result >= PERF_COUNT_HW_CACHE_RESULT_MAX)
+               return -EINVAL;
+
+       ev = (*ppmu->cache_events)[type][op][result];
+       if (ev == 0)
+               return -EOPNOTSUPP;
+       if (ev == -1)
+               return -EINVAL;
+       *eventp = ev;
+       return 0;
+}
+
+const struct pmu *hw_perf_counter_init(struct perf_counter *counter)
+{
+       u64 ev;
+       unsigned long flags;
+       struct perf_counter *ctrs[MAX_HWCOUNTERS];
+       u64 events[MAX_HWCOUNTERS];
+       unsigned int cflags[MAX_HWCOUNTERS];
+       int n;
+       int err;
+
+       if (!ppmu)
+               return ERR_PTR(-ENXIO);
+       switch (counter->attr.type) {
+       case PERF_TYPE_HARDWARE:
+               ev = counter->attr.config;
+               if (ev >= ppmu->n_generic || ppmu->generic_events[ev] == 0)
+                       return ERR_PTR(-EOPNOTSUPP);
+               ev = ppmu->generic_events[ev];
+               break;
+       case PERF_TYPE_HW_CACHE:
+               err = hw_perf_cache_event(counter->attr.config, &ev);
+               if (err)
+                       return ERR_PTR(err);
+               break;
+       case PERF_TYPE_RAW:
+               ev = counter->attr.config;
+               break;
+       }
+       counter->hw.config_base = ev;
+       counter->hw.idx = 0;
+
+       /*
+        * If we are not running on a hypervisor, force the
+        * exclude_hv bit to 0 so that we don't care what
+        * the user set it to.
+        */
+       if (!firmware_has_feature(FW_FEATURE_LPAR))
+               counter->attr.exclude_hv = 0;
+
+       /*
+        * If this is a per-task counter, then we can use
+        * PM_RUN_* events interchangeably with their non RUN_*
+        * equivalents, e.g. PM_RUN_CYC instead of PM_CYC.
+        * XXX we should check if the task is an idle task.
+        */
+       flags = 0;
+       if (counter->ctx->task)
+               flags |= PPMU_ONLY_COUNT_RUN;
+
+       /*
+        * If this machine has limited counters, check whether this
+        * event could go on a limited counter.
+        */
+       if (ppmu->flags & PPMU_LIMITED_PMC5_6) {
+               if (can_go_on_limited_pmc(counter, ev, flags)) {
+                       flags |= PPMU_LIMITED_PMC_OK;
+               } else if (ppmu->limited_pmc_event(ev)) {
+                       /*
+                        * The requested event is on a limited PMC,
+                        * but we can't use a limited PMC; see if any
+                        * alternative goes on a normal PMC.
+                        */
+                       ev = normal_pmc_alternative(ev, flags);
+                       if (!ev)
+                               return ERR_PTR(-EINVAL);
+               }
+       }
+
+       /*
+        * If this is in a group, check if it can go on with all the
+        * other hardware counters in the group.  We assume the counter
+        * hasn't been linked into its leader's sibling list at this point.
+        */
+       n = 0;
+       if (counter->group_leader != counter) {
+               n = collect_events(counter->group_leader, ppmu->n_counter - 1,
+                                  ctrs, events, cflags);
+               if (n < 0)
+                       return ERR_PTR(-EINVAL);
+       }
+       events[n] = ev;
+       ctrs[n] = counter;
+       cflags[n] = flags;
+       if (check_excludes(ctrs, cflags, n, 1))
+               return ERR_PTR(-EINVAL);
+       if (power_check_constraints(events, cflags, n + 1))
+               return ERR_PTR(-EINVAL);
+
+       counter->hw.config = events[n];
+       counter->hw.counter_base = cflags[n];
+       counter->hw.last_period = counter->hw.sample_period;
+       atomic64_set(&counter->hw.period_left, counter->hw.last_period);
+
+       /*
+        * See if we need to reserve the PMU.
+        * If no counters are currently in use, then we have to take a
+        * mutex to ensure that we don't race with another task doing
+        * reserve_pmc_hardware or release_pmc_hardware.
+        */
+       err = 0;
+       if (!atomic_inc_not_zero(&num_counters)) {
+               mutex_lock(&pmc_reserve_mutex);
+               if (atomic_read(&num_counters) == 0 &&
+                   reserve_pmc_hardware(perf_counter_interrupt))
+                       err = -EBUSY;
+               else
+                       atomic_inc(&num_counters);
+               mutex_unlock(&pmc_reserve_mutex);
+       }
+       counter->destroy = hw_perf_counter_destroy;
+
+       if (err)
+               return ERR_PTR(err);
+       return &power_pmu;
+}
+
+/*
+ * A counter has overflowed; update its count and record
+ * things if requested.  Note that interrupts are hard-disabled
+ * here so there is no possibility of being interrupted.
+ */
+static void record_and_restart(struct perf_counter *counter, long val,
+                              struct pt_regs *regs, int nmi)
+{
+       u64 period = counter->hw.sample_period;
+       s64 prev, delta, left;
+       int record = 0;
+       u64 addr, mmcra, sdsync;
+
+       /* we don't have to worry about interrupts here */
+       prev = atomic64_read(&counter->hw.prev_count);
+       delta = (val - prev) & 0xfffffffful;
+       atomic64_add(delta, &counter->count);
+
+       /*
+        * See if the total period for this counter has expired,
+        * and update for the next period.
+        */
+       val = 0;
+       left = atomic64_read(&counter->hw.period_left) - delta;
+       if (period) {
+               if (left <= 0) {
+                       left += period;
+                       if (left <= 0)
+                               left = period;
+                       record = 1;
+               }
+               if (left < 0x80000000L)
+                       val = 0x80000000L - left;
+       }
+
+       /*
+        * Finally record data if requested.
+        */
+       if (record) {
+               struct perf_sample_data data = {
+                       .regs   = regs,
+                       .addr   = 0,
+                       .period = counter->hw.last_period,
+               };
+
+               if (counter->attr.sample_type & PERF_SAMPLE_ADDR) {
+                       /*
+                        * The user wants a data address recorded.
+                        * If we're not doing instruction sampling,
+                        * give them the SDAR (sampled data address).
+                        * If we are doing instruction sampling, then only
+                        * give them the SDAR if it corresponds to the
+                        * instruction pointed to by SIAR; this is indicated
+                        * by the [POWER6_]MMCRA_SDSYNC bit in MMCRA.
+                        */
+                       mmcra = regs->dsisr;
+                       sdsync = (ppmu->flags & PPMU_ALT_SIPR) ?
+                               POWER6_MMCRA_SDSYNC : MMCRA_SDSYNC;
+                       if (!(mmcra & MMCRA_SAMPLE_ENABLE) || (mmcra & sdsync))
+                               data.addr = mfspr(SPRN_SDAR);
+               }
+               if (perf_counter_overflow(counter, nmi, &data)) {
+                       /*
+                        * Interrupts are coming too fast - throttle them
+                        * by setting the counter to 0, so it will be
+                        * at least 2^30 cycles until the next interrupt
+                        * (assuming each counter counts at most 2 counts
+                        * per cycle).
+                        */
+                       val = 0;
+                       left = ~0ULL >> 1;
+               }
+       }
+
+       write_pmc(counter->hw.idx, val);
+       atomic64_set(&counter->hw.prev_count, val);
+       atomic64_set(&counter->hw.period_left, left);
+       perf_counter_update_userpage(counter);
+}
+
+/*
+ * Called from generic code to get the misc flags (i.e. processor mode)
+ * for an event.
+ */
+unsigned long perf_misc_flags(struct pt_regs *regs)
+{
+       unsigned long mmcra;
+
+       if (TRAP(regs) != 0xf00) {
+               /* not a PMU interrupt */
+               return user_mode(regs) ? PERF_EVENT_MISC_USER :
+                       PERF_EVENT_MISC_KERNEL;
+       }
+
+       mmcra = regs->dsisr;
+       if (ppmu->flags & PPMU_ALT_SIPR) {
+               if (mmcra & POWER6_MMCRA_SIHV)
+                       return PERF_EVENT_MISC_HYPERVISOR;
+               return (mmcra & POWER6_MMCRA_SIPR) ? PERF_EVENT_MISC_USER :
+                       PERF_EVENT_MISC_KERNEL;
+       }
+       if (mmcra & MMCRA_SIHV)
+               return PERF_EVENT_MISC_HYPERVISOR;
+       return (mmcra & MMCRA_SIPR) ? PERF_EVENT_MISC_USER :
+                       PERF_EVENT_MISC_KERNEL;
+}
+
+/*
+ * Called from generic code to get the instruction pointer
+ * for an event.
+ */
+unsigned long perf_instruction_pointer(struct pt_regs *regs)
+{
+       unsigned long mmcra;
+       unsigned long ip;
+       unsigned long slot;
+
+       if (TRAP(regs) != 0xf00)
+               return regs->nip;       /* not a PMU interrupt */
+
+       ip = mfspr(SPRN_SIAR);
+       mmcra = regs->dsisr;
+       if ((mmcra & MMCRA_SAMPLE_ENABLE) && !(ppmu->flags & PPMU_ALT_SIPR)) {
+               slot = (mmcra & MMCRA_SLOT) >> MMCRA_SLOT_SHIFT;
+               if (slot > 1)
+                       ip += 4 * (slot - 1);
+       }
+       return ip;
+}
+
+/*
+ * Performance monitor interrupt stuff
+ */
+static void perf_counter_interrupt(struct pt_regs *regs)
+{
+       int i;
+       struct cpu_hw_counters *cpuhw = &__get_cpu_var(cpu_hw_counters);
+       struct perf_counter *counter;
+       long val;
+       int found = 0;
+       int nmi;
+
+       if (cpuhw->n_limited)
+               freeze_limited_counters(cpuhw, mfspr(SPRN_PMC5),
+                                       mfspr(SPRN_PMC6));
+
+       /*
+        * Overload regs->dsisr to store MMCRA so we only need to read it once.
+        */
+       regs->dsisr = mfspr(SPRN_MMCRA);
+
+       /*
+        * If interrupts were soft-disabled when this PMU interrupt
+        * occurred, treat it as an NMI.
+        */
+       nmi = !regs->softe;
+       if (nmi)
+               nmi_enter();
+       else
+               irq_enter();
+
+       for (i = 0; i < cpuhw->n_counters; ++i) {
+               counter = cpuhw->counter[i];
+               if (!counter->hw.idx || is_limited_pmc(counter->hw.idx))
+                       continue;
+               val = read_pmc(counter->hw.idx);
+               if ((int)val < 0) {
+                       /* counter has overflowed */
+                       found = 1;
+                       record_and_restart(counter, val, regs, nmi);
+               }
+       }
+
+       /*
+        * In case we didn't find and reset the counter that caused
+        * the interrupt, scan all counters and reset any that are
+        * negative, to avoid getting continual interrupts.
+        * Any that we processed in the previous loop will not be negative.
+        */
+       if (!found) {
+               for (i = 0; i < ppmu->n_counter; ++i) {
+                       if (is_limited_pmc(i + 1))
+                               continue;
+                       val = read_pmc(i + 1);
+                       if ((int)val < 0)
+                               write_pmc(i + 1, 0);
+               }
+       }
+
+       /*
+        * Reset MMCR0 to its normal value.  This will set PMXE and
+        * clear FC (freeze counters) and PMAO (perf mon alert occurred)
+        * and thus allow interrupts to occur again.
+        * XXX might want to use MSR.PM to keep the counters frozen until
+        * we get back out of this interrupt.
+        */
+       write_mmcr0(cpuhw, cpuhw->mmcr[0]);
+
+       if (nmi)
+               nmi_exit();
+       else
+               irq_exit();
+}
+
+void hw_perf_counter_setup(int cpu)
+{
+       struct cpu_hw_counters *cpuhw = &per_cpu(cpu_hw_counters, cpu);
+
+       memset(cpuhw, 0, sizeof(*cpuhw));
+       cpuhw->mmcr[0] = MMCR0_FC;
+}
+
+extern struct power_pmu power4_pmu;
+extern struct power_pmu ppc970_pmu;
+extern struct power_pmu power5_pmu;
+extern struct power_pmu power5p_pmu;
+extern struct power_pmu power6_pmu;
+extern struct power_pmu power7_pmu;
+
+static int init_perf_counters(void)
+{
+       unsigned long pvr;
+
+       /* XXX should get this from cputable */
+       pvr = mfspr(SPRN_PVR);
+       switch (PVR_VER(pvr)) {
+       case PV_POWER4:
+       case PV_POWER4p:
+               ppmu = &power4_pmu;
+               break;
+       case PV_970:
+       case PV_970FX:
+       case PV_970MP:
+               ppmu = &ppc970_pmu;
+               break;
+       case PV_POWER5:
+               ppmu = &power5_pmu;
+               break;
+       case PV_POWER5p:
+               ppmu = &power5p_pmu;
+               break;
+       case 0x3e:
+               ppmu = &power6_pmu;
+               break;
+       case 0x3f:
+               ppmu = &power7_pmu;
+               break;
+       }
+
+       /*
+        * Use FCHV to ignore kernel events if MSR.HV is set.
+        */
+       if (mfmsr() & MSR_HV)
+               freeze_counters_kernel = MMCR0_FCHV;
+
+       return 0;
+}
+
+arch_initcall(init_perf_counters);
diff --git a/arch/powerpc/kernel/power4-pmu.c b/arch/powerpc/kernel/power4-pmu.c

new file mode 100644 (file)

index 0000000..07bd308
--- /dev/null
+++ b/arch/powerpc/kernel/power4-pmu.c
@@ -0,0 +1,598 @@
+/*
+ * Performance counter support for POWER4 (GP) and POWER4+ (GQ) processors.
+ *
+ * Copyright 2009 Paul Mackerras, IBM Corporation.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+#include <linux/kernel.h>
+#include <linux/perf_counter.h>
+#include <asm/reg.h>
+
+/*
+ * Bits in event code for POWER4
+ */
+#define PM_PMC_SH      12      /* PMC number (1-based) for direct events */
+#define PM_PMC_MSK     0xf
+#define PM_UNIT_SH     8       /* TTMMUX number and setting - unit select */
+#define PM_UNIT_MSK    0xf
+#define PM_LOWER_SH    6
+#define PM_LOWER_MSK   1
+#define PM_LOWER_MSKS  0x40
+#define PM_BYTE_SH     4       /* Byte number of event bus to use */
+#define PM_BYTE_MSK    3
+#define PM_PMCSEL_MSK  7
+
+/*
+ * Unit code values
+ */
+#define PM_FPU         1
+#define PM_ISU1                2
+#define PM_IFU         3
+#define PM_IDU0                4
+#define PM_ISU1_ALT    6
+#define PM_ISU2                7
+#define PM_IFU_ALT     8
+#define PM_LSU0                9
+#define PM_LSU1                0xc
+#define PM_GPS         0xf
+
+/*
+ * Bits in MMCR0 for POWER4
+ */
+#define MMCR0_PMC1SEL_SH       8
+#define MMCR0_PMC2SEL_SH       1
+#define MMCR_PMCSEL_MSK                0x1f
+
+/*
+ * Bits in MMCR1 for POWER4
+ */
+#define MMCR1_TTM0SEL_SH       62
+#define MMCR1_TTC0SEL_SH       61
+#define MMCR1_TTM1SEL_SH       59
+#define MMCR1_TTC1SEL_SH       58
+#define MMCR1_TTM2SEL_SH       56
+#define MMCR1_TTC2SEL_SH       55
+#define MMCR1_TTM3SEL_SH       53
+#define MMCR1_TTC3SEL_SH       52
+#define MMCR1_TTMSEL_MSK       3
+#define MMCR1_TD_CP_DBG0SEL_SH 50
+#define MMCR1_TD_CP_DBG1SEL_SH 48
+#define MMCR1_TD_CP_DBG2SEL_SH 46
+#define MMCR1_TD_CP_DBG3SEL_SH 44
+#define MMCR1_DEBUG0SEL_SH     43
+#define MMCR1_DEBUG1SEL_SH     42
+#define MMCR1_DEBUG2SEL_SH     41
+#define MMCR1_DEBUG3SEL_SH     40
+#define MMCR1_PMC1_ADDER_SEL_SH        39
+#define MMCR1_PMC2_ADDER_SEL_SH        38
+#define MMCR1_PMC6_ADDER_SEL_SH        37
+#define MMCR1_PMC5_ADDER_SEL_SH        36
+#define MMCR1_PMC8_ADDER_SEL_SH        35
+#define MMCR1_PMC7_ADDER_SEL_SH        34
+#define MMCR1_PMC3_ADDER_SEL_SH        33
+#define MMCR1_PMC4_ADDER_SEL_SH        32
+#define MMCR1_PMC3SEL_SH       27
+#define MMCR1_PMC4SEL_SH       22
+#define MMCR1_PMC5SEL_SH       17
+#define MMCR1_PMC6SEL_SH       12
+#define MMCR1_PMC7SEL_SH       7
+#define MMCR1_PMC8SEL_SH       2       /* note bit 0 is in MMCRA for GP */
+
+static short mmcr1_adder_bits[8] = {
+       MMCR1_PMC1_ADDER_SEL_SH,
+       MMCR1_PMC2_ADDER_SEL_SH,
+       MMCR1_PMC3_ADDER_SEL_SH,
+       MMCR1_PMC4_ADDER_SEL_SH,
+       MMCR1_PMC5_ADDER_SEL_SH,
+       MMCR1_PMC6_ADDER_SEL_SH,
+       MMCR1_PMC7_ADDER_SEL_SH,
+       MMCR1_PMC8_ADDER_SEL_SH
+};
+
+/*
+ * Bits in MMCRA
+ */
+#define MMCRA_PMC8SEL0_SH      17      /* PMC8SEL bit 0 for GP */
+
+/*
+ * Layout of constraint bits:
+ * 6666555555555544444444443333333333222222222211111111110000000000
+ * 3210987654321098765432109876543210987654321098765432109876543210
+ *        |[  >[  >[   >|||[  >[  ><  ><  ><  ><  ><><><><><><><><>
+ *        | UC1 UC2 UC3 ||| PS1 PS2 B0  B1  B2  B3 P1P2P3P4P5P6P7P8
+ *       \SMPL         ||\TTC3SEL
+ *                     |\TTC_IFU_SEL
+ *                     \TTM2SEL0
+ *
+ * SMPL - SAMPLE_ENABLE constraint
+ *     56: SAMPLE_ENABLE value 0x0100_0000_0000_0000
+ *
+ * UC1 - unit constraint 1: can't have all three of FPU/ISU1/IDU0|ISU2
+ *     55: UC1 error 0x0080_0000_0000_0000
+ *     54: FPU events needed 0x0040_0000_0000_0000
+ *     53: ISU1 events needed 0x0020_0000_0000_0000
+ *     52: IDU0|ISU2 events needed 0x0010_0000_0000_0000
+ *
+ * UC2 - unit constraint 2: can't have all three of FPU/IFU/LSU0
+ *     51: UC2 error 0x0008_0000_0000_0000
+ *     50: FPU events needed 0x0004_0000_0000_0000
+ *     49: IFU events needed 0x0002_0000_0000_0000
+ *     48: LSU0 events needed 0x0001_0000_0000_0000
+ *
+ * UC3 - unit constraint 3: can't have all four of LSU0/IFU/IDU0|ISU2/ISU1
+ *     47: UC3 error 0x8000_0000_0000
+ *     46: LSU0 events needed 0x4000_0000_0000
+ *     45: IFU events needed 0x2000_0000_0000
+ *     44: IDU0|ISU2 events needed 0x1000_0000_0000
+ *     43: ISU1 events needed 0x0800_0000_0000
+ *
+ * TTM2SEL0
+ *     42: 0 = IDU0 events needed
+ *                1 = ISU2 events needed 0x0400_0000_0000
+ *
+ * TTC_IFU_SEL
+ *     41: 0 = IFU.U events needed
+ *                1 = IFU.L events needed 0x0200_0000_0000
+ *
+ * TTC3SEL
+ *     40: 0 = LSU1.U events needed
+ *                1 = LSU1.L events needed 0x0100_0000_0000
+ *
+ * PS1
+ *     39: PS1 error 0x0080_0000_0000
+ *     36-38: count of events needing PMC1/2/5/6 0x0070_0000_0000
+ *
+ * PS2
+ *     35: PS2 error 0x0008_0000_0000
+ *     32-34: count of events needing PMC3/4/7/8 0x0007_0000_0000
+ *
+ * B0
+ *     28-31: Byte 0 event source 0xf000_0000
+ *                1 = FPU
+ *        2 = ISU1
+ *        3 = IFU
+ *        4 = IDU0
+ *        7 = ISU2
+ *        9 = LSU0
+ *        c = LSU1
+ *        f = GPS
+ *
+ * B1, B2, B3
+ *     24-27, 20-23, 16-19: Byte 1, 2, 3 event sources
+ *
+ * P8
+ *     15: P8 error 0x8000
+ *     14-15: Count of events needing PMC8
+ *
+ * P1..P7
+ *     0-13: Count of events needing PMC1..PMC7
+ *
+ * Note: this doesn't allow events using IFU.U to be combined with events
+ * using IFU.L, though that is feasible (using TTM0 and TTM2).  However
+ * there are no listed events for IFU.L (they are debug events not
+ * verified for performance monitoring) so this shouldn't cause a
+ * problem.
+ */
+
+static struct unitinfo {
+       u64     value, mask;
+       int     unit;
+       int     lowerbit;
+} p4_unitinfo[16] = {
+       [PM_FPU]  = { 0x44000000000000ull, 0x88000000000000ull, PM_FPU, 0 },
+       [PM_ISU1] = { 0x20080000000000ull, 0x88000000000000ull, PM_ISU1, 0 },
+       [PM_ISU1_ALT] =
+                   { 0x20080000000000ull, 0x88000000000000ull, PM_ISU1, 0 },
+       [PM_IFU]  = { 0x02200000000000ull, 0x08820000000000ull, PM_IFU, 41 },
+       [PM_IFU_ALT] =
+                   { 0x02200000000000ull, 0x08820000000000ull, PM_IFU, 41 },
+       [PM_IDU0] = { 0x10100000000000ull, 0x80840000000000ull, PM_IDU0, 1 },
+       [PM_ISU2] = { 0x10140000000000ull, 0x80840000000000ull, PM_ISU2, 0 },
+       [PM_LSU0] = { 0x01400000000000ull, 0x08800000000000ull, PM_LSU0, 0 },
+       [PM_LSU1] = { 0x00000000000000ull, 0x00010000000000ull, PM_LSU1, 40 },
+       [PM_GPS]  = { 0x00000000000000ull, 0x00000000000000ull, PM_GPS, 0 }
+};
+
+static unsigned char direct_marked_event[8] = {
+       (1<<2) | (1<<3),        /* PMC1: PM_MRK_GRP_DISP, PM_MRK_ST_CMPL */
+       (1<<3) | (1<<5),        /* PMC2: PM_THRESH_TIMEO, PM_MRK_BRU_FIN */
+       (1<<3),                 /* PMC3: PM_MRK_ST_CMPL_INT */
+       (1<<4) | (1<<5),        /* PMC4: PM_MRK_GRP_CMPL, PM_MRK_CRU_FIN */
+       (1<<4) | (1<<5),        /* PMC5: PM_MRK_GRP_TIMEO */
+       (1<<3) | (1<<4) | (1<<5),
+               /* PMC6: PM_MRK_ST_GPS, PM_MRK_FXU_FIN, PM_MRK_GRP_ISSUED */
+       (1<<4) | (1<<5),        /* PMC7: PM_MRK_FPU_FIN, PM_MRK_INST_FIN */
+       (1<<4),                 /* PMC8: PM_MRK_LSU_FIN */
+};
+
+/*
+ * Returns 1 if event counts things relating to marked instructions
+ * and thus needs the MMCRA_SAMPLE_ENABLE bit set, or 0 if not.
+ */
+static int p4_marked_instr_event(u64 event)
+{
+       int pmc, psel, unit, byte, bit;
+       unsigned int mask;
+
+       pmc = (event >> PM_PMC_SH) & PM_PMC_MSK;
+       psel = event & PM_PMCSEL_MSK;
+       if (pmc) {
+               if (direct_marked_event[pmc - 1] & (1 << psel))
+                       return 1;
+               if (psel == 0)          /* add events */
+                       bit = (pmc <= 4)? pmc - 1: 8 - pmc;
+               else if (psel == 6)     /* decode events */
+                       bit = 4;
+               else
+                       return 0;
+       } else
+               bit = psel;
+
+       byte = (event >> PM_BYTE_SH) & PM_BYTE_MSK;
+       unit = (event >> PM_UNIT_SH) & PM_UNIT_MSK;
+       mask = 0;
+       switch (unit) {
+       case PM_LSU1:
+               if (event & PM_LOWER_MSKS)
+                       mask = 1 << 28;         /* byte 7 bit 4 */
+               else
+                       mask = 6 << 24;         /* byte 3 bits 1 and 2 */
+               break;
+       case PM_LSU0:
+               /* byte 3, bit 3; byte 2 bits 0,2,3,4,5; byte 1 */
+               mask = 0x083dff00;
+       }
+       return (mask >> (byte * 8 + bit)) & 1;
+}
+
+static int p4_get_constraint(u64 event, u64 *maskp, u64 *valp)
+{
+       int pmc, byte, unit, lower, sh;
+       u64 mask = 0, value = 0;
+       int grp = -1;
+
+       pmc = (event >> PM_PMC_SH) & PM_PMC_MSK;
+       if (pmc) {
+               if (pmc > 8)
+                       return -1;
+               sh = (pmc - 1) * 2;
+               mask |= 2 << sh;
+               value |= 1 << sh;
+               grp = ((pmc - 1) >> 1) & 1;
+       }
+       unit = (event >> PM_UNIT_SH) & PM_UNIT_MSK;
+       byte = (event >> PM_BYTE_SH) & PM_BYTE_MSK;
+       if (unit) {
+               lower = (event >> PM_LOWER_SH) & PM_LOWER_MSK;
+
+               /*
+                * Bus events on bytes 0 and 2 can be counted
+                * on PMC1/2/5/6; bytes 1 and 3 on PMC3/4/7/8.
+                */
+               if (!pmc)
+                       grp = byte & 1;
+
+               if (!p4_unitinfo[unit].unit)
+                       return -1;
+               mask  |= p4_unitinfo[unit].mask;
+               value |= p4_unitinfo[unit].value;
+               sh = p4_unitinfo[unit].lowerbit;
+               if (sh > 1)
+                       value |= (u64)lower << sh;
+               else if (lower != sh)
+                       return -1;
+               unit = p4_unitinfo[unit].unit;
+
+               /* Set byte lane select field */
+               mask  |= 0xfULL << (28 - 4 * byte);
+               value |= (u64)unit << (28 - 4 * byte);
+       }
+       if (grp == 0) {
+               /* increment PMC1/2/5/6 field */
+               mask  |= 0x8000000000ull;
+               value |= 0x1000000000ull;
+       } else {
+               /* increment PMC3/4/7/8 field */
+               mask  |= 0x800000000ull;
+               value |= 0x100000000ull;
+       }
+
+       /* Marked instruction events need sample_enable set */
+       if (p4_marked_instr_event(event)) {
+               mask  |= 1ull << 56;
+               value |= 1ull << 56;
+       }
+
+       /* PMCSEL=6 decode events on byte 2 need sample_enable clear */
+       if (pmc && (event & PM_PMCSEL_MSK) == 6 && byte == 2)
+               mask  |= 1ull << 56;
+
+       *maskp = mask;
+       *valp = value;
+       return 0;
+}
+
+static unsigned int ppc_inst_cmpl[] = {
+       0x1001, 0x4001, 0x6001, 0x7001, 0x8001
+};
+
+static int p4_get_alternatives(u64 event, unsigned int flags, u64 alt[])
+{
+       int i, j, na;
+
+       alt[0] = event;
+       na = 1;
+
+       /* 2 possibilities for PM_GRP_DISP_REJECT */
+       if (event == 0x8003 || event == 0x0224) {
+               alt[1] = event ^ (0x8003 ^ 0x0224);
+               return 2;
+       }
+
+       /* 2 possibilities for PM_ST_MISS_L1 */
+       if (event == 0x0c13 || event == 0x0c23) {
+               alt[1] = event ^ (0x0c13 ^ 0x0c23);
+               return 2;
+       }
+
+       /* several possibilities for PM_INST_CMPL */
+       for (i = 0; i < ARRAY_SIZE(ppc_inst_cmpl); ++i) {
+               if (event == ppc_inst_cmpl[i]) {
+                       for (j = 0; j < ARRAY_SIZE(ppc_inst_cmpl); ++j)
+                               if (j != i)
+                                       alt[na++] = ppc_inst_cmpl[j];
+                       break;
+               }
+       }
+
+       return na;
+}
+
+static int p4_compute_mmcr(u64 event[], int n_ev,
+                          unsigned int hwc[], u64 mmcr[])
+{
+       u64 mmcr0 = 0, mmcr1 = 0, mmcra = 0;
+       unsigned int pmc, unit, byte, psel, lower;
+       unsigned int ttm, grp;
+       unsigned int pmc_inuse = 0;
+       unsigned int pmc_grp_use[2];
+       unsigned char busbyte[4];
+       unsigned char unituse[16];
+       unsigned int unitlower = 0;
+       int i;
+
+       if (n_ev > 8)
+               return -1;
+
+       /* First pass to count resource use */
+       pmc_grp_use[0] = pmc_grp_use[1] = 0;
+       memset(busbyte, 0, sizeof(busbyte));
+       memset(unituse, 0, sizeof(unituse));
+       for (i = 0; i < n_ev; ++i) {
+               pmc = (event[i] >> PM_PMC_SH) & PM_PMC_MSK;
+               if (pmc) {
+                       if (pmc_inuse & (1 << (pmc - 1)))
+                               return -1;
+                       pmc_inuse |= 1 << (pmc - 1);
+                       /* count 1/2/5/6 vs 3/4/7/8 use */
+                       ++pmc_grp_use[((pmc - 1) >> 1) & 1];
+               }
+               unit = (event[i] >> PM_UNIT_SH) & PM_UNIT_MSK;
+               byte = (event[i] >> PM_BYTE_SH) & PM_BYTE_MSK;
+               lower = (event[i] >> PM_LOWER_SH) & PM_LOWER_MSK;
+               if (unit) {
+                       if (!pmc)
+                               ++pmc_grp_use[byte & 1];
+                       if (unit == 6 || unit == 8)
+                               /* map alt ISU1/IFU codes: 6->2, 8->3 */
+                               unit = (unit >> 1) - 1;
+                       if (busbyte[byte] && busbyte[byte] != unit)
+                               return -1;
+                       busbyte[byte] = unit;
+                       lower <<= unit;
+                       if (unituse[unit] && lower != (unitlower & lower))
+                               return -1;
+                       unituse[unit] = 1;
+                       unitlower |= lower;
+               }
+       }
+       if (pmc_grp_use[0] > 4 || pmc_grp_use[1] > 4)
+               return -1;
+
+       /*
+        * Assign resources and set multiplexer selects.
+        *
+        * Units 1,2,3 are on TTM0, 4,6,7 on TTM1, 8,10 on TTM2.
+        * Each TTMx can only select one unit, but since
+        * units 2 and 6 are both ISU1, and 3 and 8 are both IFU,
+        * we have some choices.
+        */
+       if (unituse[2] & (unituse[1] | (unituse[3] & unituse[9]))) {
+               unituse[6] = 1;         /* Move 2 to 6 */
+               unituse[2] = 0;
+       }
+       if (unituse[3] & (unituse[1] | unituse[2])) {
+               unituse[8] = 1;         /* Move 3 to 8 */
+               unituse[3] = 0;
+               unitlower = (unitlower & ~8) | ((unitlower & 8) << 5);
+       }
+       /* Check only one unit per TTMx */
+       if (unituse[1] + unituse[2] + unituse[3] > 1 ||
+           unituse[4] + unituse[6] + unituse[7] > 1 ||
+           unituse[8] + unituse[9] > 1 ||
+           (unituse[5] | unituse[10] | unituse[11] |
+            unituse[13] | unituse[14]))
+               return -1;
+
+       /* Set TTMxSEL fields.  Note, units 1-3 => TTM0SEL codes 0-2 */
+       mmcr1 |= (u64)(unituse[3] * 2 + unituse[2]) << MMCR1_TTM0SEL_SH;
+       mmcr1 |= (u64)(unituse[7] * 3 + unituse[6] * 2) << MMCR1_TTM1SEL_SH;
+       mmcr1 |= (u64)unituse[9] << MMCR1_TTM2SEL_SH;
+
+       /* Set TTCxSEL fields. */
+       if (unitlower & 0xe)
+               mmcr1 |= 1ull << MMCR1_TTC0SEL_SH;
+       if (unitlower & 0xf0)
+               mmcr1 |= 1ull << MMCR1_TTC1SEL_SH;
+       if (unitlower & 0xf00)
+               mmcr1 |= 1ull << MMCR1_TTC2SEL_SH;
+       if (unitlower & 0x7000)
+               mmcr1 |= 1ull << MMCR1_TTC3SEL_SH;
+
+       /* Set byte lane select fields. */
+       for (byte = 0; byte < 4; ++byte) {
+               unit = busbyte[byte];
+               if (!unit)
+                       continue;
+               if (unit == 0xf) {
+                       /* special case for GPS */
+                       mmcr1 |= 1ull << (MMCR1_DEBUG0SEL_SH - byte);
+               } else {
+                       if (!unituse[unit])
+                               ttm = unit - 1;         /* 2->1, 3->2 */
+                       else
+                               ttm = unit >> 2;
+                       mmcr1 |= (u64)ttm << (MMCR1_TD_CP_DBG0SEL_SH - 2*byte);
+               }
+       }
+
+       /* Second pass: assign PMCs, set PMCxSEL and PMCx_ADDER_SEL fields */
+       for (i = 0; i < n_ev; ++i) {
+               pmc = (event[i] >> PM_PMC_SH) & PM_PMC_MSK;
+               unit = (event[i] >> PM_UNIT_SH) & PM_UNIT_MSK;
+               byte = (event[i] >> PM_BYTE_SH) & PM_BYTE_MSK;
+               psel = event[i] & PM_PMCSEL_MSK;
+               if (!pmc) {
+                       /* Bus event or 00xxx direct event (off or cycles) */
+                       if (unit)
+                               psel |= 0x10 | ((byte & 2) << 2);
+                       for (pmc = 0; pmc < 8; ++pmc) {
+                               if (pmc_inuse & (1 << pmc))
+                                       continue;
+                               grp = (pmc >> 1) & 1;
+                               if (unit) {
+                                       if (grp == (byte & 1))
+                                               break;
+                               } else if (pmc_grp_use[grp] < 4) {
+                                       ++pmc_grp_use[grp];
+                                       break;
+                               }
+                       }
+                       pmc_inuse |= 1 << pmc;
+               } else {
+                       /* Direct event */
+                       --pmc;
+                       if (psel == 0 && (byte & 2))
+                               /* add events on higher-numbered bus */
+                               mmcr1 |= 1ull << mmcr1_adder_bits[pmc];
+                       else if (psel == 6 && byte == 3)
+                               /* seem to need to set sample_enable here */
+                               mmcra |= MMCRA_SAMPLE_ENABLE;
+                       psel |= 8;
+               }
+               if (pmc <= 1)
+                       mmcr0 |= psel << (MMCR0_PMC1SEL_SH - 7 * pmc);
+               else
+                       mmcr1 |= psel << (MMCR1_PMC3SEL_SH - 5 * (pmc - 2));
+               if (pmc == 7)   /* PMC8 */
+                       mmcra |= (psel & 1) << MMCRA_PMC8SEL0_SH;
+               hwc[i] = pmc;
+               if (p4_marked_instr_event(event[i]))
+                       mmcra |= MMCRA_SAMPLE_ENABLE;
+       }
+
+       if (pmc_inuse & 1)
+               mmcr0 |= MMCR0_PMC1CE;
+       if (pmc_inuse & 0xfe)
+               mmcr0 |= MMCR0_PMCjCE;
+
+       mmcra |= 0x2000;        /* mark only one IOP per PPC instruction */
+
+       /* Return MMCRx values */
+       mmcr[0] = mmcr0;
+       mmcr[1] = mmcr1;
+       mmcr[2] = mmcra;
+       return 0;
+}
+
+static void p4_disable_pmc(unsigned int pmc, u64 mmcr[])
+{
+       /*
+        * Setting the PMCxSEL field to 0 disables PMC x.
+        * (Note that pmc is 0-based here, not 1-based.)
+        */
+       if (pmc <= 1) {
+               mmcr[0] &= ~(0x1fUL << (MMCR0_PMC1SEL_SH - 7 * pmc));
+       } else {
+               mmcr[1] &= ~(0x1fUL << (MMCR1_PMC3SEL_SH - 5 * (pmc - 2)));
+               if (pmc == 7)
+                       mmcr[2] &= ~(1UL << MMCRA_PMC8SEL0_SH);
+       }
+}
+
+static int p4_generic_events[] = {
+       [PERF_COUNT_HW_CPU_CYCLES]              = 7,
+       [PERF_COUNT_HW_INSTRUCTIONS]            = 0x1001,
+       [PERF_COUNT_HW_CACHE_REFERENCES]        = 0x8c10, /* PM_LD_REF_L1 */
+       [PERF_COUNT_HW_CACHE_MISSES]            = 0x3c10, /* PM_LD_MISS_L1 */
+       [PERF_COUNT_HW_BRANCH_INSTRUCTIONS]     = 0x330,  /* PM_BR_ISSUED */
+       [PERF_COUNT_HW_BRANCH_MISSES]           = 0x331,  /* PM_BR_MPRED_CR */
+};
+
+#define C(x)   PERF_COUNT_HW_CACHE_##x
+
+/*
+ * Table of generalized cache-related events.
+ * 0 means not supported, -1 means nonsensical, other values
+ * are event codes.
+ */
+static int power4_cache_events[C(MAX)][C(OP_MAX)][C(RESULT_MAX)] = {
+       [C(L1D)] = {            /*      RESULT_ACCESS   RESULT_MISS */
+               [C(OP_READ)] = {        0x8c10,         0x3c10  },
+               [C(OP_WRITE)] = {       0x7c10,         0xc13   },
+               [C(OP_PREFETCH)] = {    0xc35,          0       },
+       },
+       [C(L1I)] = {            /*      RESULT_ACCESS   RESULT_MISS */
+               [C(OP_READ)] = {        0,              0       },
+               [C(OP_WRITE)] = {       -1,             -1      },
+               [C(OP_PREFETCH)] = {    0,              0       },
+       },
+       [C(LL)] = {             /*      RESULT_ACCESS   RESULT_MISS */
+               [C(OP_READ)] = {        0,              0       },
+               [C(OP_WRITE)] = {       0,              0       },
+               [C(OP_PREFETCH)] = {    0xc34,          0       },
+       },
+       [C(DTLB)] = {           /*      RESULT_ACCESS   RESULT_MISS */
+               [C(OP_READ)] = {        0,              0x904   },
+               [C(OP_WRITE)] = {       -1,             -1      },
+               [C(OP_PREFETCH)] = {    -1,             -1      },
+       },
+       [C(ITLB)] = {           /*      RESULT_ACCESS   RESULT_MISS */
+               [C(OP_READ)] = {        0,              0x900   },
+               [C(OP_WRITE)] = {       -1,             -1      },
+               [C(OP_PREFETCH)] = {    -1,             -1      },
+       },
+       [C(BPU)] = {            /*      RESULT_ACCESS   RESULT_MISS */
+               [C(OP_READ)] = {        0x330,          0x331   },
+               [C(OP_WRITE)] = {       -1,             -1      },
+               [C(OP_PREFETCH)] = {    -1,             -1      },
+       },
+};
+
+struct power_pmu power4_pmu = {
+       .n_counter = 8,
+       .max_alternatives = 5,
+       .add_fields = 0x0000001100005555ull,
+       .test_adder = 0x0011083300000000ull,
+       .compute_mmcr = p4_compute_mmcr,
+       .get_constraint = p4_get_constraint,
+       .get_alternatives = p4_get_alternatives,
+       .disable_pmc = p4_disable_pmc,
+       .n_generic = ARRAY_SIZE(p4_generic_events),
+       .generic_events = p4_generic_events,
+       .cache_events = &power4_cache_events,
+};
diff --git a/arch/powerpc/kernel/power5+-pmu.c b/arch/powerpc/kernel/power5+-pmu.c

new file mode 100644 (file)

index 0000000..41e5d2d
--- /dev/null
+++ b/arch/powerpc/kernel/power5+-pmu.c
@@ -0,0 +1,671 @@
+/*
+ * Performance counter support for POWER5+/++ (not POWER5) processors.
+ *
+ * Copyright 2009 Paul Mackerras, IBM Corporation.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+#include <linux/kernel.h>
+#include <linux/perf_counter.h>
+#include <asm/reg.h>
+
+/*
+ * Bits in event code for POWER5+ (POWER5 GS) and POWER5++ (POWER5 GS DD3)
+ */
+#define PM_PMC_SH      20      /* PMC number (1-based) for direct events */
+#define PM_PMC_MSK     0xf
+#define PM_PMC_MSKS    (PM_PMC_MSK << PM_PMC_SH)
+#define PM_UNIT_SH     16      /* TTMMUX number and setting - unit select */
+#define PM_UNIT_MSK    0xf
+#define PM_BYTE_SH     12      /* Byte number of event bus to use */
+#define PM_BYTE_MSK    7
+#define PM_GRS_SH      8       /* Storage subsystem mux select */
+#define PM_GRS_MSK     7
+#define PM_BUSEVENT_MSK        0x80    /* Set if event uses event bus */
+#define PM_PMCSEL_MSK  0x7f
+
+/* Values in PM_UNIT field */
+#define PM_FPU         0
+#define PM_ISU0                1
+#define PM_IFU         2
+#define PM_ISU1                3
+#define PM_IDU         4
+#define PM_ISU0_ALT    6
+#define PM_GRS         7
+#define PM_LSU0                8
+#define PM_LSU1                0xc
+#define PM_LASTUNIT    0xc
+
+/*
+ * Bits in MMCR1 for POWER5+
+ */
+#define MMCR1_TTM0SEL_SH       62
+#define MMCR1_TTM1SEL_SH       60
+#define MMCR1_TTM2SEL_SH       58
+#define MMCR1_TTM3SEL_SH       56
+#define MMCR1_TTMSEL_MSK       3
+#define MMCR1_TD_CP_DBG0SEL_SH 54
+#define MMCR1_TD_CP_DBG1SEL_SH 52
+#define MMCR1_TD_CP_DBG2SEL_SH 50
+#define MMCR1_TD_CP_DBG3SEL_SH 48
+#define MMCR1_GRS_L2SEL_SH     46
+#define MMCR1_GRS_L2SEL_MSK    3
+#define MMCR1_GRS_L3SEL_SH     44
+#define MMCR1_GRS_L3SEL_MSK    3
+#define MMCR1_GRS_MCSEL_SH     41
+#define MMCR1_GRS_MCSEL_MSK    7
+#define MMCR1_GRS_FABSEL_SH    39
+#define MMCR1_GRS_FABSEL_MSK   3
+#define MMCR1_PMC1_ADDER_SEL_SH        35
+#define MMCR1_PMC2_ADDER_SEL_SH        34
+#define MMCR1_PMC3_ADDER_SEL_SH        33
+#define MMCR1_PMC4_ADDER_SEL_SH        32
+#define MMCR1_PMC1SEL_SH       25
+#define MMCR1_PMC2SEL_SH       17
+#define MMCR1_PMC3SEL_SH       9
+#define MMCR1_PMC4SEL_SH       1
+#define MMCR1_PMCSEL_SH(n)     (MMCR1_PMC1SEL_SH - (n) * 8)
+#define MMCR1_PMCSEL_MSK       0x7f
+
+/*
+ * Bits in MMCRA
+ */
+
+/*
+ * Layout of constraint bits:
+ * 6666555555555544444444443333333333222222222211111111110000000000
+ * 3210987654321098765432109876543210987654321098765432109876543210
+ *             [  ><><>< ><> <><>[  >  <  ><  ><  ><  ><><><><><><>
+ *             NC  G0G1G2 G3 T0T1 UC    B0  B1  B2  B3 P6P5P4P3P2P1
+ *
+ * NC - number of counters
+ *     51: NC error 0x0008_0000_0000_0000
+ *     48-50: number of events needing PMC1-4 0x0007_0000_0000_0000
+ *
+ * G0..G3 - GRS mux constraints
+ *     46-47: GRS_L2SEL value
+ *     44-45: GRS_L3SEL value
+ *     41-44: GRS_MCSEL value
+ *     39-40: GRS_FABSEL value
+ *     Note that these match up with their bit positions in MMCR1
+ *
+ * T0 - TTM0 constraint
+ *     36-37: TTM0SEL value (0=FPU, 2=IFU, 3=ISU1) 0x30_0000_0000
+ *
+ * T1 - TTM1 constraint
+ *     34-35: TTM1SEL value (0=IDU, 3=GRS) 0x0c_0000_0000
+ *
+ * UC - unit constraint: can't have all three of FPU|IFU|ISU1, ISU0, IDU|GRS
+ *     33: UC3 error 0x02_0000_0000
+ *     32: FPU|IFU|ISU1 events needed 0x01_0000_0000
+ *     31: ISU0 events needed 0x01_8000_0000
+ *     30: IDU|GRS events needed 0x00_4000_0000
+ *
+ * B0
+ *     24-27: Byte 0 event source 0x0f00_0000
+ *           Encoding as for the event code
+ *
+ * B1, B2, B3
+ *     20-23, 16-19, 12-15: Byte 1, 2, 3 event sources
+ *
+ * P6
+ *     11: P6 error 0x800
+ *     10-11: Count of events needing PMC6
+ *
+ * P1..P5
+ *     0-9: Count of events needing PMC1..PMC5
+ */
+
+static const int grsel_shift[8] = {
+       MMCR1_GRS_L2SEL_SH, MMCR1_GRS_L2SEL_SH, MMCR1_GRS_L2SEL_SH,
+       MMCR1_GRS_L3SEL_SH, MMCR1_GRS_L3SEL_SH, MMCR1_GRS_L3SEL_SH,
+       MMCR1_GRS_MCSEL_SH, MMCR1_GRS_FABSEL_SH
+};
+
+/* Masks and values for using events from the various units */
+static u64 unit_cons[PM_LASTUNIT+1][2] = {
+       [PM_FPU] =   { 0x3200000000ull, 0x0100000000ull },
+       [PM_ISU0] =  { 0x0200000000ull, 0x0080000000ull },
+       [PM_ISU1] =  { 0x3200000000ull, 0x3100000000ull },
+       [PM_IFU] =   { 0x3200000000ull, 0x2100000000ull },
+       [PM_IDU] =   { 0x0e00000000ull, 0x0040000000ull },
+       [PM_GRS] =   { 0x0e00000000ull, 0x0c40000000ull },
+};
+
+static int power5p_get_constraint(u64 event, u64 *maskp, u64 *valp)
+{
+       int pmc, byte, unit, sh;
+       int bit, fmask;
+       u64 mask = 0, value = 0;
+
+       pmc = (event >> PM_PMC_SH) & PM_PMC_MSK;
+       if (pmc) {
+               if (pmc > 6)
+                       return -1;
+               sh = (pmc - 1) * 2;
+               mask |= 2 << sh;
+               value |= 1 << sh;
+               if (pmc >= 5 && !(event == 0x500009 || event == 0x600005))
+                       return -1;
+       }
+       if (event & PM_BUSEVENT_MSK) {
+               unit = (event >> PM_UNIT_SH) & PM_UNIT_MSK;
+               if (unit > PM_LASTUNIT)
+                       return -1;
+               if (unit == PM_ISU0_ALT)
+                       unit = PM_ISU0;
+               mask |= unit_cons[unit][0];
+               value |= unit_cons[unit][1];
+               byte = (event >> PM_BYTE_SH) & PM_BYTE_MSK;
+               if (byte >= 4) {
+                       if (unit != PM_LSU1)
+                               return -1;
+                       /* Map LSU1 low word (bytes 4-7) to unit LSU1+1 */
+                       ++unit;
+                       byte &= 3;
+               }
+               if (unit == PM_GRS) {
+                       bit = event & 7;
+                       fmask = (bit == 6)? 7: 3;
+                       sh = grsel_shift[bit];
+                       mask |= (u64)fmask << sh;
+                       value |= (u64)((event >> PM_GRS_SH) & fmask) << sh;
+               }
+               /* Set byte lane select field */
+               mask  |= 0xfULL << (24 - 4 * byte);
+               value |= (u64)unit << (24 - 4 * byte);
+       }
+       if (pmc < 5) {
+               /* need a counter from PMC1-4 set */
+               mask  |= 0x8000000000000ull;
+               value |= 0x1000000000000ull;
+       }
+       *maskp = mask;
+       *valp = value;
+       return 0;
+}
+
+static int power5p_limited_pmc_event(u64 event)
+{
+       int pmc = (event >> PM_PMC_SH) & PM_PMC_MSK;
+
+       return pmc == 5 || pmc == 6;
+}
+
+#define MAX_ALT        3       /* at most 3 alternatives for any event */
+
+static const unsigned int event_alternatives[][MAX_ALT] = {
+       { 0x100c0,  0x40001f },                 /* PM_GCT_FULL_CYC */
+       { 0x120e4,  0x400002 },                 /* PM_GRP_DISP_REJECT */
+       { 0x230e2,  0x323087 },                 /* PM_BR_PRED_CR */
+       { 0x230e3,  0x223087, 0x3230a0 },       /* PM_BR_PRED_TA */
+       { 0x410c7,  0x441084 },                 /* PM_THRD_L2MISS_BOTH_CYC */
+       { 0x800c4,  0xc20e0 },                  /* PM_DTLB_MISS */
+       { 0xc50c6,  0xc60e0 },                  /* PM_MRK_DTLB_MISS */
+       { 0x100005, 0x600005 },                 /* PM_RUN_CYC */
+       { 0x100009, 0x200009 },                 /* PM_INST_CMPL */
+       { 0x200015, 0x300015 },                 /* PM_LSU_LMQ_SRQ_EMPTY_CYC */
+       { 0x300009, 0x400009 },                 /* PM_INST_DISP */
+};
+
+/*
+ * Scan the alternatives table for a match and return the
+ * index into the alternatives table if found, else -1.
+ */
+static int find_alternative(unsigned int event)
+{
+       int i, j;
+
+       for (i = 0; i < ARRAY_SIZE(event_alternatives); ++i) {
+               if (event < event_alternatives[i][0])
+                       break;
+               for (j = 0; j < MAX_ALT && event_alternatives[i][j]; ++j)
+                       if (event == event_alternatives[i][j])
+                               return i;
+       }
+       return -1;
+}
+
+static const unsigned char bytedecode_alternatives[4][4] = {
+       /* PMC 1 */     { 0x21, 0x23, 0x25, 0x27 },
+       /* PMC 2 */     { 0x07, 0x17, 0x0e, 0x1e },
+       /* PMC 3 */     { 0x20, 0x22, 0x24, 0x26 },
+       /* PMC 4 */     { 0x07, 0x17, 0x0e, 0x1e }
+};
+
+/*
+ * Some direct events for decodes of event bus byte 3 have alternative
+ * PMCSEL values on other counters.  This returns the alternative
+ * event code for those that do, or -1 otherwise.  This also handles
+ * alternative PCMSEL values for add events.
+ */
+static s64 find_alternative_bdecode(u64 event)
+{
+       int pmc, altpmc, pp, j;
+
+       pmc = (event >> PM_PMC_SH) & PM_PMC_MSK;
+       if (pmc == 0 || pmc > 4)
+               return -1;
+       altpmc = 5 - pmc;       /* 1 <-> 4, 2 <-> 3 */
+       pp = event & PM_PMCSEL_MSK;
+       for (j = 0; j < 4; ++j) {
+               if (bytedecode_alternatives[pmc - 1][j] == pp) {
+                       return (event & ~(PM_PMC_MSKS | PM_PMCSEL_MSK)) |
+                               (altpmc << PM_PMC_SH) |
+                               bytedecode_alternatives[altpmc - 1][j];
+               }
+       }
+
+       /* new decode alternatives for power5+ */
+       if (pmc == 1 && (pp == 0x0d || pp == 0x0e))
+               return event + (2 << PM_PMC_SH) + (0x2e - 0x0d);
+       if (pmc == 3 && (pp == 0x2e || pp == 0x2f))
+               return event - (2 << PM_PMC_SH) - (0x2e - 0x0d);
+
+       /* alternative add event encodings */
+       if (pp == 0x10 || pp == 0x28)
+               return ((event ^ (0x10 ^ 0x28)) & ~PM_PMC_MSKS) |
+                       (altpmc << PM_PMC_SH);
+
+       return -1;
+}
+
+static int power5p_get_alternatives(u64 event, unsigned int flags, u64 alt[])
+{
+       int i, j, nalt = 1;
+       int nlim;
+       s64 ae;
+
+       alt[0] = event;
+       nalt = 1;
+       nlim = power5p_limited_pmc_event(event);
+       i = find_alternative(event);
+       if (i >= 0) {
+               for (j = 0; j < MAX_ALT; ++j) {
+                       ae = event_alternatives[i][j];
+                       if (ae && ae != event)
+                               alt[nalt++] = ae;
+                       nlim += power5p_limited_pmc_event(ae);
+               }
+       } else {
+               ae = find_alternative_bdecode(event);
+               if (ae > 0)
+                       alt[nalt++] = ae;
+       }
+
+       if (flags & PPMU_ONLY_COUNT_RUN) {
+               /*
+                * We're only counting in RUN state,
+                * so PM_CYC is equivalent to PM_RUN_CYC
+                * and PM_INST_CMPL === PM_RUN_INST_CMPL.
+                * This doesn't include alternatives that don't provide
+                * any extra flexibility in assigning PMCs (e.g.
+                * 0x100005 for PM_RUN_CYC vs. 0xf for PM_CYC).
+                * Note that even with these additional alternatives
+                * we never end up with more than 3 alternatives for any event.
+                */
+               j = nalt;
+               for (i = 0; i < nalt; ++i) {
+                       switch (alt[i]) {
+                       case 0xf:       /* PM_CYC */
+                               alt[j++] = 0x600005;    /* PM_RUN_CYC */
+                               ++nlim;
+                               break;
+                       case 0x600005:  /* PM_RUN_CYC */
+                               alt[j++] = 0xf;
+                               break;
+                       case 0x100009:  /* PM_INST_CMPL */
+                               alt[j++] = 0x500009;    /* PM_RUN_INST_CMPL */
+                               ++nlim;
+                               break;
+                       case 0x500009:  /* PM_RUN_INST_CMPL */
+                               alt[j++] = 0x100009;    /* PM_INST_CMPL */
+                               alt[j++] = 0x200009;
+                               break;
+                       }
+               }
+               nalt = j;
+       }
+
+       if (!(flags & PPMU_LIMITED_PMC_OK) && nlim) {
+               /* remove the limited PMC events */
+               j = 0;
+               for (i = 0; i < nalt; ++i) {
+                       if (!power5p_limited_pmc_event(alt[i])) {
+                               alt[j] = alt[i];
+                               ++j;
+                       }
+               }
+               nalt = j;
+       } else if ((flags & PPMU_LIMITED_PMC_REQD) && nlim < nalt) {
+               /* remove all but the limited PMC events */
+               j = 0;
+               for (i = 0; i < nalt; ++i) {
+                       if (power5p_limited_pmc_event(alt[i])) {
+                               alt[j] = alt[i];
+                               ++j;
+                       }
+               }
+               nalt = j;
+       }
+
+       return nalt;
+}
+
+/*
+ * Map of which direct events on which PMCs are marked instruction events.
+ * Indexed by PMCSEL value, bit i (LE) set if PMC i is a marked event.
+ * Bit 0 is set if it is marked for all PMCs.
+ * The 0x80 bit indicates a byte decode PMCSEL value.
+ */
+static unsigned char direct_event_is_marked[0x28] = {
+       0,      /* 00 */
+       0x1f,   /* 01 PM_IOPS_CMPL */
+       0x2,    /* 02 PM_MRK_GRP_DISP */
+       0xe,    /* 03 PM_MRK_ST_CMPL, PM_MRK_ST_GPS, PM_MRK_ST_CMPL_INT */
+       0,      /* 04 */
+       0x1c,   /* 05 PM_MRK_BRU_FIN, PM_MRK_INST_FIN, PM_MRK_CRU_FIN */
+       0x80,   /* 06 */
+       0x80,   /* 07 */
+       0, 0, 0,/* 08 - 0a */
+       0x18,   /* 0b PM_THRESH_TIMEO, PM_MRK_GRP_TIMEO */
+       0,      /* 0c */
+       0x80,   /* 0d */
+       0x80,   /* 0e */
+       0,      /* 0f */
+       0,      /* 10 */
+       0x14,   /* 11 PM_MRK_GRP_BR_REDIR, PM_MRK_GRP_IC_MISS */
+       0,      /* 12 */
+       0x10,   /* 13 PM_MRK_GRP_CMPL */
+       0x1f,   /* 14 PM_GRP_MRK, PM_MRK_{FXU,FPU,LSU}_FIN */
+       0x2,    /* 15 PM_MRK_GRP_ISSUED */
+       0x80,   /* 16 */
+       0x80,   /* 17 */
+       0, 0, 0, 0, 0,
+       0x80,   /* 1d */
+       0x80,   /* 1e */
+       0,      /* 1f */
+       0x80,   /* 20 */
+       0x80,   /* 21 */
+       0x80,   /* 22 */
+       0x80,   /* 23 */
+       0x80,   /* 24 */
+       0x80,   /* 25 */
+       0x80,   /* 26 */
+       0x80,   /* 27 */
+};
+
+/*
+ * Returns 1 if event counts things relating to marked instructions
+ * and thus needs the MMCRA_SAMPLE_ENABLE bit set, or 0 if not.
+ */
+static int power5p_marked_instr_event(u64 event)
+{
+       int pmc, psel;
+       int bit, byte, unit;
+       u32 mask;
+
+       pmc = (event >> PM_PMC_SH) & PM_PMC_MSK;
+       psel = event & PM_PMCSEL_MSK;
+       if (pmc >= 5)
+               return 0;
+
+       bit = -1;
+       if (psel < sizeof(direct_event_is_marked)) {
+               if (direct_event_is_marked[psel] & (1 << pmc))
+                       return 1;
+               if (direct_event_is_marked[psel] & 0x80)
+                       bit = 4;
+               else if (psel == 0x08)
+                       bit = pmc - 1;
+               else if (psel == 0x10)
+                       bit = 4 - pmc;
+               else if (psel == 0x1b && (pmc == 1 || pmc == 3))
+                       bit = 4;
+       } else if ((psel & 0x48) == 0x40) {
+               bit = psel & 7;
+       } else if (psel == 0x28) {
+               bit = pmc - 1;
+       } else if (pmc == 3 && (psel == 0x2e || psel == 0x2f)) {
+               bit = 4;
+       }
+
+       if (!(event & PM_BUSEVENT_MSK) || bit == -1)
+               return 0;
+
+       byte = (event >> PM_BYTE_SH) & PM_BYTE_MSK;
+       unit = (event >> PM_UNIT_SH) & PM_UNIT_MSK;
+       if (unit == PM_LSU0) {
+               /* byte 1 bits 0-7, byte 2 bits 0,2-4,6 */
+               mask = 0x5dff00;
+       } else if (unit == PM_LSU1 && byte >= 4) {
+               byte -= 4;
+               /* byte 5 bits 6-7, byte 6 bits 0,4, byte 7 bits 0-4,6 */
+               mask = 0x5f11c000;
+       } else
+               return 0;
+
+       return (mask >> (byte * 8 + bit)) & 1;
+}
+
+static int power5p_compute_mmcr(u64 event[], int n_ev,
+                               unsigned int hwc[], u64 mmcr[])
+{
+       u64 mmcr1 = 0;
+       u64 mmcra = 0;
+       unsigned int pmc, unit, byte, psel;
+       unsigned int ttm;
+       int i, isbus, bit, grsel;
+       unsigned int pmc_inuse = 0;
+       unsigned char busbyte[4];
+       unsigned char unituse[16];
+       int ttmuse;
+
+       if (n_ev > 6)
+               return -1;
+
+       /* First pass to count resource use */
+       memset(busbyte, 0, sizeof(busbyte));
+       memset(unituse, 0, sizeof(unituse));
+       for (i = 0; i < n_ev; ++i) {
+               pmc = (event[i] >> PM_PMC_SH) & PM_PMC_MSK;
+               if (pmc) {
+                       if (pmc > 6)
+                               return -1;
+                       if (pmc_inuse & (1 << (pmc - 1)))
+                               return -1;
+                       pmc_inuse |= 1 << (pmc - 1);
+               }
+               if (event[i] & PM_BUSEVENT_MSK) {
+                       unit = (event[i] >> PM_UNIT_SH) & PM_UNIT_MSK;
+                       byte = (event[i] >> PM_BYTE_SH) & PM_BYTE_MSK;
+                       if (unit > PM_LASTUNIT)
+                               return -1;
+                       if (unit == PM_ISU0_ALT)
+                               unit = PM_ISU0;
+                       if (byte >= 4) {
+                               if (unit != PM_LSU1)
+                                       return -1;
+                               ++unit;
+                               byte &= 3;
+                       }
+                       if (busbyte[byte] && busbyte[byte] != unit)
+                               return -1;
+                       busbyte[byte] = unit;
+                       unituse[unit] = 1;
+               }
+       }
+
+       /*
+        * Assign resources and set multiplexer selects.
+        *
+        * PM_ISU0 can go either on TTM0 or TTM1, but that's the only
+        * choice we have to deal with.
+        */
+       if (unituse[PM_ISU0] &
+           (unituse[PM_FPU] | unituse[PM_IFU] | unituse[PM_ISU1])) {
+               unituse[PM_ISU0_ALT] = 1;       /* move ISU to TTM1 */
+               unituse[PM_ISU0] = 0;
+       }
+       /* Set TTM[01]SEL fields. */
+       ttmuse = 0;
+       for (i = PM_FPU; i <= PM_ISU1; ++i) {
+               if (!unituse[i])
+                       continue;
+               if (ttmuse++)
+                       return -1;
+               mmcr1 |= (u64)i << MMCR1_TTM0SEL_SH;
+       }
+       ttmuse = 0;
+       for (; i <= PM_GRS; ++i) {
+               if (!unituse[i])
+                       continue;
+               if (ttmuse++)
+                       return -1;
+               mmcr1 |= (u64)(i & 3) << MMCR1_TTM1SEL_SH;
+       }
+       if (ttmuse > 1)
+               return -1;
+
+       /* Set byte lane select fields, TTM[23]SEL and GRS_*SEL. */
+       for (byte = 0; byte < 4; ++byte) {
+               unit = busbyte[byte];
+               if (!unit)
+                       continue;
+               if (unit == PM_ISU0 && unituse[PM_ISU0_ALT]) {
+                       /* get ISU0 through TTM1 rather than TTM0 */
+                       unit = PM_ISU0_ALT;
+               } else if (unit == PM_LSU1 + 1) {
+                       /* select lower word of LSU1 for this byte */
+                       mmcr1 |= 1ull << (MMCR1_TTM3SEL_SH + 3 - byte);
+               }
+               ttm = unit >> 2;
+               mmcr1 |= (u64)ttm << (MMCR1_TD_CP_DBG0SEL_SH - 2 * byte);
+       }
+
+       /* Second pass: assign PMCs, set PMCxSEL and PMCx_ADDER_SEL fields */
+       for (i = 0; i < n_ev; ++i) {
+               pmc = (event[i] >> PM_PMC_SH) & PM_PMC_MSK;
+               unit = (event[i] >> PM_UNIT_SH) & PM_UNIT_MSK;
+               byte = (event[i] >> PM_BYTE_SH) & PM_BYTE_MSK;
+               psel = event[i] & PM_PMCSEL_MSK;
+               isbus = event[i] & PM_BUSEVENT_MSK;
+               if (!pmc) {
+                       /* Bus event or any-PMC direct event */
+                       for (pmc = 0; pmc < 4; ++pmc) {
+                               if (!(pmc_inuse & (1 << pmc)))
+                                       break;
+                       }
+                       if (pmc >= 4)
+                               return -1;
+                       pmc_inuse |= 1 << pmc;
+               } else if (pmc <= 4) {
+                       /* Direct event */
+                       --pmc;
+                       if (isbus && (byte & 2) &&
+                           (psel == 8 || psel == 0x10 || psel == 0x28))
+                               /* add events on higher-numbered bus */
+                               mmcr1 |= 1ull << (MMCR1_PMC1_ADDER_SEL_SH - pmc);
+               } else {
+                       /* Instructions or run cycles on PMC5/6 */
+                       --pmc;
+               }
+               if (isbus && unit == PM_GRS) {
+                       bit = psel & 7;
+                       grsel = (event[i] >> PM_GRS_SH) & PM_GRS_MSK;
+                       mmcr1 |= (u64)grsel << grsel_shift[bit];
+               }
+               if (power5p_marked_instr_event(event[i]))
+                       mmcra |= MMCRA_SAMPLE_ENABLE;
+               if ((psel & 0x58) == 0x40 && (byte & 1) != ((pmc >> 1) & 1))
+                       /* select alternate byte lane */
+                       psel |= 0x10;
+               if (pmc <= 3)
+                       mmcr1 |= psel << MMCR1_PMCSEL_SH(pmc);
+               hwc[i] = pmc;
+       }
+
+       /* Return MMCRx values */
+       mmcr[0] = 0;
+       if (pmc_inuse & 1)
+               mmcr[0] = MMCR0_PMC1CE;
+       if (pmc_inuse & 0x3e)
+               mmcr[0] |= MMCR0_PMCjCE;
+       mmcr[1] = mmcr1;
+       mmcr[2] = mmcra;
+       return 0;
+}
+
+static void power5p_disable_pmc(unsigned int pmc, u64 mmcr[])
+{
+       if (pmc <= 3)
+               mmcr[1] &= ~(0x7fUL << MMCR1_PMCSEL_SH(pmc));
+}
+
+static int power5p_generic_events[] = {
+       [PERF_COUNT_HW_CPU_CYCLES]              = 0xf,
+       [PERF_COUNT_HW_INSTRUCTIONS]            = 0x100009,
+       [PERF_COUNT_HW_CACHE_REFERENCES]        = 0x1c10a8, /* LD_REF_L1 */
+       [PERF_COUNT_HW_CACHE_MISSES]            = 0x3c1088, /* LD_MISS_L1 */
+       [PERF_COUNT_HW_BRANCH_INSTRUCTIONS]     = 0x230e4,  /* BR_ISSUED */
+       [PERF_COUNT_HW_BRANCH_MISSES]           = 0x230e5,  /* BR_MPRED_CR */
+};
+
+#define C(x)   PERF_COUNT_HW_CACHE_##x
+
+/*
+ * Table of generalized cache-related events.
+ * 0 means not supported, -1 means nonsensical, other values
+ * are event codes.
+ */
+static int power5p_cache_events[C(MAX)][C(OP_MAX)][C(RESULT_MAX)] = {
+       [C(L1D)] = {            /*      RESULT_ACCESS   RESULT_MISS */
+               [C(OP_READ)] = {        0x1c10a8,       0x3c1088        },
+               [C(OP_WRITE)] = {       0x2c10a8,       0xc10c3         },
+               [C(OP_PREFETCH)] = {    0xc70e7,        -1              },
+       },
+       [C(L1I)] = {            /*      RESULT_ACCESS   RESULT_MISS */
+               [C(OP_READ)] = {        0,              0               },
+               [C(OP_WRITE)] = {       -1,             -1              },
+               [C(OP_PREFETCH)] = {    0,              0               },
+       },
+       [C(LL)] = {             /*      RESULT_ACCESS   RESULT_MISS */
+               [C(OP_READ)] = {        0,              0               },
+               [C(OP_WRITE)] = {       0,              0               },
+               [C(OP_PREFETCH)] = {    0xc50c3,        0               },
+       },
+       [C(DTLB)] = {           /*      RESULT_ACCESS   RESULT_MISS */
+               [C(OP_READ)] = {        0xc20e4,        0x800c4         },
+               [C(OP_WRITE)] = {       -1,             -1              },
+               [C(OP_PREFETCH)] = {    -1,             -1              },
+       },
+       [C(ITLB)] = {           /*      RESULT_ACCESS   RESULT_MISS */
+               [C(OP_READ)] = {        0,              0x800c0         },
+               [C(OP_WRITE)] = {       -1,             -1              },
+               [C(OP_PREFETCH)] = {    -1,             -1              },
+       },
+       [C(BPU)] = {            /*      RESULT_ACCESS   RESULT_MISS */
+               [C(OP_READ)] = {        0x230e4,        0x230e5         },
+               [C(OP_WRITE)] = {       -1,             -1              },
+               [C(OP_PREFETCH)] = {    -1,             -1              },
+       },
+};
+
+struct power_pmu power5p_pmu = {
+       .n_counter = 6,
+       .max_alternatives = MAX_ALT,
+       .add_fields = 0x7000000000055ull,
+       .test_adder = 0x3000040000000ull,
+       .compute_mmcr = power5p_compute_mmcr,
+       .get_constraint = power5p_get_constraint,
+       .get_alternatives = power5p_get_alternatives,
+       .disable_pmc = power5p_disable_pmc,
+       .limited_pmc_event = power5p_limited_pmc_event,
+       .flags = PPMU_LIMITED_PMC5_6,
+       .n_generic = ARRAY_SIZE(power5p_generic_events),
+       .generic_events = power5p_generic_events,
+       .cache_events = &power5p_cache_events,
+};
diff --git a/arch/powerpc/kernel/power5-pmu.c b/arch/powerpc/kernel/power5-pmu.c

new file mode 100644 (file)

index 0000000..05600b6
--- /dev/null
+++ b/arch/powerpc/kernel/power5-pmu.c
@@ -0,0 +1,611 @@
+/*
+ * Performance counter support for POWER5 (not POWER5++) processors.
+ *
+ * Copyright 2009 Paul Mackerras, IBM Corporation.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+#include <linux/kernel.h>
+#include <linux/perf_counter.h>
+#include <asm/reg.h>
+
+/*
+ * Bits in event code for POWER5 (not POWER5++)
+ */
+#define PM_PMC_SH      20      /* PMC number (1-based) for direct events */
+#define PM_PMC_MSK     0xf
+#define PM_PMC_MSKS    (PM_PMC_MSK << PM_PMC_SH)
+#define PM_UNIT_SH     16      /* TTMMUX number and setting - unit select */
+#define PM_UNIT_MSK    0xf
+#define PM_BYTE_SH     12      /* Byte number of event bus to use */
+#define PM_BYTE_MSK    7
+#define PM_GRS_SH      8       /* Storage subsystem mux select */
+#define PM_GRS_MSK     7
+#define PM_BUSEVENT_MSK        0x80    /* Set if event uses event bus */
+#define PM_PMCSEL_MSK  0x7f
+
+/* Values in PM_UNIT field */
+#define PM_FPU         0
+#define PM_ISU0                1
+#define PM_IFU         2
+#define PM_ISU1                3
+#define PM_IDU         4
+#define PM_ISU0_ALT    6
+#define PM_GRS         7
+#define PM_LSU0                8
+#define PM_LSU1                0xc
+#define PM_LASTUNIT    0xc
+
+/*
+ * Bits in MMCR1 for POWER5
+ */
+#define MMCR1_TTM0SEL_SH       62
+#define MMCR1_TTM1SEL_SH       60
+#define MMCR1_TTM2SEL_SH       58
+#define MMCR1_TTM3SEL_SH       56
+#define MMCR1_TTMSEL_MSK       3
+#define MMCR1_TD_CP_DBG0SEL_SH 54
+#define MMCR1_TD_CP_DBG1SEL_SH 52
+#define MMCR1_TD_CP_DBG2SEL_SH 50
+#define MMCR1_TD_CP_DBG3SEL_SH 48
+#define MMCR1_GRS_L2SEL_SH     46
+#define MMCR1_GRS_L2SEL_MSK    3
+#define MMCR1_GRS_L3SEL_SH     44
+#define MMCR1_GRS_L3SEL_MSK    3
+#define MMCR1_GRS_MCSEL_SH     41
+#define MMCR1_GRS_MCSEL_MSK    7
+#define MMCR1_GRS_FABSEL_SH    39
+#define MMCR1_GRS_FABSEL_MSK   3
+#define MMCR1_PMC1_ADDER_SEL_SH        35
+#define MMCR1_PMC2_ADDER_SEL_SH        34
+#define MMCR1_PMC3_ADDER_SEL_SH        33
+#define MMCR1_PMC4_ADDER_SEL_SH        32
+#define MMCR1_PMC1SEL_SH       25
+#define MMCR1_PMC2SEL_SH       17
+#define MMCR1_PMC3SEL_SH       9
+#define MMCR1_PMC4SEL_SH       1
+#define MMCR1_PMCSEL_SH(n)     (MMCR1_PMC1SEL_SH - (n) * 8)
+#define MMCR1_PMCSEL_MSK       0x7f
+
+/*
+ * Bits in MMCRA
+ */
+
+/*
+ * Layout of constraint bits:
+ * 6666555555555544444444443333333333222222222211111111110000000000
+ * 3210987654321098765432109876543210987654321098765432109876543210
+ *         <><>[  ><><>< ><> [  >[ >[ ><  ><  ><  ><  ><><><><><><>
+ *         T0T1 NC G0G1G2 G3  UC PS1PS2 B0  B1  B2  B3 P6P5P4P3P2P1
+ *
+ * T0 - TTM0 constraint
+ *     54-55: TTM0SEL value (0=FPU, 2=IFU, 3=ISU1) 0xc0_0000_0000_0000
+ *
+ * T1 - TTM1 constraint
+ *     52-53: TTM1SEL value (0=IDU, 3=GRS) 0x30_0000_0000_0000
+ *
+ * NC - number of counters
+ *     51: NC error 0x0008_0000_0000_0000
+ *     48-50: number of events needing PMC1-4 0x0007_0000_0000_0000
+ *
+ * G0..G3 - GRS mux constraints
+ *     46-47: GRS_L2SEL value
+ *     44-45: GRS_L3SEL value
+ *     41-44: GRS_MCSEL value
+ *     39-40: GRS_FABSEL value
+ *     Note that these match up with their bit positions in MMCR1
+ *
+ * UC - unit constraint: can't have all three of FPU|IFU|ISU1, ISU0, IDU|GRS
+ *     37: UC3 error 0x20_0000_0000
+ *     36: FPU|IFU|ISU1 events needed 0x10_0000_0000
+ *     35: ISU0 events needed 0x08_0000_0000
+ *     34: IDU|GRS events needed 0x04_0000_0000
+ *
+ * PS1
+ *     33: PS1 error 0x2_0000_0000
+ *     31-32: count of events needing PMC1/2 0x1_8000_0000
+ *
+ * PS2
+ *     30: PS2 error 0x4000_0000
+ *     28-29: count of events needing PMC3/4 0x3000_0000
+ *
+ * B0
+ *     24-27: Byte 0 event source 0x0f00_0000
+ *           Encoding as for the event code
+ *
+ * B1, B2, B3
+ *     20-23, 16-19, 12-15: Byte 1, 2, 3 event sources
+ *
+ * P1..P6
+ *     0-11: Count of events needing PMC1..PMC6
+ */
+
+static const int grsel_shift[8] = {
+       MMCR1_GRS_L2SEL_SH, MMCR1_GRS_L2SEL_SH, MMCR1_GRS_L2SEL_SH,
+       MMCR1_GRS_L3SEL_SH, MMCR1_GRS_L3SEL_SH, MMCR1_GRS_L3SEL_SH,
+       MMCR1_GRS_MCSEL_SH, MMCR1_GRS_FABSEL_SH
+};
+
+/* Masks and values for using events from the various units */
+static u64 unit_cons[PM_LASTUNIT+1][2] = {
+       [PM_FPU] =   { 0xc0002000000000ull, 0x00001000000000ull },
+       [PM_ISU0] =  { 0x00002000000000ull, 0x00000800000000ull },
+       [PM_ISU1] =  { 0xc0002000000000ull, 0xc0001000000000ull },
+       [PM_IFU] =   { 0xc0002000000000ull, 0x80001000000000ull },
+       [PM_IDU] =   { 0x30002000000000ull, 0x00000400000000ull },
+       [PM_GRS] =   { 0x30002000000000ull, 0x30000400000000ull },
+};
+
+static int power5_get_constraint(u64 event, u64 *maskp, u64 *valp)
+{
+       int pmc, byte, unit, sh;
+       int bit, fmask;
+       u64 mask = 0, value = 0;
+       int grp = -1;
+
+       pmc = (event >> PM_PMC_SH) & PM_PMC_MSK;
+       if (pmc) {
+               if (pmc > 6)
+                       return -1;
+               sh = (pmc - 1) * 2;
+               mask |= 2 << sh;
+               value |= 1 << sh;
+               if (pmc <= 4)
+                       grp = (pmc - 1) >> 1;
+               else if (event != 0x500009 && event != 0x600005)
+                       return -1;
+       }
+       if (event & PM_BUSEVENT_MSK) {
+               unit = (event >> PM_UNIT_SH) & PM_UNIT_MSK;
+               if (unit > PM_LASTUNIT)
+                       return -1;
+               if (unit == PM_ISU0_ALT)
+                       unit = PM_ISU0;
+               mask |= unit_cons[unit][0];
+               value |= unit_cons[unit][1];
+               byte = (event >> PM_BYTE_SH) & PM_BYTE_MSK;
+               if (byte >= 4) {
+                       if (unit != PM_LSU1)
+                               return -1;
+                       /* Map LSU1 low word (bytes 4-7) to unit LSU1+1 */
+                       ++unit;
+                       byte &= 3;
+               }
+               if (unit == PM_GRS) {
+                       bit = event & 7;
+                       fmask = (bit == 6)? 7: 3;
+                       sh = grsel_shift[bit];
+                       mask |= (u64)fmask << sh;
+                       value |= (u64)((event >> PM_GRS_SH) & fmask) << sh;
+               }
+               /*
+                * Bus events on bytes 0 and 2 can be counted
+                * on PMC1/2; bytes 1 and 3 on PMC3/4.
+                */
+               if (!pmc)
+                       grp = byte & 1;
+               /* Set byte lane select field */
+               mask  |= 0xfULL << (24 - 4 * byte);
+               value |= (u64)unit << (24 - 4 * byte);
+       }
+       if (grp == 0) {
+               /* increment PMC1/2 field */
+               mask  |= 0x200000000ull;
+               value |= 0x080000000ull;
+       } else if (grp == 1) {
+               /* increment PMC3/4 field */
+               mask  |= 0x40000000ull;
+               value |= 0x10000000ull;
+       }
+       if (pmc < 5) {
+               /* need a counter from PMC1-4 set */
+               mask  |= 0x8000000000000ull;
+               value |= 0x1000000000000ull;
+       }
+       *maskp = mask;
+       *valp = value;
+       return 0;
+}
+
+#define MAX_ALT        3       /* at most 3 alternatives for any event */
+
+static const unsigned int event_alternatives[][MAX_ALT] = {
+       { 0x120e4,  0x400002 },                 /* PM_GRP_DISP_REJECT */
+       { 0x410c7,  0x441084 },                 /* PM_THRD_L2MISS_BOTH_CYC */
+       { 0x100005, 0x600005 },                 /* PM_RUN_CYC */
+       { 0x100009, 0x200009, 0x500009 },       /* PM_INST_CMPL */
+       { 0x300009, 0x400009 },                 /* PM_INST_DISP */
+};
+
+/*
+ * Scan the alternatives table for a match and return the
+ * index into the alternatives table if found, else -1.
+ */
+static int find_alternative(u64 event)
+{
+       int i, j;
+
+       for (i = 0; i < ARRAY_SIZE(event_alternatives); ++i) {
+               if (event < event_alternatives[i][0])
+                       break;
+               for (j = 0; j < MAX_ALT && event_alternatives[i][j]; ++j)
+                       if (event == event_alternatives[i][j])
+                               return i;
+       }
+       return -1;
+}
+
+static const unsigned char bytedecode_alternatives[4][4] = {
+       /* PMC 1 */     { 0x21, 0x23, 0x25, 0x27 },
+       /* PMC 2 */     { 0x07, 0x17, 0x0e, 0x1e },
+       /* PMC 3 */     { 0x20, 0x22, 0x24, 0x26 },
+       /* PMC 4 */     { 0x07, 0x17, 0x0e, 0x1e }
+};
+
+/*
+ * Some direct events for decodes of event bus byte 3 have alternative
+ * PMCSEL values on other counters.  This returns the alternative
+ * event code for those that do, or -1 otherwise.
+ */
+static s64 find_alternative_bdecode(u64 event)
+{
+       int pmc, altpmc, pp, j;
+
+       pmc = (event >> PM_PMC_SH) & PM_PMC_MSK;
+       if (pmc == 0 || pmc > 4)
+               return -1;
+       altpmc = 5 - pmc;       /* 1 <-> 4, 2 <-> 3 */
+       pp = event & PM_PMCSEL_MSK;
+       for (j = 0; j < 4; ++j) {
+               if (bytedecode_alternatives[pmc - 1][j] == pp) {
+                       return (event & ~(PM_PMC_MSKS | PM_PMCSEL_MSK)) |
+                               (altpmc << PM_PMC_SH) |
+                               bytedecode_alternatives[altpmc - 1][j];
+               }
+       }
+       return -1;
+}
+
+static int power5_get_alternatives(u64 event, unsigned int flags, u64 alt[])
+{
+       int i, j, nalt = 1;
+       s64 ae;
+
+       alt[0] = event;
+       nalt = 1;
+       i = find_alternative(event);
+       if (i >= 0) {
+               for (j = 0; j < MAX_ALT; ++j) {
+                       ae = event_alternatives[i][j];
+                       if (ae && ae != event)
+                               alt[nalt++] = ae;
+               }
+       } else {
+               ae = find_alternative_bdecode(event);
+               if (ae > 0)
+                       alt[nalt++] = ae;
+       }
+       return nalt;
+}
+
+/*
+ * Map of which direct events on which PMCs are marked instruction events.
+ * Indexed by PMCSEL value, bit i (LE) set if PMC i is a marked event.
+ * Bit 0 is set if it is marked for all PMCs.
+ * The 0x80 bit indicates a byte decode PMCSEL value.
+ */
+static unsigned char direct_event_is_marked[0x28] = {
+       0,      /* 00 */
+       0x1f,   /* 01 PM_IOPS_CMPL */
+       0x2,    /* 02 PM_MRK_GRP_DISP */
+       0xe,    /* 03 PM_MRK_ST_CMPL, PM_MRK_ST_GPS, PM_MRK_ST_CMPL_INT */
+       0,      /* 04 */
+       0x1c,   /* 05 PM_MRK_BRU_FIN, PM_MRK_INST_FIN, PM_MRK_CRU_FIN */
+       0x80,   /* 06 */
+       0x80,   /* 07 */
+       0, 0, 0,/* 08 - 0a */
+       0x18,   /* 0b PM_THRESH_TIMEO, PM_MRK_GRP_TIMEO */
+       0,      /* 0c */
+       0x80,   /* 0d */
+       0x80,   /* 0e */
+       0,      /* 0f */
+       0,      /* 10 */
+       0x14,   /* 11 PM_MRK_GRP_BR_REDIR, PM_MRK_GRP_IC_MISS */
+       0,      /* 12 */
+       0x10,   /* 13 PM_MRK_GRP_CMPL */
+       0x1f,   /* 14 PM_GRP_MRK, PM_MRK_{FXU,FPU,LSU}_FIN */
+       0x2,    /* 15 PM_MRK_GRP_ISSUED */
+       0x80,   /* 16 */
+       0x80,   /* 17 */
+       0, 0, 0, 0, 0,
+       0x80,   /* 1d */
+       0x80,   /* 1e */
+       0,      /* 1f */
+       0x80,   /* 20 */
+       0x80,   /* 21 */
+       0x80,   /* 22 */
+       0x80,   /* 23 */
+       0x80,   /* 24 */
+       0x80,   /* 25 */
+       0x80,   /* 26 */
+       0x80,   /* 27 */
+};
+
+/*
+ * Returns 1 if event counts things relating to marked instructions
+ * and thus needs the MMCRA_SAMPLE_ENABLE bit set, or 0 if not.
+ */
+static int power5_marked_instr_event(u64 event)
+{
+       int pmc, psel;
+       int bit, byte, unit;
+       u32 mask;
+
+       pmc = (event >> PM_PMC_SH) & PM_PMC_MSK;
+       psel = event & PM_PMCSEL_MSK;
+       if (pmc >= 5)
+               return 0;
+
+       bit = -1;
+       if (psel < sizeof(direct_event_is_marked)) {
+               if (direct_event_is_marked[psel] & (1 << pmc))
+                       return 1;
+               if (direct_event_is_marked[psel] & 0x80)
+                       bit = 4;
+               else if (psel == 0x08)
+                       bit = pmc - 1;
+               else if (psel == 0x10)
+                       bit = 4 - pmc;
+               else if (psel == 0x1b && (pmc == 1 || pmc == 3))
+                       bit = 4;
+       } else if ((psel & 0x58) == 0x40)
+               bit = psel & 7;
+
+       if (!(event & PM_BUSEVENT_MSK))
+               return 0;
+
+       byte = (event >> PM_BYTE_SH) & PM_BYTE_MSK;
+       unit = (event >> PM_UNIT_SH) & PM_UNIT_MSK;
+       if (unit == PM_LSU0) {
+               /* byte 1 bits 0-7, byte 2 bits 0,2-4,6 */
+               mask = 0x5dff00;
+       } else if (unit == PM_LSU1 && byte >= 4) {
+               byte -= 4;
+               /* byte 4 bits 1,3,5,7, byte 5 bits 6-7, byte 7 bits 0-4,6 */
+               mask = 0x5f00c0aa;
+       } else
+               return 0;
+
+       return (mask >> (byte * 8 + bit)) & 1;
+}
+
+static int power5_compute_mmcr(u64 event[], int n_ev,
+                              unsigned int hwc[], u64 mmcr[])
+{
+       u64 mmcr1 = 0;
+       u64 mmcra = 0;
+       unsigned int pmc, unit, byte, psel;
+       unsigned int ttm, grp;
+       int i, isbus, bit, grsel;
+       unsigned int pmc_inuse = 0;
+       unsigned int pmc_grp_use[2];
+       unsigned char busbyte[4];
+       unsigned char unituse[16];
+       int ttmuse;
+
+       if (n_ev > 6)
+               return -1;
+
+       /* First pass to count resource use */
+       pmc_grp_use[0] = pmc_grp_use[1] = 0;
+       memset(busbyte, 0, sizeof(busbyte));
+       memset(unituse, 0, sizeof(unituse));
+       for (i = 0; i < n_ev; ++i) {
+               pmc = (event[i] >> PM_PMC_SH) & PM_PMC_MSK;
+               if (pmc) {
+                       if (pmc > 6)
+                               return -1;
+                       if (pmc_inuse & (1 << (pmc - 1)))
+                               return -1;
+                       pmc_inuse |= 1 << (pmc - 1);
+                       /* count 1/2 vs 3/4 use */
+                       if (pmc <= 4)
+                               ++pmc_grp_use[(pmc - 1) >> 1];
+               }
+               if (event[i] & PM_BUSEVENT_MSK) {
+                       unit = (event[i] >> PM_UNIT_SH) & PM_UNIT_MSK;
+                       byte = (event[i] >> PM_BYTE_SH) & PM_BYTE_MSK;
+                       if (unit > PM_LASTUNIT)
+                               return -1;
+                       if (unit == PM_ISU0_ALT)
+                               unit = PM_ISU0;
+                       if (byte >= 4) {
+                               if (unit != PM_LSU1)
+                                       return -1;
+                               ++unit;
+                               byte &= 3;
+                       }
+                       if (!pmc)
+                               ++pmc_grp_use[byte & 1];
+                       if (busbyte[byte] && busbyte[byte] != unit)
+                               return -1;
+                       busbyte[byte] = unit;
+                       unituse[unit] = 1;
+               }
+       }
+       if (pmc_grp_use[0] > 2 || pmc_grp_use[1] > 2)
+               return -1;
+
+       /*
+        * Assign resources and set multiplexer selects.
+        *
+        * PM_ISU0 can go either on TTM0 or TTM1, but that's the only
+        * choice we have to deal with.
+        */
+       if (unituse[PM_ISU0] &
+           (unituse[PM_FPU] | unituse[PM_IFU] | unituse[PM_ISU1])) {
+               unituse[PM_ISU0_ALT] = 1;       /* move ISU to TTM1 */
+               unituse[PM_ISU0] = 0;
+       }
+       /* Set TTM[01]SEL fields. */
+       ttmuse = 0;
+       for (i = PM_FPU; i <= PM_ISU1; ++i) {
+               if (!unituse[i])
+                       continue;
+               if (ttmuse++)
+                       return -1;
+               mmcr1 |= (u64)i << MMCR1_TTM0SEL_SH;
+       }
+       ttmuse = 0;
+       for (; i <= PM_GRS; ++i) {
+               if (!unituse[i])
+                       continue;
+               if (ttmuse++)
+                       return -1;
+               mmcr1 |= (u64)(i & 3) << MMCR1_TTM1SEL_SH;
+       }
+       if (ttmuse > 1)
+               return -1;
+
+       /* Set byte lane select fields, TTM[23]SEL and GRS_*SEL. */
+       for (byte = 0; byte < 4; ++byte) {
+               unit = busbyte[byte];
+               if (!unit)
+                       continue;
+               if (unit == PM_ISU0 && unituse[PM_ISU0_ALT]) {
+                       /* get ISU0 through TTM1 rather than TTM0 */
+                       unit = PM_ISU0_ALT;
+               } else if (unit == PM_LSU1 + 1) {
+                       /* select lower word of LSU1 for this byte */
+                       mmcr1 |= 1ull << (MMCR1_TTM3SEL_SH + 3 - byte);
+               }
+               ttm = unit >> 2;
+               mmcr1 |= (u64)ttm << (MMCR1_TD_CP_DBG0SEL_SH - 2 * byte);
+       }
+
+       /* Second pass: assign PMCs, set PMCxSEL and PMCx_ADDER_SEL fields */
+       for (i = 0; i < n_ev; ++i) {
+               pmc = (event[i] >> PM_PMC_SH) & PM_PMC_MSK;
+               unit = (event[i] >> PM_UNIT_SH) & PM_UNIT_MSK;
+               byte = (event[i] >> PM_BYTE_SH) & PM_BYTE_MSK;
+               psel = event[i] & PM_PMCSEL_MSK;
+               isbus = event[i] & PM_BUSEVENT_MSK;
+               if (!pmc) {
+                       /* Bus event or any-PMC direct event */
+                       for (pmc = 0; pmc < 4; ++pmc) {
+                               if (pmc_inuse & (1 << pmc))
+                                       continue;
+                               grp = (pmc >> 1) & 1;
+                               if (isbus) {
+                                       if (grp == (byte & 1))
+                                               break;
+                               } else if (pmc_grp_use[grp] < 2) {
+                                       ++pmc_grp_use[grp];
+                                       break;
+                               }
+                       }
+                       pmc_inuse |= 1 << pmc;
+               } else if (pmc <= 4) {
+                       /* Direct event */
+                       --pmc;
+                       if ((psel == 8 || psel == 0x10) && isbus && (byte & 2))
+                               /* add events on higher-numbered bus */
+                               mmcr1 |= 1ull << (MMCR1_PMC1_ADDER_SEL_SH - pmc);
+               } else {
+                       /* Instructions or run cycles on PMC5/6 */
+                       --pmc;
+               }
+               if (isbus && unit == PM_GRS) {
+                       bit = psel & 7;
+                       grsel = (event[i] >> PM_GRS_SH) & PM_GRS_MSK;
+                       mmcr1 |= (u64)grsel << grsel_shift[bit];
+               }
+               if (power5_marked_instr_event(event[i]))
+                       mmcra |= MMCRA_SAMPLE_ENABLE;
+               if (pmc <= 3)
+                       mmcr1 |= psel << MMCR1_PMCSEL_SH(pmc);
+               hwc[i] = pmc;
+       }
+
+       /* Return MMCRx values */
+       mmcr[0] = 0;
+       if (pmc_inuse & 1)
+               mmcr[0] = MMCR0_PMC1CE;
+       if (pmc_inuse & 0x3e)
+               mmcr[0] |= MMCR0_PMCjCE;
+       mmcr[1] = mmcr1;
+       mmcr[2] = mmcra;
+       return 0;
+}
+
+static void power5_disable_pmc(unsigned int pmc, u64 mmcr[])
+{
+       if (pmc <= 3)
+               mmcr[1] &= ~(0x7fUL << MMCR1_PMCSEL_SH(pmc));
+}
+
+static int power5_generic_events[] = {
+       [PERF_COUNT_HW_CPU_CYCLES]              = 0xf,
+       [PERF_COUNT_HW_INSTRUCTIONS]            = 0x100009,
+       [PERF_COUNT_HW_CACHE_REFERENCES]        = 0x4c1090, /* LD_REF_L1 */
+       [PERF_COUNT_HW_CACHE_MISSES]            = 0x3c1088, /* LD_MISS_L1 */
+       [PERF_COUNT_HW_BRANCH_INSTRUCTIONS]     = 0x230e4,  /* BR_ISSUED */
+       [PERF_COUNT_HW_BRANCH_MISSES]           = 0x230e5,  /* BR_MPRED_CR */
+};
+
+#define C(x)   PERF_COUNT_HW_CACHE_##x
+
+/*
+ * Table of generalized cache-related events.
+ * 0 means not supported, -1 means nonsensical, other values
+ * are event codes.
+ */
+static int power5_cache_events[C(MAX)][C(OP_MAX)][C(RESULT_MAX)] = {
+       [C(L1D)] = {            /*      RESULT_ACCESS   RESULT_MISS */
+               [C(OP_READ)] = {        0x4c1090,       0x3c1088        },
+               [C(OP_WRITE)] = {       0x3c1090,       0xc10c3         },
+               [C(OP_PREFETCH)] = {    0xc70e7,        0               },
+       },
+       [C(L1I)] = {            /*      RESULT_ACCESS   RESULT_MISS */
+               [C(OP_READ)] = {        0,              0               },
+               [C(OP_WRITE)] = {       -1,             -1              },
+               [C(OP_PREFETCH)] = {    0,              0               },
+       },
+       [C(LL)] = {             /*      RESULT_ACCESS   RESULT_MISS */
+               [C(OP_READ)] = {        0,              0x3c309b        },
+               [C(OP_WRITE)] = {       0,              0               },
+               [C(OP_PREFETCH)] = {    0xc50c3,        0               },
+       },
+       [C(DTLB)] = {           /*      RESULT_ACCESS   RESULT_MISS */
+               [C(OP_READ)] = {        0x2c4090,       0x800c4         },
+               [C(OP_WRITE)] = {       -1,             -1              },
+               [C(OP_PREFETCH)] = {    -1,             -1              },
+       },
+       [C(ITLB)] = {           /*      RESULT_ACCESS   RESULT_MISS */
+               [C(OP_READ)] = {        0,              0x800c0         },
+               [C(OP_WRITE)] = {       -1,             -1              },
+               [C(OP_PREFETCH)] = {    -1,             -1              },
+       },
+       [C(BPU)] = {            /*      RESULT_ACCESS   RESULT_MISS */
+               [C(OP_READ)] = {        0x230e4,        0x230e5         },
+               [C(OP_WRITE)] = {       -1,             -1              },
+               [C(OP_PREFETCH)] = {    -1,             -1              },
+       },
+};
+
+struct power_pmu power5_pmu = {
+       .n_counter = 6,
+       .max_alternatives = MAX_ALT,
+       .add_fields = 0x7000090000555ull,
+       .test_adder = 0x3000490000000ull,
+       .compute_mmcr = power5_compute_mmcr,
+       .get_constraint = power5_get_constraint,
+       .get_alternatives = power5_get_alternatives,
+       .disable_pmc = power5_disable_pmc,
+       .n_generic = ARRAY_SIZE(power5_generic_events),
+       .generic_events = power5_generic_events,
+       .cache_events = &power5_cache_events,
+};
diff --git a/arch/powerpc/kernel/power6-pmu.c b/arch/powerpc/kernel/power6-pmu.c

new file mode 100644 (file)

index 0000000..46f74be
--- /dev/null
+++ b/arch/powerpc/kernel/power6-pmu.c
@@ -0,0 +1,532 @@
+/*
+ * Performance counter support for POWER6 processors.
+ *
+ * Copyright 2008-2009 Paul Mackerras, IBM Corporation.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+#include <linux/kernel.h>
+#include <linux/perf_counter.h>
+#include <asm/reg.h>
+
+/*
+ * Bits in event code for POWER6
+ */
+#define PM_PMC_SH      20      /* PMC number (1-based) for direct events */
+#define PM_PMC_MSK     0x7
+#define PM_PMC_MSKS    (PM_PMC_MSK << PM_PMC_SH)
+#define PM_UNIT_SH     16      /* Unit event comes (TTMxSEL encoding) */
+#define PM_UNIT_MSK    0xf
+#define PM_UNIT_MSKS   (PM_UNIT_MSK << PM_UNIT_SH)
+#define PM_LLAV                0x8000  /* Load lookahead match value */
+#define PM_LLA         0x4000  /* Load lookahead match enable */
+#define PM_BYTE_SH     12      /* Byte of event bus to use */
+#define PM_BYTE_MSK    3
+#define PM_SUBUNIT_SH  8       /* Subunit event comes from (NEST_SEL enc.) */
+#define PM_SUBUNIT_MSK 7
+#define PM_SUBUNIT_MSKS        (PM_SUBUNIT_MSK << PM_SUBUNIT_SH)
+#define PM_PMCSEL_MSK  0xff    /* PMCxSEL value */
+#define PM_BUSEVENT_MSK        0xf3700
+
+/*
+ * Bits in MMCR1 for POWER6
+ */
+#define MMCR1_TTM0SEL_SH       60
+#define MMCR1_TTMSEL_SH(n)     (MMCR1_TTM0SEL_SH - (n) * 4)
+#define MMCR1_TTMSEL_MSK       0xf
+#define MMCR1_TTMSEL(m, n)     (((m) >> MMCR1_TTMSEL_SH(n)) & MMCR1_TTMSEL_MSK)
+#define MMCR1_NESTSEL_SH       45
+#define MMCR1_NESTSEL_MSK      0x7
+#define MMCR1_NESTSEL(m)       (((m) >> MMCR1_NESTSEL_SH) & MMCR1_NESTSEL_MSK)
+#define MMCR1_PMC1_LLA         ((u64)1 << 44)
+#define MMCR1_PMC1_LLA_VALUE   ((u64)1 << 39)
+#define MMCR1_PMC1_ADDR_SEL    ((u64)1 << 35)
+#define MMCR1_PMC1SEL_SH       24
+#define MMCR1_PMCSEL_SH(n)     (MMCR1_PMC1SEL_SH - (n) * 8)
+#define MMCR1_PMCSEL_MSK       0xff
+
+/*
+ * Map of which direct events on which PMCs are marked instruction events.
+ * Indexed by PMCSEL value >> 1.
+ * Bottom 4 bits are a map of which PMCs are interesting,
+ * top 4 bits say what sort of event:
+ *   0 = direct marked event,
+ *   1 = byte decode event,
+ *   4 = add/and event (PMC1 -> bits 0 & 4),
+ *   5 = add/and event (PMC1 -> bits 1 & 5),
+ *   6 = add/and event (PMC1 -> bits 2 & 6),
+ *   7 = add/and event (PMC1 -> bits 3 & 7).
+ */
+static unsigned char direct_event_is_marked[0x60 >> 1] = {
+       0,      /* 00 */
+       0,      /* 02 */
+       0,      /* 04 */
+       0x07,   /* 06 PM_MRK_ST_CMPL, PM_MRK_ST_GPS, PM_MRK_ST_CMPL_INT */
+       0x04,   /* 08 PM_MRK_DFU_FIN */
+       0x06,   /* 0a PM_MRK_IFU_FIN, PM_MRK_INST_FIN */
+       0,      /* 0c */
+       0,      /* 0e */
+       0x02,   /* 10 PM_MRK_INST_DISP */
+       0x08,   /* 12 PM_MRK_LSU_DERAT_MISS */
+       0,      /* 14 */
+       0,      /* 16 */
+       0x0c,   /* 18 PM_THRESH_TIMEO, PM_MRK_INST_FIN */
+       0x0f,   /* 1a PM_MRK_INST_DISP, PM_MRK_{FXU,FPU,LSU}_FIN */
+       0x01,   /* 1c PM_MRK_INST_ISSUED */
+       0,      /* 1e */
+       0,      /* 20 */
+       0,      /* 22 */
+       0,      /* 24 */
+       0,      /* 26 */
+       0x15,   /* 28 PM_MRK_DATA_FROM_L2MISS, PM_MRK_DATA_FROM_L3MISS */
+       0,      /* 2a */
+       0,      /* 2c */
+       0,      /* 2e */
+       0x4f,   /* 30 */
+       0x7f,   /* 32 */
+       0x4f,   /* 34 */
+       0x5f,   /* 36 */
+       0x6f,   /* 38 */
+       0x4f,   /* 3a */
+       0,      /* 3c */
+       0x08,   /* 3e PM_MRK_INST_TIMEO */
+       0x1f,   /* 40 */
+       0x1f,   /* 42 */
+       0x1f,   /* 44 */
+       0x1f,   /* 46 */
+       0x1f,   /* 48 */
+       0x1f,   /* 4a */
+       0x1f,   /* 4c */
+       0x1f,   /* 4e */
+       0,      /* 50 */
+       0x05,   /* 52 PM_MRK_BR_TAKEN, PM_MRK_BR_MPRED */
+       0x1c,   /* 54 PM_MRK_PTEG_FROM_L3MISS, PM_MRK_PTEG_FROM_L2MISS */
+       0x02,   /* 56 PM_MRK_LD_MISS_L1 */
+       0,      /* 58 */
+       0,      /* 5a */
+       0,      /* 5c */
+       0,      /* 5e */
+};
+
+/*
+ * Masks showing for each unit which bits are marked events.
+ * These masks are in LE order, i.e. 0x00000001 is byte 0, bit 0.
+ */
+static u32 marked_bus_events[16] = {
+       0x01000000,     /* direct events set 1: byte 3 bit 0 */
+       0x00010000,     /* direct events set 2: byte 2 bit 0 */
+       0, 0, 0, 0,     /* IDU, IFU, nest: nothing */
+       0x00000088,     /* VMX set 1: byte 0 bits 3, 7 */
+       0x000000c0,     /* VMX set 2: byte 0 bits 4-7 */
+       0x04010000,     /* LSU set 1: byte 2 bit 0, byte 3 bit 2 */
+       0xff010000u,    /* LSU set 2: byte 2 bit 0, all of byte 3 */
+       0,              /* LSU set 3 */
+       0x00000010,     /* VMX set 3: byte 0 bit 4 */
+       0,              /* BFP set 1 */
+       0x00000022,     /* BFP set 2: byte 0 bits 1, 5 */
+       0, 0
+};
+       
+/*
+ * Returns 1 if event counts things relating to marked instructions
+ * and thus needs the MMCRA_SAMPLE_ENABLE bit set, or 0 if not.
+ */
+static int power6_marked_instr_event(u64 event)
+{
+       int pmc, psel, ptype;
+       int bit, byte, unit;
+       u32 mask;
+
+       pmc = (event >> PM_PMC_SH) & PM_PMC_MSK;
+       psel = (event & PM_PMCSEL_MSK) >> 1;    /* drop edge/level bit */
+       if (pmc >= 5)
+               return 0;
+
+       bit = -1;
+       if (psel < sizeof(direct_event_is_marked)) {
+               ptype = direct_event_is_marked[psel];
+               if (pmc == 0 || !(ptype & (1 << (pmc - 1))))
+                       return 0;
+               ptype >>= 4;
+               if (ptype == 0)
+                       return 1;
+               if (ptype == 1)
+                       bit = 0;
+               else
+                       bit = ptype ^ (pmc - 1);
+       } else if ((psel & 0x48) == 0x40)
+               bit = psel & 7;
+
+       if (!(event & PM_BUSEVENT_MSK) || bit == -1)
+               return 0;
+
+       byte = (event >> PM_BYTE_SH) & PM_BYTE_MSK;
+       unit = (event >> PM_UNIT_SH) & PM_UNIT_MSK;
+       mask = marked_bus_events[unit];
+       return (mask >> (byte * 8 + bit)) & 1;
+}
+
+/*
+ * Assign PMC numbers and compute MMCR1 value for a set of events
+ */
+static int p6_compute_mmcr(u64 event[], int n_ev,
+                          unsigned int hwc[], u64 mmcr[])
+{
+       u64 mmcr1 = 0;
+       u64 mmcra = 0;
+       int i;
+       unsigned int pmc, ev, b, u, s, psel;
+       unsigned int ttmset = 0;
+       unsigned int pmc_inuse = 0;
+
+       if (n_ev > 6)
+               return -1;
+       for (i = 0; i < n_ev; ++i) {
+               pmc = (event[i] >> PM_PMC_SH) & PM_PMC_MSK;
+               if (pmc) {
+                       if (pmc_inuse & (1 << (pmc - 1)))
+                               return -1;      /* collision! */
+                       pmc_inuse |= 1 << (pmc - 1);
+               }
+       }
+       for (i = 0; i < n_ev; ++i) {
+               ev = event[i];
+               pmc = (ev >> PM_PMC_SH) & PM_PMC_MSK;
+               if (pmc) {
+                       --pmc;
+               } else {
+                       /* can go on any PMC; find a free one */
+                       for (pmc = 0; pmc < 4; ++pmc)
+                               if (!(pmc_inuse & (1 << pmc)))
+                                       break;
+                       if (pmc >= 4)
+                               return -1;
+                       pmc_inuse |= 1 << pmc;
+               }
+               hwc[i] = pmc;
+               psel = ev & PM_PMCSEL_MSK;
+               if (ev & PM_BUSEVENT_MSK) {
+                       /* this event uses the event bus */
+                       b = (ev >> PM_BYTE_SH) & PM_BYTE_MSK;
+                       u = (ev >> PM_UNIT_SH) & PM_UNIT_MSK;
+                       /* check for conflict on this byte of event bus */
+                       if ((ttmset & (1 << b)) && MMCR1_TTMSEL(mmcr1, b) != u)
+                               return -1;
+                       mmcr1 |= (u64)u << MMCR1_TTMSEL_SH(b);
+                       ttmset |= 1 << b;
+                       if (u == 5) {
+                               /* Nest events have a further mux */
+                               s = (ev >> PM_SUBUNIT_SH) & PM_SUBUNIT_MSK;
+                               if ((ttmset & 0x10) &&
+                                   MMCR1_NESTSEL(mmcr1) != s)
+                                       return -1;
+                               ttmset |= 0x10;
+                               mmcr1 |= (u64)s << MMCR1_NESTSEL_SH;
+                       }
+                       if (0x30 <= psel && psel <= 0x3d) {
+                               /* these need the PMCx_ADDR_SEL bits */
+                               if (b >= 2)
+                                       mmcr1 |= MMCR1_PMC1_ADDR_SEL >> pmc;
+                       }
+                       /* bus select values are different for PMC3/4 */
+                       if (pmc >= 2 && (psel & 0x90) == 0x80)
+                               psel ^= 0x20;
+               }
+               if (ev & PM_LLA) {
+                       mmcr1 |= MMCR1_PMC1_LLA >> pmc;
+                       if (ev & PM_LLAV)
+                               mmcr1 |= MMCR1_PMC1_LLA_VALUE >> pmc;
+               }
+               if (power6_marked_instr_event(event[i]))
+                       mmcra |= MMCRA_SAMPLE_ENABLE;
+               if (pmc < 4)
+                       mmcr1 |= (u64)psel << MMCR1_PMCSEL_SH(pmc);
+       }
+       mmcr[0] = 0;
+       if (pmc_inuse & 1)
+               mmcr[0] = MMCR0_PMC1CE;
+       if (pmc_inuse & 0xe)
+               mmcr[0] |= MMCR0_PMCjCE;
+       mmcr[1] = mmcr1;
+       mmcr[2] = mmcra;
+       return 0;
+}
+
+/*
+ * Layout of constraint bits:
+ *
+ *     0-1     add field: number of uses of PMC1 (max 1)
+ *     2-3, 4-5, 6-7, 8-9, 10-11: ditto for PMC2, 3, 4, 5, 6
+ *     12-15   add field: number of uses of PMC1-4 (max 4)
+ *     16-19   select field: unit on byte 0 of event bus
+ *     20-23, 24-27, 28-31 ditto for bytes 1, 2, 3
+ *     32-34   select field: nest (subunit) event selector
+ */
+static int p6_get_constraint(u64 event, u64 *maskp, u64 *valp)
+{
+       int pmc, byte, sh, subunit;
+       u64 mask = 0, value = 0;
+
+       pmc = (event >> PM_PMC_SH) & PM_PMC_MSK;
+       if (pmc) {
+               if (pmc > 4 && !(event == 0x500009 || event == 0x600005))
+                       return -1;
+               sh = (pmc - 1) * 2;
+               mask |= 2 << sh;
+               value |= 1 << sh;
+       }
+       if (event & PM_BUSEVENT_MSK) {
+               byte = (event >> PM_BYTE_SH) & PM_BYTE_MSK;
+               sh = byte * 4 + (16 - PM_UNIT_SH);
+               mask |= PM_UNIT_MSKS << sh;
+               value |= (u64)(event & PM_UNIT_MSKS) << sh;
+               if ((event & PM_UNIT_MSKS) == (5 << PM_UNIT_SH)) {
+                       subunit = (event >> PM_SUBUNIT_SH) & PM_SUBUNIT_MSK;
+                       mask  |= (u64)PM_SUBUNIT_MSK << 32;
+                       value |= (u64)subunit << 32;
+               }
+       }
+       if (pmc <= 4) {
+               mask  |= 0x8000;        /* add field for count of PMC1-4 uses */
+               value |= 0x1000;
+       }
+       *maskp = mask;
+       *valp = value;
+       return 0;
+}
+
+static int p6_limited_pmc_event(u64 event)
+{
+       int pmc = (event >> PM_PMC_SH) & PM_PMC_MSK;
+
+       return pmc == 5 || pmc == 6;
+}
+
+#define MAX_ALT        4       /* at most 4 alternatives for any event */
+
+static const unsigned int event_alternatives[][MAX_ALT] = {
+       { 0x0130e8, 0x2000f6, 0x3000fc },       /* PM_PTEG_RELOAD_VALID */
+       { 0x080080, 0x10000d, 0x30000c, 0x4000f0 }, /* PM_LD_MISS_L1 */
+       { 0x080088, 0x200054, 0x3000f0 },       /* PM_ST_MISS_L1 */
+       { 0x10000a, 0x2000f4, 0x600005 },       /* PM_RUN_CYC */
+       { 0x10000b, 0x2000f5 },                 /* PM_RUN_COUNT */
+       { 0x10000e, 0x400010 },                 /* PM_PURR */
+       { 0x100010, 0x4000f8 },                 /* PM_FLUSH */
+       { 0x10001a, 0x200010 },                 /* PM_MRK_INST_DISP */
+       { 0x100026, 0x3000f8 },                 /* PM_TB_BIT_TRANS */
+       { 0x100054, 0x2000f0 },                 /* PM_ST_FIN */
+       { 0x100056, 0x2000fc },                 /* PM_L1_ICACHE_MISS */
+       { 0x1000f0, 0x40000a },                 /* PM_INST_IMC_MATCH_CMPL */
+       { 0x1000f8, 0x200008 },                 /* PM_GCT_EMPTY_CYC */
+       { 0x1000fc, 0x400006 },                 /* PM_LSU_DERAT_MISS_CYC */
+       { 0x20000e, 0x400007 },                 /* PM_LSU_DERAT_MISS */
+       { 0x200012, 0x300012 },                 /* PM_INST_DISP */
+       { 0x2000f2, 0x3000f2 },                 /* PM_INST_DISP */
+       { 0x2000f8, 0x300010 },                 /* PM_EXT_INT */
+       { 0x2000fe, 0x300056 },                 /* PM_DATA_FROM_L2MISS */
+       { 0x2d0030, 0x30001a },                 /* PM_MRK_FPU_FIN */
+       { 0x30000a, 0x400018 },                 /* PM_MRK_INST_FIN */
+       { 0x3000f6, 0x40000e },                 /* PM_L1_DCACHE_RELOAD_VALID */
+       { 0x3000fe, 0x400056 },                 /* PM_DATA_FROM_L3MISS */
+};
+
+/*
+ * This could be made more efficient with a binary search on
+ * a presorted list, if necessary
+ */
+static int find_alternatives_list(u64 event)
+{
+       int i, j;
+       unsigned int alt;
+
+       for (i = 0; i < ARRAY_SIZE(event_alternatives); ++i) {
+               if (event < event_alternatives[i][0])
+                       return -1;
+               for (j = 0; j < MAX_ALT; ++j) {
+                       alt = event_alternatives[i][j];
+                       if (!alt || event < alt)
+                               break;
+                       if (event == alt)
+                               return i;
+               }
+       }
+       return -1;
+}
+
+static int p6_get_alternatives(u64 event, unsigned int flags, u64 alt[])
+{
+       int i, j, nlim;
+       unsigned int psel, pmc;
+       unsigned int nalt = 1;
+       u64 aevent;
+
+       alt[0] = event;
+       nlim = p6_limited_pmc_event(event);
+
+       /* check the alternatives table */
+       i = find_alternatives_list(event);
+       if (i >= 0) {
+               /* copy out alternatives from list */
+               for (j = 0; j < MAX_ALT; ++j) {
+                       aevent = event_alternatives[i][j];
+                       if (!aevent)
+                               break;
+                       if (aevent != event)
+                               alt[nalt++] = aevent;
+                       nlim += p6_limited_pmc_event(aevent);
+               }
+
+       } else {
+               /* Check for alternative ways of computing sum events */
+               /* PMCSEL 0x32 counter N == PMCSEL 0x34 counter 5-N */
+               psel = event & (PM_PMCSEL_MSK & ~1);    /* ignore edge bit */
+               pmc = (event >> PM_PMC_SH) & PM_PMC_MSK;
+               if (pmc && (psel == 0x32 || psel == 0x34))
+                       alt[nalt++] = ((event ^ 0x6) & ~PM_PMC_MSKS) |
+                               ((5 - pmc) << PM_PMC_SH);
+
+               /* PMCSEL 0x38 counter N == PMCSEL 0x3a counter N+/-2 */
+               if (pmc && (psel == 0x38 || psel == 0x3a))
+                       alt[nalt++] = ((event ^ 0x2) & ~PM_PMC_MSKS) |
+                               ((pmc > 2? pmc - 2: pmc + 2) << PM_PMC_SH);
+       }
+
+       if (flags & PPMU_ONLY_COUNT_RUN) {
+               /*
+                * We're only counting in RUN state,
+                * so PM_CYC is equivalent to PM_RUN_CYC,
+                * PM_INST_CMPL === PM_RUN_INST_CMPL, PM_PURR === PM_RUN_PURR.
+                * This doesn't include alternatives that don't provide
+                * any extra flexibility in assigning PMCs (e.g.
+                * 0x10000a for PM_RUN_CYC vs. 0x1e for PM_CYC).
+                * Note that even with these additional alternatives
+                * we never end up with more than 4 alternatives for any event.
+                */
+               j = nalt;
+               for (i = 0; i < nalt; ++i) {
+                       switch (alt[i]) {
+                       case 0x1e:      /* PM_CYC */
+                               alt[j++] = 0x600005;    /* PM_RUN_CYC */
+                               ++nlim;
+                               break;
+                       case 0x10000a:  /* PM_RUN_CYC */
+                               alt[j++] = 0x1e;        /* PM_CYC */
+                               break;
+                       case 2:         /* PM_INST_CMPL */
+                               alt[j++] = 0x500009;    /* PM_RUN_INST_CMPL */
+                               ++nlim;
+                               break;
+                       case 0x500009:  /* PM_RUN_INST_CMPL */
+                               alt[j++] = 2;           /* PM_INST_CMPL */
+                               break;
+                       case 0x10000e:  /* PM_PURR */
+                               alt[j++] = 0x4000f4;    /* PM_RUN_PURR */
+                               break;
+                       case 0x4000f4:  /* PM_RUN_PURR */
+                               alt[j++] = 0x10000e;    /* PM_PURR */
+                               break;
+                       }
+               }
+               nalt = j;
+       }
+
+       if (!(flags & PPMU_LIMITED_PMC_OK) && nlim) {
+               /* remove the limited PMC events */
+               j = 0;
+               for (i = 0; i < nalt; ++i) {
+                       if (!p6_limited_pmc_event(alt[i])) {
+                               alt[j] = alt[i];
+                               ++j;
+                       }
+               }
+               nalt = j;
+       } else if ((flags & PPMU_LIMITED_PMC_REQD) && nlim < nalt) {
+               /* remove all but the limited PMC events */
+               j = 0;
+               for (i = 0; i < nalt; ++i) {
+                       if (p6_limited_pmc_event(alt[i])) {
+                               alt[j] = alt[i];
+                               ++j;
+                       }
+               }
+               nalt = j;
+       }
+
+       return nalt;
+}
+
+static void p6_disable_pmc(unsigned int pmc, u64 mmcr[])
+{
+       /* Set PMCxSEL to 0 to disable PMCx */
+       if (pmc <= 3)
+               mmcr[1] &= ~(0xffUL << MMCR1_PMCSEL_SH(pmc));
+}
+
+static int power6_generic_events[] = {
+       [PERF_COUNT_HW_CPU_CYCLES]              = 0x1e,
+       [PERF_COUNT_HW_INSTRUCTIONS]            = 2,
+       [PERF_COUNT_HW_CACHE_REFERENCES]        = 0x280030, /* LD_REF_L1 */
+       [PERF_COUNT_HW_CACHE_MISSES]            = 0x30000c, /* LD_MISS_L1 */
+       [PERF_COUNT_HW_BRANCH_INSTRUCTIONS]     = 0x410a0,  /* BR_PRED */
+       [PERF_COUNT_HW_BRANCH_MISSES]           = 0x400052, /* BR_MPRED */
+};
+
+#define C(x)   PERF_COUNT_HW_CACHE_##x
+
+/*
+ * Table of generalized cache-related events.
+ * 0 means not supported, -1 means nonsensical, other values
+ * are event codes.
+ * The "DTLB" and "ITLB" events relate to the DERAT and IERAT.
+ */
+static int power6_cache_events[C(MAX)][C(OP_MAX)][C(RESULT_MAX)] = {
+       [C(L1D)] = {            /*      RESULT_ACCESS   RESULT_MISS */
+               [C(OP_READ)] = {        0x80082,        0x80080         },
+               [C(OP_WRITE)] = {       0x80086,        0x80088         },
+               [C(OP_PREFETCH)] = {    0x810a4,        0               },
+       },
+       [C(L1I)] = {            /*      RESULT_ACCESS   RESULT_MISS */
+               [C(OP_READ)] = {        0,              0x100056        },
+               [C(OP_WRITE)] = {       -1,             -1              },
+               [C(OP_PREFETCH)] = {    0x4008c,        0               },
+       },
+       [C(LL)] = {             /*      RESULT_ACCESS   RESULT_MISS */
+               [C(OP_READ)] = {        0x150730,       0x250532        },
+               [C(OP_WRITE)] = {       0x250432,       0x150432        },
+               [C(OP_PREFETCH)] = {    0x810a6,        0               },
+       },
+       [C(DTLB)] = {           /*      RESULT_ACCESS   RESULT_MISS */
+               [C(OP_READ)] = {        0,              0x20000e        },
+               [C(OP_WRITE)] = {       -1,             -1              },
+               [C(OP_PREFETCH)] = {    -1,             -1              },
+       },
+       [C(ITLB)] = {           /*      RESULT_ACCESS   RESULT_MISS */
+               [C(OP_READ)] = {        0,              0x420ce         },
+               [C(OP_WRITE)] = {       -1,             -1              },
+               [C(OP_PREFETCH)] = {    -1,             -1              },
+       },
+       [C(BPU)] = {            /*      RESULT_ACCESS   RESULT_MISS */
+               [C(OP_READ)] = {        0x430e6,        0x400052        },
+               [C(OP_WRITE)] = {       -1,             -1              },
+               [C(OP_PREFETCH)] = {    -1,             -1              },
+       },
+};
+
+struct power_pmu power6_pmu = {
+       .n_counter = 6,
+       .max_alternatives = MAX_ALT,
+       .add_fields = 0x1555,
+       .test_adder = 0x3000,
+       .compute_mmcr = p6_compute_mmcr,
+       .get_constraint = p6_get_constraint,
+       .get_alternatives = p6_get_alternatives,
+       .disable_pmc = p6_disable_pmc,
+       .limited_pmc_event = p6_limited_pmc_event,
+       .flags = PPMU_LIMITED_PMC5_6 | PPMU_ALT_SIPR,
+       .n_generic = ARRAY_SIZE(power6_generic_events),
+       .generic_events = power6_generic_events,
+       .cache_events = &power6_cache_events,
+};
diff --git a/arch/powerpc/kernel/power7-pmu.c b/arch/powerpc/kernel/power7-pmu.c

new file mode 100644 (file)

index 0000000..b3f7d12
--- /dev/null
+++ b/arch/powerpc/kernel/power7-pmu.c
@@ -0,0 +1,357 @@
+/*
+ * Performance counter support for POWER7 processors.
+ *
+ * Copyright 2009 Paul Mackerras, IBM Corporation.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+#include <linux/kernel.h>
+#include <linux/perf_counter.h>
+#include <asm/reg.h>
+
+/*
+ * Bits in event code for POWER7
+ */
+#define PM_PMC_SH      16      /* PMC number (1-based) for direct events */
+#define PM_PMC_MSK     0xf
+#define PM_PMC_MSKS    (PM_PMC_MSK << PM_PMC_SH)
+#define PM_UNIT_SH     12      /* TTMMUX number and setting - unit select */
+#define PM_UNIT_MSK    0xf
+#define PM_COMBINE_SH  11      /* Combined event bit */
+#define PM_COMBINE_MSK 1
+#define PM_COMBINE_MSKS        0x800
+#define PM_L2SEL_SH    8       /* L2 event select */
+#define PM_L2SEL_MSK   7
+#define PM_PMCSEL_MSK  0xff
+
+/*
+ * Bits in MMCR1 for POWER7
+ */
+#define MMCR1_TTM0SEL_SH       60
+#define MMCR1_TTM1SEL_SH       56
+#define MMCR1_TTM2SEL_SH       52
+#define MMCR1_TTM3SEL_SH       48
+#define MMCR1_TTMSEL_MSK       0xf
+#define MMCR1_L2SEL_SH         45
+#define MMCR1_L2SEL_MSK                7
+#define MMCR1_PMC1_COMBINE_SH  35
+#define MMCR1_PMC2_COMBINE_SH  34
+#define MMCR1_PMC3_COMBINE_SH  33
+#define MMCR1_PMC4_COMBINE_SH  32
+#define MMCR1_PMC1SEL_SH       24
+#define MMCR1_PMC2SEL_SH       16
+#define MMCR1_PMC3SEL_SH       8
+#define MMCR1_PMC4SEL_SH       0
+#define MMCR1_PMCSEL_SH(n)     (MMCR1_PMC1SEL_SH - (n) * 8)
+#define MMCR1_PMCSEL_MSK       0xff
+
+/*
+ * Bits in MMCRA
+ */
+
+/*
+ * Layout of constraint bits:
+ * 6666555555555544444444443333333333222222222211111111110000000000
+ * 3210987654321098765432109876543210987654321098765432109876543210
+ *                                                 [  ><><><><><><>
+ *                                                  NC P6P5P4P3P2P1
+ *
+ * NC - number of counters
+ *     15: NC error 0x8000
+ *     12-14: number of events needing PMC1-4 0x7000
+ *
+ * P6
+ *     11: P6 error 0x800
+ *     10-11: Count of events needing PMC6
+ *
+ * P1..P5
+ *     0-9: Count of events needing PMC1..PMC5
+ */
+
+static int power7_get_constraint(u64 event, u64 *maskp, u64 *valp)
+{
+       int pmc, sh;
+       u64 mask = 0, value = 0;
+
+       pmc = (event >> PM_PMC_SH) & PM_PMC_MSK;
+       if (pmc) {
+               if (pmc > 6)
+                       return -1;
+               sh = (pmc - 1) * 2;
+               mask |= 2 << sh;
+               value |= 1 << sh;
+               if (pmc >= 5 && !(event == 0x500fa || event == 0x600f4))
+                       return -1;
+       }
+       if (pmc < 5) {
+               /* need a counter from PMC1-4 set */
+               mask  |= 0x8000;
+               value |= 0x1000;
+       }
+       *maskp = mask;
+       *valp = value;
+       return 0;
+}
+
+#define MAX_ALT        2       /* at most 2 alternatives for any event */
+
+static const unsigned int event_alternatives[][MAX_ALT] = {
+       { 0x200f2, 0x300f2 },           /* PM_INST_DISP */
+       { 0x200f4, 0x600f4 },           /* PM_RUN_CYC */
+       { 0x400fa, 0x500fa },           /* PM_RUN_INST_CMPL */
+};
+
+/*
+ * Scan the alternatives table for a match and return the
+ * index into the alternatives table if found, else -1.
+ */
+static int find_alternative(u64 event)
+{
+       int i, j;
+
+       for (i = 0; i < ARRAY_SIZE(event_alternatives); ++i) {
+               if (event < event_alternatives[i][0])
+                       break;
+               for (j = 0; j < MAX_ALT && event_alternatives[i][j]; ++j)
+                       if (event == event_alternatives[i][j])
+                               return i;
+       }
+       return -1;
+}
+
+static s64 find_alternative_decode(u64 event)
+{
+       int pmc, psel;
+
+       /* this only handles the 4x decode events */
+       pmc = (event >> PM_PMC_SH) & PM_PMC_MSK;
+       psel = event & PM_PMCSEL_MSK;
+       if ((pmc == 2 || pmc == 4) && (psel & ~7) == 0x40)
+               return event - (1 << PM_PMC_SH) + 8;
+       if ((pmc == 1 || pmc == 3) && (psel & ~7) == 0x48)
+               return event + (1 << PM_PMC_SH) - 8;
+       return -1;
+}
+
+static int power7_get_alternatives(u64 event, unsigned int flags, u64 alt[])
+{
+       int i, j, nalt = 1;
+       s64 ae;
+
+       alt[0] = event;
+       nalt = 1;
+       i = find_alternative(event);
+       if (i >= 0) {
+               for (j = 0; j < MAX_ALT; ++j) {
+                       ae = event_alternatives[i][j];
+                       if (ae && ae != event)
+                               alt[nalt++] = ae;
+               }
+       } else {
+               ae = find_alternative_decode(event);
+               if (ae > 0)
+                       alt[nalt++] = ae;
+       }
+
+       if (flags & PPMU_ONLY_COUNT_RUN) {
+               /*
+                * We're only counting in RUN state,
+                * so PM_CYC is equivalent to PM_RUN_CYC
+                * and PM_INST_CMPL === PM_RUN_INST_CMPL.
+                * This doesn't include alternatives that don't provide
+                * any extra flexibility in assigning PMCs.
+                */
+               j = nalt;
+               for (i = 0; i < nalt; ++i) {
+                       switch (alt[i]) {
+                       case 0x1e:      /* PM_CYC */
+                               alt[j++] = 0x600f4;     /* PM_RUN_CYC */
+                               break;
+                       case 0x600f4:   /* PM_RUN_CYC */
+                               alt[j++] = 0x1e;
+                               break;
+                       case 0x2:       /* PM_PPC_CMPL */
+                               alt[j++] = 0x500fa;     /* PM_RUN_INST_CMPL */
+                               break;
+                       case 0x500fa:   /* PM_RUN_INST_CMPL */
+                               alt[j++] = 0x2; /* PM_PPC_CMPL */
+                               break;
+                       }
+               }
+               nalt = j;
+       }
+
+       return nalt;
+}
+
+/*
+ * Returns 1 if event counts things relating to marked instructions
+ * and thus needs the MMCRA_SAMPLE_ENABLE bit set, or 0 if not.
+ */
+static int power7_marked_instr_event(u64 event)
+{
+       int pmc, psel;
+       int unit;
+
+       pmc = (event >> PM_PMC_SH) & PM_PMC_MSK;
+       unit = (event >> PM_UNIT_SH) & PM_UNIT_MSK;
+       psel = event & PM_PMCSEL_MSK & ~1;      /* trim off edge/level bit */
+       if (pmc >= 5)
+               return 0;
+
+       switch (psel >> 4) {
+       case 2:
+               return pmc == 2 || pmc == 4;
+       case 3:
+               if (psel == 0x3c)
+                       return pmc == 1;
+               if (psel == 0x3e)
+                       return pmc != 2;
+               return 1;
+       case 4:
+       case 5:
+               return unit == 0xd;
+       case 6:
+               if (psel == 0x64)
+                       return pmc >= 3;
+       case 8:
+               return unit == 0xd;
+       }
+       return 0;
+}
+
+static int power7_compute_mmcr(u64 event[], int n_ev,
+                              unsigned int hwc[], u64 mmcr[])
+{
+       u64 mmcr1 = 0;
+       u64 mmcra = 0;
+       unsigned int pmc, unit, combine, l2sel, psel;
+       unsigned int pmc_inuse = 0;
+       int i;
+
+       /* First pass to count resource use */
+       for (i = 0; i < n_ev; ++i) {
+               pmc = (event[i] >> PM_PMC_SH) & PM_PMC_MSK;
+               if (pmc) {
+                       if (pmc > 6)
+                               return -1;
+                       if (pmc_inuse & (1 << (pmc - 1)))
+                               return -1;
+                       pmc_inuse |= 1 << (pmc - 1);
+               }
+       }
+
+       /* Second pass: assign PMCs, set all MMCR1 fields */
+       for (i = 0; i < n_ev; ++i) {
+               pmc = (event[i] >> PM_PMC_SH) & PM_PMC_MSK;
+               unit = (event[i] >> PM_UNIT_SH) & PM_UNIT_MSK;
+               combine = (event[i] >> PM_COMBINE_SH) & PM_COMBINE_MSK;
+               l2sel = (event[i] >> PM_L2SEL_SH) & PM_L2SEL_MSK;
+               psel = event[i] & PM_PMCSEL_MSK;
+               if (!pmc) {
+                       /* Bus event or any-PMC direct event */
+                       for (pmc = 0; pmc < 4; ++pmc) {
+                               if (!(pmc_inuse & (1 << pmc)))
+                                       break;
+                       }
+                       if (pmc >= 4)
+                               return -1;
+                       pmc_inuse |= 1 << pmc;
+               } else {
+                       /* Direct or decoded event */
+                       --pmc;
+               }
+               if (pmc <= 3) {
+                       mmcr1 |= (u64) unit << (MMCR1_TTM0SEL_SH - 4 * pmc);
+                       mmcr1 |= (u64) combine << (MMCR1_PMC1_COMBINE_SH - pmc);
+                       mmcr1 |= psel << MMCR1_PMCSEL_SH(pmc);
+                       if (unit == 6)  /* L2 events */
+                               mmcr1 |= (u64) l2sel << MMCR1_L2SEL_SH;
+               }
+               if (power7_marked_instr_event(event[i]))
+                       mmcra |= MMCRA_SAMPLE_ENABLE;
+               hwc[i] = pmc;
+       }
+
+       /* Return MMCRx values */
+       mmcr[0] = 0;
+       if (pmc_inuse & 1)
+               mmcr[0] = MMCR0_PMC1CE;
+       if (pmc_inuse & 0x3e)
+               mmcr[0] |= MMCR0_PMCjCE;
+       mmcr[1] = mmcr1;
+       mmcr[2] = mmcra;
+       return 0;
+}
+
+static void power7_disable_pmc(unsigned int pmc, u64 mmcr[])
+{
+       if (pmc <= 3)
+               mmcr[1] &= ~(0xffULL << MMCR1_PMCSEL_SH(pmc));
+}
+
+static int power7_generic_events[] = {
+       [PERF_COUNT_CPU_CYCLES] = 0x1e,
+       [PERF_COUNT_INSTRUCTIONS] = 2,
+       [PERF_COUNT_CACHE_REFERENCES] = 0xc880,         /* LD_REF_L1_LSU */
+       [PERF_COUNT_CACHE_MISSES] = 0x400f0,            /* LD_MISS_L1 */
+       [PERF_COUNT_BRANCH_INSTRUCTIONS] = 0x10068,     /* BRU_FIN */
+       [PERF_COUNT_BRANCH_MISSES] = 0x400f6,           /* BR_MPRED */
+};
+
+#define C(x)   PERF_COUNT_HW_CACHE_##x
+
+/*
+ * Table of generalized cache-related events.
+ * 0 means not supported, -1 means nonsensical, other values
+ * are event codes.
+ */
+static int power7_cache_events[C(MAX)][C(OP_MAX)][C(RESULT_MAX)] = {
+       [C(L1D)] = {            /*      RESULT_ACCESS   RESULT_MISS */
+               [C(OP_READ)] = {        0x400f0,        0xc880  },
+               [C(OP_WRITE)] = {       0,              0x300f0 },
+               [C(OP_PREFETCH)] = {    0xd8b8,         0       },
+       },
+       [C(L1I)] = {            /*      RESULT_ACCESS   RESULT_MISS */
+               [C(OP_READ)] = {        0,              0x200fc },
+               [C(OP_WRITE)] = {       -1,             -1      },
+               [C(OP_PREFETCH)] = {    0x408a,         0       },
+       },
+       [C(LL)] = {             /*      RESULT_ACCESS   RESULT_MISS */
+               [C(OP_READ)] = {        0x6080,         0x6084  },
+               [C(OP_WRITE)] = {       0x6082,         0x6086  },
+               [C(OP_PREFETCH)] = {    0,              0       },
+       },
+       [C(DTLB)] = {           /*      RESULT_ACCESS   RESULT_MISS */
+               [C(OP_READ)] = {        0,              0x300fc },
+               [C(OP_WRITE)] = {       -1,             -1      },
+               [C(OP_PREFETCH)] = {    -1,             -1      },
+       },
+       [C(ITLB)] = {           /*      RESULT_ACCESS   RESULT_MISS */
+               [C(OP_READ)] = {        0,              0x400fc },
+               [C(OP_WRITE)] = {       -1,             -1      },
+               [C(OP_PREFETCH)] = {    -1,             -1      },
+       },
+       [C(BPU)] = {            /*      RESULT_ACCESS   RESULT_MISS */
+               [C(OP_READ)] = {        0x10068,        0x400f6 },
+               [C(OP_WRITE)] = {       -1,             -1      },
+               [C(OP_PREFETCH)] = {    -1,             -1      },
+       },
+};
+
+struct power_pmu power7_pmu = {
+       .n_counter = 6,
+       .max_alternatives = MAX_ALT + 1,
+       .add_fields = 0x1555ull,
+       .test_adder = 0x3000ull,
+       .compute_mmcr = power7_compute_mmcr,
+       .get_constraint = power7_get_constraint,
+       .get_alternatives = power7_get_alternatives,
+       .disable_pmc = power7_disable_pmc,
+       .n_generic = ARRAY_SIZE(power7_generic_events),
+       .generic_events = power7_generic_events,
+       .cache_events = &power7_cache_events,
+};
diff --git a/arch/powerpc/kernel/ppc970-pmu.c b/arch/powerpc/kernel/ppc970-pmu.c

new file mode 100644 (file)

index 0000000..ba0a357
--- /dev/null
+++ b/arch/powerpc/kernel/ppc970-pmu.c
@@ -0,0 +1,482 @@
+/*
+ * Performance counter support for PPC970-family processors.
+ *
+ * Copyright 2008-2009 Paul Mackerras, IBM Corporation.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+#include <linux/string.h>
+#include <linux/perf_counter.h>
+#include <asm/reg.h>
+
+/*
+ * Bits in event code for PPC970
+ */
+#define PM_PMC_SH      12      /* PMC number (1-based) for direct events */
+#define PM_PMC_MSK     0xf
+#define PM_UNIT_SH     8       /* TTMMUX number and setting - unit select */
+#define PM_UNIT_MSK    0xf
+#define PM_SPCSEL_SH   6
+#define PM_SPCSEL_MSK  3
+#define PM_BYTE_SH     4       /* Byte number of event bus to use */
+#define PM_BYTE_MSK    3
+#define PM_PMCSEL_MSK  0xf
+
+/* Values in PM_UNIT field */
+#define PM_NONE                0
+#define PM_FPU         1
+#define PM_VPU         2
+#define PM_ISU         3
+#define PM_IFU         4
+#define PM_IDU         5
+#define PM_STS         6
+#define PM_LSU0                7
+#define PM_LSU1U       8
+#define PM_LSU1L       9
+#define PM_LASTUNIT    9
+
+/*
+ * Bits in MMCR0 for PPC970
+ */
+#define MMCR0_PMC1SEL_SH       8
+#define MMCR0_PMC2SEL_SH       1
+#define MMCR_PMCSEL_MSK                0x1f
+
+/*
+ * Bits in MMCR1 for PPC970
+ */
+#define MMCR1_TTM0SEL_SH       62
+#define MMCR1_TTM1SEL_SH       59
+#define MMCR1_TTM3SEL_SH       53
+#define MMCR1_TTMSEL_MSK       3
+#define MMCR1_TD_CP_DBG0SEL_SH 50
+#define MMCR1_TD_CP_DBG1SEL_SH 48
+#define MMCR1_TD_CP_DBG2SEL_SH 46
+#define MMCR1_TD_CP_DBG3SEL_SH 44
+#define MMCR1_PMC1_ADDER_SEL_SH        39
+#define MMCR1_PMC2_ADDER_SEL_SH        38
+#define MMCR1_PMC6_ADDER_SEL_SH        37
+#define MMCR1_PMC5_ADDER_SEL_SH        36
+#define MMCR1_PMC8_ADDER_SEL_SH        35
+#define MMCR1_PMC7_ADDER_SEL_SH        34
+#define MMCR1_PMC3_ADDER_SEL_SH        33
+#define MMCR1_PMC4_ADDER_SEL_SH        32
+#define MMCR1_PMC3SEL_SH       27
+#define MMCR1_PMC4SEL_SH       22
+#define MMCR1_PMC5SEL_SH       17
+#define MMCR1_PMC6SEL_SH       12
+#define MMCR1_PMC7SEL_SH       7
+#define MMCR1_PMC8SEL_SH       2
+
+static short mmcr1_adder_bits[8] = {
+       MMCR1_PMC1_ADDER_SEL_SH,
+       MMCR1_PMC2_ADDER_SEL_SH,
+       MMCR1_PMC3_ADDER_SEL_SH,
+       MMCR1_PMC4_ADDER_SEL_SH,
+       MMCR1_PMC5_ADDER_SEL_SH,
+       MMCR1_PMC6_ADDER_SEL_SH,
+       MMCR1_PMC7_ADDER_SEL_SH,
+       MMCR1_PMC8_ADDER_SEL_SH
+};
+
+/*
+ * Bits in MMCRA
+ */
+
+/*
+ * Layout of constraint bits:
+ * 6666555555555544444444443333333333222222222211111111110000000000
+ * 3210987654321098765432109876543210987654321098765432109876543210
+ *               <><><>[  >[  >[  ><  ><  ><  ><  ><><><><><><><><>
+ *               SPT0T1 UC  PS1 PS2 B0  B1  B2  B3 P1P2P3P4P5P6P7P8
+ *
+ * SP - SPCSEL constraint
+ *     48-49: SPCSEL value 0x3_0000_0000_0000
+ *
+ * T0 - TTM0 constraint
+ *     46-47: TTM0SEL value (0=FPU, 2=IFU, 3=VPU) 0xC000_0000_0000
+ *
+ * T1 - TTM1 constraint
+ *     44-45: TTM1SEL value (0=IDU, 3=STS) 0x3000_0000_0000
+ *
+ * UC - unit constraint: can't have all three of FPU|IFU|VPU, ISU, IDU|STS
+ *     43: UC3 error 0x0800_0000_0000
+ *     42: FPU|IFU|VPU events needed 0x0400_0000_0000
+ *     41: ISU events needed 0x0200_0000_0000
+ *     40: IDU|STS events needed 0x0100_0000_0000
+ *
+ * PS1
+ *     39: PS1 error 0x0080_0000_0000
+ *     36-38: count of events needing PMC1/2/5/6 0x0070_0000_0000
+ *
+ * PS2
+ *     35: PS2 error 0x0008_0000_0000
+ *     32-34: count of events needing PMC3/4/7/8 0x0007_0000_0000
+ *
+ * B0
+ *     28-31: Byte 0 event source 0xf000_0000
+ *           Encoding as for the event code
+ *
+ * B1, B2, B3
+ *     24-27, 20-23, 16-19: Byte 1, 2, 3 event sources
+ *
+ * P1
+ *     15: P1 error 0x8000
+ *     14-15: Count of events needing PMC1
+ *
+ * P2..P8
+ *     0-13: Count of events needing PMC2..PMC8
+ */
+
+static unsigned char direct_marked_event[8] = {
+       (1<<2) | (1<<3),        /* PMC1: PM_MRK_GRP_DISP, PM_MRK_ST_CMPL */
+       (1<<3) | (1<<5),        /* PMC2: PM_THRESH_TIMEO, PM_MRK_BRU_FIN */
+       (1<<3) | (1<<5),        /* PMC3: PM_MRK_ST_CMPL_INT, PM_MRK_VMX_FIN */
+       (1<<4) | (1<<5),        /* PMC4: PM_MRK_GRP_CMPL, PM_MRK_CRU_FIN */
+       (1<<4) | (1<<5),        /* PMC5: PM_GRP_MRK, PM_MRK_GRP_TIMEO */
+       (1<<3) | (1<<4) | (1<<5),
+               /* PMC6: PM_MRK_ST_STS, PM_MRK_FXU_FIN, PM_MRK_GRP_ISSUED */
+       (1<<4) | (1<<5),        /* PMC7: PM_MRK_FPU_FIN, PM_MRK_INST_FIN */
+       (1<<4)                  /* PMC8: PM_MRK_LSU_FIN */
+};
+
+/*
+ * Returns 1 if event counts things relating to marked instructions
+ * and thus needs the MMCRA_SAMPLE_ENABLE bit set, or 0 if not.
+ */
+static int p970_marked_instr_event(u64 event)
+{
+       int pmc, psel, unit, byte, bit;
+       unsigned int mask;
+
+       pmc = (event >> PM_PMC_SH) & PM_PMC_MSK;
+       psel = event & PM_PMCSEL_MSK;
+       if (pmc) {
+               if (direct_marked_event[pmc - 1] & (1 << psel))
+                       return 1;
+               if (psel == 0)          /* add events */
+                       bit = (pmc <= 4)? pmc - 1: 8 - pmc;
+               else if (psel == 7 || psel == 13)       /* decode events */
+                       bit = 4;
+               else
+                       return 0;
+       } else
+               bit = psel;
+
+       byte = (event >> PM_BYTE_SH) & PM_BYTE_MSK;
+       unit = (event >> PM_UNIT_SH) & PM_UNIT_MSK;
+       mask = 0;
+       switch (unit) {
+       case PM_VPU:
+               mask = 0x4c;            /* byte 0 bits 2,3,6 */
+       case PM_LSU0:
+               /* byte 2 bits 0,2,3,4,6; all of byte 1 */
+               mask = 0x085dff00;
+       case PM_LSU1L:
+               mask = 0x50 << 24;      /* byte 3 bits 4,6 */
+               break;
+       }
+       return (mask >> (byte * 8 + bit)) & 1;
+}
+
+/* Masks and values for using events from the various units */
+static u64 unit_cons[PM_LASTUNIT+1][2] = {
+       [PM_FPU] =   { 0xc80000000000ull, 0x040000000000ull },
+       [PM_VPU] =   { 0xc80000000000ull, 0xc40000000000ull },
+       [PM_ISU] =   { 0x080000000000ull, 0x020000000000ull },
+       [PM_IFU] =   { 0xc80000000000ull, 0x840000000000ull },
+       [PM_IDU] =   { 0x380000000000ull, 0x010000000000ull },
+       [PM_STS] =   { 0x380000000000ull, 0x310000000000ull },
+};
+
+static int p970_get_constraint(u64 event, u64 *maskp, u64 *valp)
+{
+       int pmc, byte, unit, sh, spcsel;
+       u64 mask = 0, value = 0;
+       int grp = -1;
+
+       pmc = (event >> PM_PMC_SH) & PM_PMC_MSK;
+       if (pmc) {
+               if (pmc > 8)
+                       return -1;
+               sh = (pmc - 1) * 2;
+               mask |= 2 << sh;
+               value |= 1 << sh;
+               grp = ((pmc - 1) >> 1) & 1;
+       }
+       unit = (event >> PM_UNIT_SH) & PM_UNIT_MSK;
+       if (unit) {
+               if (unit > PM_LASTUNIT)
+                       return -1;
+               mask |= unit_cons[unit][0];
+               value |= unit_cons[unit][1];
+               byte = (event >> PM_BYTE_SH) & PM_BYTE_MSK;
+               /*
+                * Bus events on bytes 0 and 2 can be counted
+                * on PMC1/2/5/6; bytes 1 and 3 on PMC3/4/7/8.
+                */
+               if (!pmc)
+                       grp = byte & 1;
+               /* Set byte lane select field */
+               mask  |= 0xfULL << (28 - 4 * byte);
+               value |= (u64)unit << (28 - 4 * byte);
+       }
+       if (grp == 0) {
+               /* increment PMC1/2/5/6 field */
+               mask  |= 0x8000000000ull;
+               value |= 0x1000000000ull;
+       } else if (grp == 1) {
+               /* increment PMC3/4/7/8 field */
+               mask  |= 0x800000000ull;
+               value |= 0x100000000ull;
+       }
+       spcsel = (event >> PM_SPCSEL_SH) & PM_SPCSEL_MSK;
+       if (spcsel) {
+               mask  |= 3ull << 48;
+               value |= (u64)spcsel << 48;
+       }
+       *maskp = mask;
+       *valp = value;
+       return 0;
+}
+
+static int p970_get_alternatives(u64 event, unsigned int flags, u64 alt[])
+{
+       alt[0] = event;
+
+       /* 2 alternatives for LSU empty */
+       if (event == 0x2002 || event == 0x3002) {
+               alt[1] = event ^ 0x1000;
+               return 2;
+       }
+               
+       return 1;
+}
+
+static int p970_compute_mmcr(u64 event[], int n_ev,
+                            unsigned int hwc[], u64 mmcr[])
+{
+       u64 mmcr0 = 0, mmcr1 = 0, mmcra = 0;
+       unsigned int pmc, unit, byte, psel;
+       unsigned int ttm, grp;
+       unsigned int pmc_inuse = 0;
+       unsigned int pmc_grp_use[2];
+       unsigned char busbyte[4];
+       unsigned char unituse[16];
+       unsigned char unitmap[] = { 0, 0<<3, 3<<3, 1<<3, 2<<3, 0|4, 3|4 };
+       unsigned char ttmuse[2];
+       unsigned char pmcsel[8];
+       int i;
+       int spcsel;
+
+       if (n_ev > 8)
+               return -1;
+
+       /* First pass to count resource use */
+       pmc_grp_use[0] = pmc_grp_use[1] = 0;
+       memset(busbyte, 0, sizeof(busbyte));
+       memset(unituse, 0, sizeof(unituse));
+       for (i = 0; i < n_ev; ++i) {
+               pmc = (event[i] >> PM_PMC_SH) & PM_PMC_MSK;
+               if (pmc) {
+                       if (pmc_inuse & (1 << (pmc - 1)))
+                               return -1;
+                       pmc_inuse |= 1 << (pmc - 1);
+                       /* count 1/2/5/6 vs 3/4/7/8 use */
+                       ++pmc_grp_use[((pmc - 1) >> 1) & 1];
+               }
+               unit = (event[i] >> PM_UNIT_SH) & PM_UNIT_MSK;
+               byte = (event[i] >> PM_BYTE_SH) & PM_BYTE_MSK;
+               if (unit) {
+                       if (unit > PM_LASTUNIT)
+                               return -1;
+                       if (!pmc)
+                               ++pmc_grp_use[byte & 1];
+                       if (busbyte[byte] && busbyte[byte] != unit)
+                               return -1;
+                       busbyte[byte] = unit;
+                       unituse[unit] = 1;
+               }
+       }
+       if (pmc_grp_use[0] > 4 || pmc_grp_use[1] > 4)
+               return -1;
+
+       /*
+        * Assign resources and set multiplexer selects.
+        *
+        * PM_ISU can go either on TTM0 or TTM1, but that's the only
+        * choice we have to deal with.
+        */
+       if (unituse[PM_ISU] &
+           (unituse[PM_FPU] | unituse[PM_IFU] | unituse[PM_VPU]))
+               unitmap[PM_ISU] = 2 | 4;        /* move ISU to TTM1 */
+       /* Set TTM[01]SEL fields. */
+       ttmuse[0] = ttmuse[1] = 0;
+       for (i = PM_FPU; i <= PM_STS; ++i) {
+               if (!unituse[i])
+                       continue;
+               ttm = unitmap[i];
+               ++ttmuse[(ttm >> 2) & 1];
+               mmcr1 |= (u64)(ttm & ~4) << MMCR1_TTM1SEL_SH;
+       }
+       /* Check only one unit per TTMx */
+       if (ttmuse[0] > 1 || ttmuse[1] > 1)
+               return -1;
+
+       /* Set byte lane select fields and TTM3SEL. */
+       for (byte = 0; byte < 4; ++byte) {
+               unit = busbyte[byte];
+               if (!unit)
+                       continue;
+               if (unit <= PM_STS)
+                       ttm = (unitmap[unit] >> 2) & 1;
+               else if (unit == PM_LSU0)
+                       ttm = 2;
+               else {
+                       ttm = 3;
+                       if (unit == PM_LSU1L && byte >= 2)
+                               mmcr1 |= 1ull << (MMCR1_TTM3SEL_SH + 3 - byte);
+               }
+               mmcr1 |= (u64)ttm << (MMCR1_TD_CP_DBG0SEL_SH - 2 * byte);
+       }
+
+       /* Second pass: assign PMCs, set PMCxSEL and PMCx_ADDER_SEL fields */
+       memset(pmcsel, 0x8, sizeof(pmcsel));    /* 8 means don't count */
+       for (i = 0; i < n_ev; ++i) {
+               pmc = (event[i] >> PM_PMC_SH) & PM_PMC_MSK;
+               unit = (event[i] >> PM_UNIT_SH) & PM_UNIT_MSK;
+               byte = (event[i] >> PM_BYTE_SH) & PM_BYTE_MSK;
+               psel = event[i] & PM_PMCSEL_MSK;
+               if (!pmc) {
+                       /* Bus event or any-PMC direct event */
+                       if (unit)
+                               psel |= 0x10 | ((byte & 2) << 2);
+                       else
+                               psel |= 8;
+                       for (pmc = 0; pmc < 8; ++pmc) {
+                               if (pmc_inuse & (1 << pmc))
+                                       continue;
+                               grp = (pmc >> 1) & 1;
+                               if (unit) {
+                                       if (grp == (byte & 1))
+                                               break;
+                               } else if (pmc_grp_use[grp] < 4) {
+                                       ++pmc_grp_use[grp];
+                                       break;
+                               }
+                       }
+                       pmc_inuse |= 1 << pmc;
+               } else {
+                       /* Direct event */
+                       --pmc;
+                       if (psel == 0 && (byte & 2))
+                               /* add events on higher-numbered bus */
+                               mmcr1 |= 1ull << mmcr1_adder_bits[pmc];
+               }
+               pmcsel[pmc] = psel;
+               hwc[i] = pmc;
+               spcsel = (event[i] >> PM_SPCSEL_SH) & PM_SPCSEL_MSK;
+               mmcr1 |= spcsel;
+               if (p970_marked_instr_event(event[i]))
+                       mmcra |= MMCRA_SAMPLE_ENABLE;
+       }
+       for (pmc = 0; pmc < 2; ++pmc)
+               mmcr0 |= pmcsel[pmc] << (MMCR0_PMC1SEL_SH - 7 * pmc);
+       for (; pmc < 8; ++pmc)
+               mmcr1 |= (u64)pmcsel[pmc] << (MMCR1_PMC3SEL_SH - 5 * (pmc - 2));
+       if (pmc_inuse & 1)
+               mmcr0 |= MMCR0_PMC1CE;
+       if (pmc_inuse & 0xfe)
+               mmcr0 |= MMCR0_PMCjCE;
+
+       mmcra |= 0x2000;        /* mark only one IOP per PPC instruction */
+
+       /* Return MMCRx values */
+       mmcr[0] = mmcr0;
+       mmcr[1] = mmcr1;
+       mmcr[2] = mmcra;
+       return 0;
+}
+
+static void p970_disable_pmc(unsigned int pmc, u64 mmcr[])
+{
+       int shift, i;
+
+       if (pmc <= 1) {
+               shift = MMCR0_PMC1SEL_SH - 7 * pmc;
+               i = 0;
+       } else {
+               shift = MMCR1_PMC3SEL_SH - 5 * (pmc - 2);
+               i = 1;
+       }
+       /*
+        * Setting the PMCxSEL field to 0x08 disables PMC x.
+        */
+       mmcr[i] = (mmcr[i] & ~(0x1fUL << shift)) | (0x08UL << shift);
+}
+
+static int ppc970_generic_events[] = {
+       [PERF_COUNT_HW_CPU_CYCLES]              = 7,
+       [PERF_COUNT_HW_INSTRUCTIONS]            = 1,
+       [PERF_COUNT_HW_CACHE_REFERENCES]        = 0x8810, /* PM_LD_REF_L1 */
+       [PERF_COUNT_HW_CACHE_MISSES]            = 0x3810, /* PM_LD_MISS_L1 */
+       [PERF_COUNT_HW_BRANCH_INSTRUCTIONS]     = 0x431,  /* PM_BR_ISSUED */
+       [PERF_COUNT_HW_BRANCH_MISSES]           = 0x327,  /* PM_GRP_BR_MPRED */
+};
+
+#define C(x)   PERF_COUNT_HW_CACHE_##x
+
+/*
+ * Table of generalized cache-related events.
+ * 0 means not supported, -1 means nonsensical, other values
+ * are event codes.
+ */
+static int ppc970_cache_events[C(MAX)][C(OP_MAX)][C(RESULT_MAX)] = {
+       [C(L1D)] = {            /*      RESULT_ACCESS   RESULT_MISS */
+               [C(OP_READ)] = {        0x8810,         0x3810  },
+               [C(OP_WRITE)] = {       0x7810,         0x813   },
+               [C(OP_PREFETCH)] = {    0x731,          0       },
+       },
+       [C(L1I)] = {            /*      RESULT_ACCESS   RESULT_MISS */
+               [C(OP_READ)] = {        0,              0       },
+               [C(OP_WRITE)] = {       -1,             -1      },
+               [C(OP_PREFETCH)] = {    0,              0       },
+       },
+       [C(LL)] = {             /*      RESULT_ACCESS   RESULT_MISS */
+               [C(OP_READ)] = {        0,              0       },
+               [C(OP_WRITE)] = {       0,              0       },
+               [C(OP_PREFETCH)] = {    0x733,          0       },
+       },
+       [C(DTLB)] = {           /*      RESULT_ACCESS   RESULT_MISS */
+               [C(OP_READ)] = {        0,              0x704   },
+               [C(OP_WRITE)] = {       -1,             -1      },
+               [C(OP_PREFETCH)] = {    -1,             -1      },
+       },
+       [C(ITLB)] = {           /*      RESULT_ACCESS   RESULT_MISS */
+               [C(OP_READ)] = {        0,              0x700   },
+               [C(OP_WRITE)] = {       -1,             -1      },
+               [C(OP_PREFETCH)] = {    -1,             -1      },
+       },
+       [C(BPU)] = {            /*      RESULT_ACCESS   RESULT_MISS */
+               [C(OP_READ)] = {        0x431,          0x327   },
+               [C(OP_WRITE)] = {       -1,             -1      },
+               [C(OP_PREFETCH)] = {    -1,             -1      },
+       },
+};
+
+struct power_pmu ppc970_pmu = {
+       .n_counter = 8,
+       .max_alternatives = 2,
+       .add_fields = 0x001100005555ull,
+       .test_adder = 0x013300000000ull,
+       .compute_mmcr = p970_compute_mmcr,
+       .get_constraint = p970_get_constraint,
+       .get_alternatives = p970_get_alternatives,
+       .disable_pmc = p970_disable_pmc,
+       .n_generic = ARRAY_SIZE(ppc970_generic_events),
+       .generic_events = ppc970_generic_events,
+       .cache_events = &ppc970_cache_events,
+};
diff --git a/arch/powerpc/mm/fault.c b/arch/powerpc/mm/fault.c

index 76993941cac95a12f7c95972eb55f9e0b00fe769..5beffc8f481e7c6d725b63c82f3aebeed2bde9c2 100644 (file)
--- a/arch/powerpc/mm/fault.c
+++ b/arch/powerpc/mm/fault.c
@@ -29,6 +29,7 @@
  #include <linux/module.h>
  #include <linux/kprobes.h>
  #include <linux/kdebug.h>
+#include <linux/perf_counter.h>
  
  #include <asm/firmware.h>
  #include <asm/page.h>
@@ -170,6 +171,8 @@ int __kprobes do_page_fault(struct pt_regs *regs, unsigned long address,
                 die("Weird page fault", regs, SIGSEGV);
         }
  
+       perf_swcounter_event(PERF_COUNT_SW_PAGE_FAULTS, 1, 0, regs, address);
+
         /* When running in the kernel we expect faults to occur only to
          * addresses in user space.  All other faults represent errors in the
          * kernel and should generate an OOPS.  Unfortunately, in the case of an
@@ -309,6 +312,8 @@ good_area:
         }
         if (ret & VM_FAULT_MAJOR) {
                 current->maj_flt++;
+               perf_swcounter_event(PERF_COUNT_SW_PAGE_FAULTS_MAJ, 1, 0,
+                                    regs, address);
  #ifdef CONFIG_PPC_SMLPAR
                 if (firmware_has_feature(FW_FEATURE_CMO)) {
                         preempt_disable();
@@ -316,8 +321,11 @@ good_area:
                         preempt_enable();
                 }
  #endif
-       } else
+       } else {
                 current->min_flt++;
+               perf_swcounter_event(PERF_COUNT_SW_PAGE_FAULTS_MIN, 1, 0,
+                                    regs, address);
+       }
         up_read(&mm->mmap_sem);
         return 0;
  
diff --git a/arch/powerpc/platforms/Kconfig.cputype b/arch/powerpc/platforms/Kconfig.cputype

index 9da795e49337d0ae14e0357ba00aae37d4f41df1..732ee93a8e988fd594eed27614d2034f2b71ed67 100644 (file)
--- a/arch/powerpc/platforms/Kconfig.cputype
+++ b/arch/powerpc/platforms/Kconfig.cputype
@@ -1,6 +1,7 @@
  config PPC64
         bool "64-bit kernel"
         default n
+       select HAVE_PERF_COUNTERS
         help
           This option selects whether a 32-bit or a 64-bit kernel
           will be built.
diff --git a/arch/powerpc/platforms/pseries/xics.c b/arch/powerpc/platforms/pseries/xics.c

index 80b513449f4c44d3c98915efe233c01caa382f4d..be3581a8c294c8de4d083b1b2c5710381a35c2d9 100644 (file)
--- a/arch/powerpc/platforms/pseries/xics.c
+++ b/arch/powerpc/platforms/pseries/xics.c
@@ -333,7 +333,7 @@ static void xics_eoi_lpar(unsigned int virq)
         lpar_xirr_info_set((0xff << 24) | irq);
  }
  
-static void xics_set_affinity(unsigned int virq, const struct cpumask *cpumask)
+static int xics_set_affinity(unsigned int virq, const struct cpumask *cpumask)
  {
         unsigned int irq;
         int status;
@@ -342,14 +342,14 @@ static void xics_set_affinity(unsigned int virq, const struct cpumask *cpumask)
  
         irq = (unsigned int)irq_map[virq].hwirq;
         if (irq == XICS_IPI || irq == XICS_IRQ_SPURIOUS)
-               return;
+               return -1;
  
         status = rtas_call(ibm_get_xive, 1, 3, xics_status, irq);
  
         if (status) {
                 printk(KERN_ERR "%s: ibm,get-xive irq=%u returns %d\n",
                         __func__, irq, status);
-               return;
+               return -1;
         }
  
         /*
@@ -363,7 +363,7 @@ static void xics_set_affinity(unsigned int virq, const struct cpumask *cpumask)
                 printk(KERN_WARNING
                         "%s: No online cpus in the mask %s for irq %d\n",
                         __func__, cpulist, virq);
-               return;
+               return -1;
         }
  
         status = rtas_call(ibm_set_xive, 3, 1, NULL,
@@ -372,8 +372,10 @@ static void xics_set_affinity(unsigned int virq, const struct cpumask *cpumask)
         if (status) {
                 printk(KERN_ERR "%s: ibm,set-xive irq=%u returns %d\n",
                         __func__, irq, status);
-               return;
+               return -1;
         }
+
+       return 0;
  }
  
  static struct irq_chip xics_pic_direct = {
diff --git a/arch/powerpc/sysdev/mpic.c b/arch/powerpc/sysdev/mpic.c

index 0efc12d1a3d77f98716f500c5ca12f9f923e3e9d..352d8c3ef5269c97e548789edccc49c420214a71 100644 (file)
--- a/arch/powerpc/sysdev/mpic.c
+++ b/arch/powerpc/sysdev/mpic.c
@@ -807,7 +807,7 @@ static void mpic_end_ipi(unsigned int irq)
  
  #endif /* CONFIG_SMP */
  
-void mpic_set_affinity(unsigned int irq, const struct cpumask *cpumask)
+int mpic_set_affinity(unsigned int irq, const struct cpumask *cpumask)
  {
         struct mpic *mpic = mpic_from_irq(irq);
         unsigned int src = mpic_irq_to_hw(irq);
@@ -824,6 +824,8 @@ void mpic_set_affinity(unsigned int irq, const struct cpumask *cpumask)
                 mpic_irq_write(src, MPIC_INFO(IRQ_DESTINATION),
                                mpic_physmask(cpus_addr(tmp)[0]));
         }
+
+       return 0;
  }
  
  static unsigned int mpic_type_to_vecpri(struct mpic *mpic, unsigned int type)
diff --git a/arch/powerpc/sysdev/mpic.h b/arch/powerpc/sysdev/mpic.h

index 3cef2af10f4254daea1ff69769117ffb1f8a23b3..eff433c322a0c55632b7d413acfddc18ed5cbaa8 100644 (file)
--- a/arch/powerpc/sysdev/mpic.h
+++ b/arch/powerpc/sysdev/mpic.h
@@ -36,6 +36,6 @@ static inline int mpic_pasemi_msi_init(struct mpic *mpic)
  
  extern int mpic_set_irq_type(unsigned int virq, unsigned int flow_type);
  extern void mpic_set_vector(unsigned int virq, unsigned int vector);
-extern void mpic_set_affinity(unsigned int irq, const struct cpumask *cpumask);
+extern int mpic_set_affinity(unsigned int irq, const struct cpumask *cpumask);
  
  #endif /* _POWERPC_SYSDEV_MPIC_H */
diff --git a/arch/sparc/include/asm/thread_info_64.h b/arch/sparc/include/asm/thread_info_64.h

index 639ac805448ab500497270013b9176510849062a..65865726b28319238b16f109a032510a14b624e5 100644 (file)
--- a/arch/sparc/include/asm/thread_info_64.h
+++ b/arch/sparc/include/asm/thread_info_64.h
@@ -102,8 +102,8 @@ struct thread_info {
  #define TI_KERN_CNTD1  0x00000488
  #define TI_PCR         0x00000490
  #define TI_RESTART_BLOCK 0x00000498
-#define TI_KUNA_REGS   0x000004c0
-#define TI_KUNA_INSN   0x000004c8
+#define TI_KUNA_REGS   0x000004c8
+#define TI_KUNA_INSN   0x000004d0
  #define TI_FPREGS      0x00000500
  
  /* We embed this in the uppermost byte of thread_info->flags */
diff --git a/arch/sparc/kernel/irq_64.c b/arch/sparc/kernel/irq_64.c

index 5deabe921a47b6ebdcd5f6371faf7eaab3c135e4..e5e78f9cfc95d7104ede2ab767fdd1e5401f623c 100644 (file)
--- a/arch/sparc/kernel/irq_64.c
+++ b/arch/sparc/kernel/irq_64.c
@@ -318,10 +318,12 @@ static void sun4u_irq_enable(unsigned int virt_irq)
         }
  }
  
-static void sun4u_set_affinity(unsigned int virt_irq,
+static int sun4u_set_affinity(unsigned int virt_irq,
                                const struct cpumask *mask)
  {
         sun4u_irq_enable(virt_irq);
+
+       return 0;
  }
  
  /* Don't do anything.  The desc->status check for IRQ_DISABLED in
@@ -377,7 +379,7 @@ static void sun4v_irq_enable(unsigned int virt_irq)
                        ino, err);
  }
  
-static void sun4v_set_affinity(unsigned int virt_irq,
+static int sun4v_set_affinity(unsigned int virt_irq,
                                const struct cpumask *mask)
  {
         unsigned int ino = virt_irq_table[virt_irq].dev_ino;
@@ -388,6 +390,8 @@ static void sun4v_set_affinity(unsigned int virt_irq,
         if (err != HV_EOK)
                 printk(KERN_ERR "sun4v_intr_settarget(%x,%lu): "
                        "err(%d)\n", ino, cpuid, err);
+
+       return 0;
  }
  
  static void sun4v_irq_disable(unsigned int virt_irq)
@@ -445,7 +449,7 @@ static void sun4v_virq_enable(unsigned int virt_irq)
                        dev_handle, dev_ino, err);
  }
  
-static void sun4v_virt_set_affinity(unsigned int virt_irq,
+static int sun4v_virt_set_affinity(unsigned int virt_irq,
                                     const struct cpumask *mask)
  {
         unsigned long cpuid, dev_handle, dev_ino;
@@ -461,6 +465,8 @@ static void sun4v_virt_set_affinity(unsigned int virt_irq,
                 printk(KERN_ERR "sun4v_vintr_set_target(%lx,%lx,%lu): "
                        "err(%d)\n",
                        dev_handle, dev_ino, cpuid, err);
+
+       return 0;
  }
  
  static void sun4v_virq_disable(unsigned int virt_irq)
diff --git a/arch/x86/Kbuild b/arch/x86/Kbuild

new file mode 100644 (file)

index 0000000..ad8ec35
--- /dev/null
+++ b/arch/x86/Kbuild
@@ -0,0 +1,16 @@
+
+obj-$(CONFIG_KVM) += kvm/
+
+# Xen paravirtualization support
+obj-$(CONFIG_XEN) += xen/
+
+# lguest paravirtualization support
+obj-$(CONFIG_LGUEST_GUEST) += lguest/
+
+obj-y += kernel/
+obj-y += mm/
+
+obj-y += crypto/
+obj-y += vdso/
+obj-$(CONFIG_IA32_EMULATION) += ia32/
+
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig

index a6efe0a2e9ae613a81bedec5e4772698d16541cc..68f5578fe38ef2bc366fad67855098c73097f3f1 100644 (file)
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -47,6 +47,11 @@ config X86
         select HAVE_KERNEL_BZIP2
         select HAVE_KERNEL_LZMA
  
+config OUTPUT_FORMAT
+       string
+       default "elf32-i386" if X86_32
+       default "elf64-x86-64" if X86_64
+
  config ARCH_DEFCONFIG
         string
         default "arch/x86/configs/i386_defconfig" if X86_32
@@ -274,15 +279,9 @@ config SPARSE_IRQ
  
           If you don't know what to do here, say N.
  
-config NUMA_MIGRATE_IRQ_DESC
-       bool "Move irq desc when changing irq smp_affinity"
+config NUMA_IRQ_DESC
+       def_bool y
         depends on SPARSE_IRQ && NUMA
-       depends on BROKEN
-       default n
-       ---help---
-         This enables moving irq_desc to cpu/node that irq will use handled.
-
-         If you don't know what to do here, say N.
  
  config X86_MPPARSE
         bool "Enable MPS table" if ACPI
@@ -355,7 +354,7 @@ config X86_UV
         depends on X86_64
         depends on X86_EXTENDED_PLATFORM
         depends on NUMA
-       select X86_X2APIC
+       depends on X86_X2APIC
         ---help---
           This option is needed in order to support SGI Ultraviolet systems.
           If you don't have one of these, you should say N here.
@@ -740,6 +739,7 @@ config X86_UP_IOAPIC
  config X86_LOCAL_APIC
         def_bool y
         depends on X86_64 || SMP || X86_32_NON_STANDARD || X86_UP_APIC
+       select HAVE_PERF_COUNTERS if (!M386 && !M486)
  
  config X86_IO_APIC
         def_bool y
@@ -1466,9 +1466,7 @@ config KEXEC_JUMP
  
  config PHYSICAL_START
         hex "Physical address where the kernel is loaded" if (EMBEDDED || CRASH_DUMP)
-       default "0x1000000" if X86_NUMAQ
-       default "0x200000" if X86_64
-       default "0x100000"
+       default "0x1000000"
         ---help---
           This gives the physical address where the kernel is loaded.
  
@@ -1487,15 +1485,15 @@ config PHYSICAL_START
           to be specifically compiled to run from a specific memory area
           (normally a reserved region) and this option comes handy.
  
-         So if you are using bzImage for capturing the crash dump, leave
-         the value here unchanged to 0x100000 and set CONFIG_RELOCATABLE=y.
-         Otherwise if you plan to use vmlinux for capturing the crash dump
-         change this value to start of the reserved region (Typically 16MB
-         0x1000000). In other words, it can be set based on the "X" value as
-         specified in the "crashkernel=YM@XM" command line boot parameter
-         passed to the panic-ed kernel. Typically this parameter is set as
-         crashkernel=64M@16M. Please take a look at
-         Documentation/kdump/kdump.txt for more details about crash dumps.
+         So if you are using bzImage for capturing the crash dump,
+         leave the value here unchanged to 0x1000000 and set
+         CONFIG_RELOCATABLE=y.  Otherwise if you plan to use vmlinux
+         for capturing the crash dump change this value to start of
+         the reserved region.  In other words, it can be set based on
+         the "X" value as specified in the "crashkernel=YM@XM"
+         command line boot parameter passed to the panic-ed
+         kernel. Please take a look at Documentation/kdump/kdump.txt
+         for more details about crash dumps.
  
           Usage of bzImage for capturing the crash dump is recommended as
           one does not have to build two kernels. Same kernel can be used
@@ -1508,8 +1506,8 @@ config PHYSICAL_START
           Don't change this unless you know what you are doing.
  
  config RELOCATABLE
-       bool "Build a relocatable kernel (EXPERIMENTAL)"
-       depends on EXPERIMENTAL
+       bool "Build a relocatable kernel"
+       default y
         ---help---
           This builds a kernel image that retains relocation information
           so it can be loaded someplace besides the default 1MB.
@@ -1524,12 +1522,16 @@ config RELOCATABLE
           it has been loaded at and the compile time physical address
           (CONFIG_PHYSICAL_START) is ignored.
  
+# Relocation on x86-32 needs some additional build support
+config X86_NEED_RELOCS
+       def_bool y
+       depends on X86_32 && RELOCATABLE
+
  config PHYSICAL_ALIGN
         hex
         prompt "Alignment value to which kernel should be aligned" if X86_32
-       default "0x100000" if X86_32
-       default "0x200000" if X86_64
-       range 0x2000 0x400000
+       default "0x1000000"
+       range 0x2000 0x1000000
         ---help---
           This value puts the alignment restrictions on physical address
           where kernel is loaded and run from. Kernel is compiled for an
diff --git a/arch/x86/Kconfig.debug b/arch/x86/Kconfig.debug

index d8359e73317f8dfb8b929f15f555f5593469e7fd..d105f29bb6bb7c9b75fe3369d66f68f2f3ada5ba 100644 (file)
--- a/arch/x86/Kconfig.debug
+++ b/arch/x86/Kconfig.debug
@@ -159,14 +159,30 @@ config IOMMU_DEBUG
           options. See Documentation/x86_64/boot-options.txt for more
           details.
  
+config IOMMU_STRESS
+       bool "Enable IOMMU stress-test mode"
+       ---help---
+         This option disables various optimizations in IOMMU related
+         code to do real stress testing of the IOMMU code. This option
+         will cause a performance drop and should only be enabled for
+         testing.
+
  config IOMMU_LEAK
         bool "IOMMU leak tracing"
-       depends on DEBUG_KERNEL
-       depends on IOMMU_DEBUG
+       depends on IOMMU_DEBUG && DMA_API_DEBUG
         ---help---
           Add a simple leak tracer to the IOMMU code. This is useful when you
           are debugging a buggy device driver that leaks IOMMU mappings.
  
+config X86_DS_SELFTEST
+    bool "DS selftest"
+    default y
+    depends on DEBUG_KERNEL
+    depends on X86_DS
+       ---help---
+         Perform Debug Store selftests at boot time.
+         If in doubt, say "N".
+
  config HAVE_MMIOTRACE_SUPPORT
         def_bool y
  
diff --git a/arch/x86/Makefile b/arch/x86/Makefile

index 8c86b72afdc2d39bc8957692e35335ebabae82bc..edbd0ca620678fd6627c60d4849a07852a823b11 100644 (file)
--- a/arch/x86/Makefile
+++ b/arch/x86/Makefile
@@ -7,8 +7,6 @@ else
          KBUILD_DEFCONFIG := $(ARCH)_defconfig
  endif
  
-core-$(CONFIG_KVM) += arch/x86/kvm/
-
  # BITS is used as extension for files which are available in a 32 bit
  # and a 64 bit version to simplify shared Makefiles.
  # e.g.: obj-y += foo_$(BITS).o
@@ -118,21 +116,8 @@ head-y += arch/x86/kernel/init_task.o
  
  libs-y  += arch/x86/lib/
  
-# Sub architecture files that needs linking first
-core-y += $(fcore-y)
-
-# Xen paravirtualization support
-core-$(CONFIG_XEN) += arch/x86/xen/
-
-# lguest paravirtualization support
-core-$(CONFIG_LGUEST_GUEST) += arch/x86/lguest/
-
-core-y += arch/x86/kernel/
-core-y += arch/x86/mm/
-
-core-y += arch/x86/crypto/
-core-y += arch/x86/vdso/
-core-$(CONFIG_IA32_EMULATION) += arch/x86/ia32/
+# See arch/x86/Kbuild for content of core part of the kernel
+core-y += arch/x86/
  
  # drivers-y are linked after core-y
  drivers-$(CONFIG_MATH_EMULATION) += arch/x86/math-emu/
diff --git a/arch/x86/boot/.gitignore b/arch/x86/boot/.gitignore

index 172cf8a98bdd2541e0d930e2a92903f5f8f9d448..851fe936d2421a14d7a558e2946321503d73a5b6 100644 (file)
--- a/arch/x86/boot/.gitignore
+++ b/arch/x86/boot/.gitignore
@@ -3,6 +3,8 @@ bzImage
  cpustr.h
  mkcpustr
  offsets.h
+voffset.h
+zoffset.h
  setup
  setup.bin
  setup.elf
diff --git a/arch/x86/boot/Makefile b/arch/x86/boot/Makefile

index 6633b6e7505a68cc32ebc198f9ec6cd46cd0d9c3..8d16ada250480cb57168ea8b600c920f7c1f5192 100644 (file)
--- a/arch/x86/boot/Makefile
+++ b/arch/x86/boot/Makefile
@@ -26,9 +26,10 @@ targets              := vmlinux.bin setup.bin setup.elf bzImage
  targets                += fdimage fdimage144 fdimage288 image.iso mtools.conf
  subdir-                := compressed
  
-setup-y                += a20.o cmdline.o copy.o cpu.o cpucheck.o edd.o
+setup-y                += a20.o bioscall.o cmdline.o copy.o cpu.o cpucheck.o edd.o
  setup-y                += header.o main.o mca.o memory.o pm.o pmjump.o
-setup-y                += printf.o string.o tty.o video.o video-mode.o version.o
+setup-y                += printf.o regs.o string.o tty.o video.o video-mode.o
+setup-y                += version.o
  setup-$(CONFIG_X86_APM_BOOT) += apm.o
  
  # The link order of the video-*.o modules can matter.  In particular,
@@ -86,19 +87,27 @@ $(obj)/vmlinux.bin: $(obj)/compressed/vmlinux FORCE
  
  SETUP_OBJS = $(addprefix $(obj)/,$(setup-y))
  
-sed-offsets := -e 's/^00*/0/' \
-        -e 's/^\([0-9a-fA-F]*\) . \(input_data\|input_data_end\)$$/\#define \2 0x\1/p'
+sed-voffset := -e 's/^\([0-9a-fA-F]*\) . \(_text\|_end\)$$/\#define VO_\2 0x\1/p'
  
-quiet_cmd_offsets = OFFSETS $@
-      cmd_offsets = $(NM) $< | sed -n $(sed-offsets) > $@
+quiet_cmd_voffset = VOFFSET $@
+      cmd_voffset = $(NM) $< | sed -n $(sed-voffset) > $@
  
-$(obj)/offsets.h: $(obj)/compressed/vmlinux FORCE
-       $(call if_changed,offsets)
+targets += voffset.h
+$(obj)/voffset.h: vmlinux FORCE
+       $(call if_changed,voffset)
+
+sed-zoffset := -e 's/^\([0-9a-fA-F]*\) . \(startup_32\|input_data\|_end\|z_.*\)$$/\#define ZO_\2 0x\1/p'
+
+quiet_cmd_zoffset = ZOFFSET $@
+      cmd_zoffset = $(NM) $< | sed -n $(sed-zoffset) > $@
+
+targets += zoffset.h
+$(obj)/zoffset.h: $(obj)/compressed/vmlinux FORCE
+       $(call if_changed,zoffset)
  
-targets += offsets.h
  
  AFLAGS_header.o += -I$(obj)
-$(obj)/header.o: $(obj)/offsets.h
+$(obj)/header.o: $(obj)/voffset.h $(obj)/zoffset.h
  
  LDFLAGS_setup.elf      := -T
  $(obj)/setup.elf: $(src)/setup.ld $(SETUP_OBJS) FORCE
diff --git a/arch/x86/boot/a20.c b/arch/x86/boot/a20.c

index 7c19ce8c2442c219a1e85ee22798a8abaa0bdf2c..64a31a6d751a0132b9b09ea71da1b31b5ec955d8 100644 (file)
--- a/arch/x86/boot/a20.c
+++ b/arch/x86/boot/a20.c
@@ -2,7 +2,7 @@
   *
   *   Copyright (C) 1991, 1992 Linus Torvalds
   *   Copyright 2007-2008 rPath, Inc. - All Rights Reserved
- *   Copyright 2009 Intel Corporation
+ *   Copyright 2009 Intel Corporation; author H. Peter Anvin
   *
   *   This file is part of the Linux kernel, and is made available under
   *   the terms of the GNU General Public License version 2.
@@ -90,8 +90,11 @@ static int a20_test_long(void)
  
  static void enable_a20_bios(void)
  {
-       asm volatile("pushfl; int $0x15; popfl"
-                    : : "a" ((u16)0x2401));
+       struct biosregs ireg;
+
+       initregs(&ireg);
+       ireg.ax = 0x2401;
+       intcall(0x15, &ireg, NULL);
  }
  
  static void enable_a20_kbc(void)
diff --git a/arch/x86/boot/apm.c b/arch/x86/boot/apm.c

index 7aa6033001f9ace30ef1a4ffdb52bfd629a98f26..ee274834ea8ba35cf0a804d8be7cf07fa0077993 100644 (file)
--- a/arch/x86/boot/apm.c
+++ b/arch/x86/boot/apm.c
@@ -2,6 +2,7 @@
   *
   *   Copyright (C) 1991, 1992 Linus Torvalds
   *   Copyright 2007 rPath, Inc. - All Rights Reserved
+ *   Copyright 2009 Intel Corporation; author H. Peter Anvin
   *
   *   Original APM BIOS checking by Stephen Rothwell, May 1994
   *   (sfr@canb.auug.org.au)
@@ -19,75 +20,56 @@
  
  int query_apm_bios(void)
  {
-       u16 ax, bx, cx, dx, di;
-       u32 ebx, esi;
-       u8 err;
+       struct biosregs ireg, oreg;
  
         /* APM BIOS installation check */
-       ax = 0x5300;
-       bx = cx = 0;
-       asm volatile("pushl %%ebp ; int $0x15 ; popl %%ebp ; setc %0"
-                    : "=d" (err), "+a" (ax), "+b" (bx), "+c" (cx)
-                    : : "esi", "edi");
+       initregs(&ireg);
+       ireg.ah = 0x53;
+       intcall(0x15, &ireg, &oreg);
  
-       if (err)
+       if (oreg.flags & X86_EFLAGS_CF)
                 return -1;              /* No APM BIOS */
  
-       if (bx != 0x504d)       /* "PM" signature */
+       if (oreg.bx != 0x504d)          /* "PM" signature */
                 return -1;
  
-       if (!(cx & 0x02))               /* 32 bits supported? */
+       if (!(oreg.cx & 0x02))          /* 32 bits supported? */
                 return -1;
  
         /* Disconnect first, just in case */
-       ax = 0x5304;
-       bx = 0;
-       asm volatile("pushl %%ebp ; int $0x15 ; popl %%ebp"
-                    : "+a" (ax), "+b" (bx)
-                    : : "ecx", "edx", "esi", "edi");
-
-       /* Paranoia */
-       ebx = esi = 0;
-       cx = dx = di = 0;
+       ireg.al = 0x04;
+       intcall(0x15, &ireg, NULL);
  
         /* 32-bit connect */
-       asm volatile("pushl %%ebp ; int $0x15 ; popl %%ebp ; setc %6"
-                    : "=a" (ax), "+b" (ebx), "+c" (cx), "+d" (dx),
-                      "+S" (esi), "+D" (di), "=m" (err)
-                    : "a" (0x5303));
-
-       boot_params.apm_bios_info.cseg = ax;
-       boot_params.apm_bios_info.offset = ebx;
-       boot_params.apm_bios_info.cseg_16 = cx;
-       boot_params.apm_bios_info.dseg = dx;
-       boot_params.apm_bios_info.cseg_len = (u16)esi;
-       boot_params.apm_bios_info.cseg_16_len = esi >> 16;
-       boot_params.apm_bios_info.dseg_len = di;
-
-       if (err)
+       ireg.al = 0x03;
+       intcall(0x15, &ireg, &oreg);
+
+       boot_params.apm_bios_info.cseg        = oreg.ax;
+       boot_params.apm_bios_info.offset      = oreg.ebx;
+       boot_params.apm_bios_info.cseg_16     = oreg.cx;
+       boot_params.apm_bios_info.dseg        = oreg.dx;
+       boot_params.apm_bios_info.cseg_len    = oreg.si;
+       boot_params.apm_bios_info.cseg_16_len = oreg.hsi;
+       boot_params.apm_bios_info.dseg_len    = oreg.di;
+
+       if (oreg.flags & X86_EFLAGS_CF)
                 return -1;
  
         /* Redo the installation check as the 32-bit connect;
            some BIOSes return different flags this way... */
  
-       ax = 0x5300;
-       bx = cx = 0;
-       asm volatile("pushl %%ebp ; int $0x15 ; popl %%ebp ; setc %0"
-                    : "=d" (err), "+a" (ax), "+b" (bx), "+c" (cx)
-                    : : "esi", "edi");
+       ireg.al = 0x00;
+       intcall(0x15, &ireg, &oreg);
  
-       if (err || bx != 0x504d) {
+       if ((oreg.eflags & X86_EFLAGS_CF) || oreg.bx != 0x504d) {
                 /* Failure with 32-bit connect, try to disconect and ignore */
-               ax = 0x5304;
-               bx = 0;
-               asm volatile("pushl %%ebp ; int $0x15 ; popl %%ebp"
-                            : "+a" (ax), "+b" (bx)
-                            : : "ecx", "edx", "esi", "edi");
+               ireg.al = 0x04;
+               intcall(0x15, &ireg, NULL);
                 return -1;
         }
  
-       boot_params.apm_bios_info.version = ax;
-       boot_params.apm_bios_info.flags = cx;
+       boot_params.apm_bios_info.version = oreg.ax;
+       boot_params.apm_bios_info.flags   = oreg.cx;
         return 0;
  }
  
diff --git a/arch/x86/boot/bioscall.S b/arch/x86/boot/bioscall.S

new file mode 100644 (file)

index 0000000..5077937
--- /dev/null
+++ b/arch/x86/boot/bioscall.S
@@ -0,0 +1,82 @@
+/* -----------------------------------------------------------------------
+ *
+ *   Copyright 2009 Intel Corporation; author H. Peter Anvin
+ *
+ *   This file is part of the Linux kernel, and is made available under
+ *   the terms of the GNU General Public License version 2 or (at your
+ *   option) any later version; incorporated herein by reference.
+ *
+ * ----------------------------------------------------------------------- */
+
+/*
+ * "Glove box" for BIOS calls.  Avoids the constant problems with BIOSes
+ * touching registers they shouldn't be.
+ */
+
+       .code16
+       .text
+       .globl  intcall
+       .type   intcall, @function
+intcall:
+       /* Self-modify the INT instruction.  Ugly, but works. */
+       cmpb    %al, 3f
+       je      1f
+       movb    %al, 3f
+       jmp     1f              /* Synchronize pipeline */
+1:
+       /* Save state */
+       pushfl
+       pushw   %fs
+       pushw   %gs
+       pushal
+
+       /* Copy input state to stack frame */
+       subw    $44, %sp
+       movw    %dx, %si
+       movw    %sp, %di
+       movw    $11, %cx
+       rep; movsd
+
+       /* Pop full state from the stack */
+       popal
+       popw    %gs
+       popw    %fs
+       popw    %es
+       popw    %ds
+       popfl
+
+       /* Actual INT */
+       .byte   0xcd            /* INT opcode */
+3:     .byte   0
+
+       /* Push full state to the stack */
+       pushfl
+       pushw   %ds
+       pushw   %es
+       pushw   %fs
+       pushw   %gs
+       pushal
+
+       /* Re-establish C environment invariants */
+       cld
+       movzwl  %sp, %esp
+       movw    %cs, %ax
+       movw    %ax, %ds
+       movw    %ax, %es
+
+       /* Copy output state from stack frame */
+       movw    68(%esp), %di   /* Original %cx == 3rd argument */
+       andw    %di, %di
+       jz      4f
+       movw    %sp, %si
+       movw    $11, %cx
+       rep; movsd
+4:     addw    $44, %sp
+
+       /* Restore state and return */
+       popal
+       popw    %gs
+       popw    %fs
+       popfl
+       retl
+       .size   intcall, .-intcall
diff --git a/arch/x86/boot/boot.h b/arch/x86/boot/boot.h

index 7b2692e897e5fcd75f53073dcf543665d838eb6b..98239d2658f27b3ed3494ea1b0ad3d6cd0a01624 100644 (file)
--- a/arch/x86/boot/boot.h
+++ b/arch/x86/boot/boot.h
@@ -2,6 +2,7 @@
   *
   *   Copyright (C) 1991, 1992 Linus Torvalds
   *   Copyright 2007 rPath, Inc. - All Rights Reserved
+ *   Copyright 2009 Intel Corporation; author H. Peter Anvin
   *
   *   This file is part of the Linux kernel, and is made available under
   *   the terms of the GNU General Public License version 2.
@@ -26,6 +27,7 @@
  #include <asm/setup.h>
  #include "bitops.h"
  #include <asm/cpufeature.h>
+#include <asm/processor-flags.h>
  
  /* Useful macros */
  #define BUILD_BUG_ON(condition) ((void)sizeof(char[1 - 2*!!(condition)]))
@@ -241,6 +243,49 @@ int enable_a20(void);
  /* apm.c */
  int query_apm_bios(void);
  
+/* bioscall.c */
+struct biosregs {
+       union {
+               struct {
+                       u32 edi;
+                       u32 esi;
+                       u32 ebp;
+                       u32 _esp;
+                       u32 ebx;
+                       u32 edx;
+                       u32 ecx;
+                       u32 eax;
+                       u32 _fsgs;
+                       u32 _dses;
+                       u32 eflags;
+               };
+               struct {
+                       u16 di, hdi;
+                       u16 si, hsi;
+                       u16 bp, hbp;
+                       u16 _sp, _hsp;
+                       u16 bx, hbx;
+                       u16 dx, hdx;
+                       u16 cx, hcx;
+                       u16 ax, hax;
+                       u16 gs, fs;
+                       u16 es, ds;
+                       u16 flags, hflags;
+               };
+               struct {
+                       u8 dil, dih, edi2, edi3;
+                       u8 sil, sih, esi2, esi3;
+                       u8 bpl, bph, ebp2, ebp3;
+                       u8 _spl, _sph, _esp2, _esp3;
+                       u8 bl, bh, ebx2, ebx3;
+                       u8 dl, dh, edx2, edx3;
+                       u8 cl, ch, ecx2, ecx3;
+                       u8 al, ah, eax2, eax3;
+               };
+       };
+};
+void intcall(u8 int_no, const struct biosregs *ireg, struct biosregs *oreg);
+
  /* cmdline.c */
  int cmdline_find_option(const char *option, char *buffer, int bufsize);
  int cmdline_find_option_bool(const char *option);
@@ -279,6 +324,9 @@ int sprintf(char *buf, const char *fmt, ...);
  int vsprintf(char *buf, const char *fmt, va_list args);
  int printf(const char *fmt, ...);
  
+/* regs.c */
+void initregs(struct biosregs *regs);
+
  /* string.c */
  int strcmp(const char *str1, const char *str2);
  size_t strnlen(const char *s, size_t maxlen);
diff --git a/arch/x86/boot/compressed/.gitignore b/arch/x86/boot/compressed/.gitignore

index 63eff3b04d0181be4eb6163522989760d82e23b7..4a46fab7162e3b753d143eaedbbf1dcaabf8822d 100644 (file)
--- a/arch/x86/boot/compressed/.gitignore
+++ b/arch/x86/boot/compressed/.gitignore
@@ -1,3 +1,6 @@
  relocs
  vmlinux.bin.all
  vmlinux.relocs
+vmlinux.lds
+mkpiggy
+piggy.S
diff --git a/arch/x86/boot/compressed/Makefile b/arch/x86/boot/compressed/Makefile

index 65551c9f85718baf762d203329b6be42082a2b9c..49c8a4c37d7c3b29bc7f5fb4d0ea660aec6fdf11 100644 (file)
--- a/arch/x86/boot/compressed/Makefile
+++ b/arch/x86/boot/compressed/Makefile
@@ -19,7 +19,9 @@ KBUILD_AFLAGS  := $(KBUILD_CFLAGS) -D__ASSEMBLY__
  LDFLAGS := -m elf_$(UTS_MACHINE)
  LDFLAGS_vmlinux := -T
  
-$(obj)/vmlinux: $(src)/vmlinux_$(BITS).lds $(obj)/head_$(BITS).o $(obj)/misc.o $(obj)/piggy.o FORCE
+hostprogs-y    := mkpiggy
+
+$(obj)/vmlinux: $(obj)/vmlinux.lds $(obj)/head_$(BITS).o $(obj)/misc.o $(obj)/piggy.o FORCE
         $(call if_changed,ld)
         @:
  
@@ -29,7 +31,7 @@ $(obj)/vmlinux.bin: vmlinux FORCE
  
  
  targets += vmlinux.bin.all vmlinux.relocs relocs
-hostprogs-$(CONFIG_X86_32) += relocs
+hostprogs-$(CONFIG_X86_NEED_RELOCS) += relocs
  
  quiet_cmd_relocs = RELOCS  $@
        cmd_relocs = $(obj)/relocs $< > $@;$(obj)/relocs --abs-relocs $<
@@ -37,46 +39,22 @@ $(obj)/vmlinux.relocs: vmlinux $(obj)/relocs FORCE
         $(call if_changed,relocs)
  
  vmlinux.bin.all-y := $(obj)/vmlinux.bin
-vmlinux.bin.all-$(CONFIG_RELOCATABLE) += $(obj)/vmlinux.relocs
-quiet_cmd_relocbin = BUILD   $@
-      cmd_relocbin = cat $(filter-out FORCE,$^) > $@
-$(obj)/vmlinux.bin.all: $(vmlinux.bin.all-y) FORCE
-       $(call if_changed,relocbin)
-
-ifeq ($(CONFIG_X86_32),y)
+vmlinux.bin.all-$(CONFIG_X86_NEED_RELOCS) += $(obj)/vmlinux.relocs
  
-ifdef CONFIG_RELOCATABLE
-$(obj)/vmlinux.bin.gz: $(obj)/vmlinux.bin.all FORCE
-       $(call if_changed,gzip)
-$(obj)/vmlinux.bin.bz2: $(obj)/vmlinux.bin.all FORCE
-       $(call if_changed,bzip2)
-$(obj)/vmlinux.bin.lzma: $(obj)/vmlinux.bin.all FORCE
-       $(call if_changed,lzma)
-else
-$(obj)/vmlinux.bin.gz: $(obj)/vmlinux.bin FORCE
+$(obj)/vmlinux.bin.gz: $(vmlinux.bin.all-y) FORCE
         $(call if_changed,gzip)
-$(obj)/vmlinux.bin.bz2: $(obj)/vmlinux.bin FORCE
+$(obj)/vmlinux.bin.bz2: $(vmlinux.bin.all-y) FORCE
         $(call if_changed,bzip2)
-$(obj)/vmlinux.bin.lzma: $(obj)/vmlinux.bin FORCE
+$(obj)/vmlinux.bin.lzma: $(vmlinux.bin.all-y) FORCE
         $(call if_changed,lzma)
-endif
-LDFLAGS_piggy.o := -r --format binary --oformat elf32-i386 -T
  
-else
+suffix-$(CONFIG_KERNEL_GZIP)   := gz
+suffix-$(CONFIG_KERNEL_BZIP2)  := bz2
+suffix-$(CONFIG_KERNEL_LZMA)   := lzma
  
-$(obj)/vmlinux.bin.gz: $(obj)/vmlinux.bin FORCE
-       $(call if_changed,gzip)
-$(obj)/vmlinux.bin.bz2: $(obj)/vmlinux.bin FORCE
-       $(call if_changed,bzip2)
-$(obj)/vmlinux.bin.lzma: $(obj)/vmlinux.bin FORCE
-       $(call if_changed,lzma)
-
-LDFLAGS_piggy.o := -r --format binary --oformat elf64-x86-64 -T
-endif
+quiet_cmd_mkpiggy = MKPIGGY $@
+      cmd_mkpiggy = $(obj)/mkpiggy $< > $@ || ( rm -f $@ ; false )
  
-suffix_$(CONFIG_KERNEL_GZIP)  = gz
-suffix_$(CONFIG_KERNEL_BZIP2) = bz2
-suffix_$(CONFIG_KERNEL_LZMA)  = lzma
-
-$(obj)/piggy.o: $(obj)/vmlinux.scr $(obj)/vmlinux.bin.$(suffix_y) FORCE
-       $(call if_changed,ld)
+targets += piggy.S
+$(obj)/piggy.S: $(obj)/vmlinux.bin.$(suffix-y) $(obj)/mkpiggy FORCE
+       $(call if_changed,mkpiggy)
diff --git a/arch/x86/boot/compressed/head_32.S b/arch/x86/boot/compressed/head_32.S

index 3a8a866fb2e291e958478cb741a56d16b1016e5f..75e4f001e7061e5d3036197ac202dbee8e3d07a2 100644 (file)
--- a/arch/x86/boot/compressed/head_32.S
+++ b/arch/x86/boot/compressed/head_32.S
@@ -12,16 +12,16 @@
   * the page directory. [According to comments etc elsewhere on a compressed
   * kernel it will end up at 0x1000 + 1Mb I hope so as I assume this. - AC]
   *
- * Page 0 is deliberately kept safe, since System Management Mode code in 
+ * Page 0 is deliberately kept safe, since System Management Mode code in
   * laptops may need to access the BIOS data stored there.  This is also
- * useful for future device drivers that either access the BIOS via VM86 
+ * useful for future device drivers that either access the BIOS via VM86
   * mode.
   */
  
  /*
   * High loaded stuff by Hans Lermen & Werner Almesberger, Feb. 1996
   */
-.text
+       .text
  
  #include <linux/linkage.h>
  #include <asm/segment.h>
@@ -29,161 +29,151 @@
  #include <asm/boot.h>
  #include <asm/asm-offsets.h>
  
-.section ".text.head","ax",@progbits
+       .section ".text.head","ax",@progbits
  ENTRY(startup_32)
         cld
-       /* test KEEP_SEGMENTS flag to see if the bootloader is asking
-        * us to not reload segments */
-       testb $(1<<6), BP_loadflags(%esi)
-       jnz 1f
+       /*
+        * Test KEEP_SEGMENTS flag to see if the bootloader is asking
+        * us to not reload segments
+        */
+       testb   $(1<<6), BP_loadflags(%esi)
+       jnz     1f
  
         cli
-       movl $(__BOOT_DS),%eax
-       movl %eax,%ds
-       movl %eax,%es
-       movl %eax,%fs
-       movl %eax,%gs
-       movl %eax,%ss
+       movl    $__BOOT_DS, %eax
+       movl    %eax, %ds
+       movl    %eax, %es
+       movl    %eax, %fs
+       movl    %eax, %gs
+       movl    %eax, %ss
  1:
  
-/* Calculate the delta between where we were compiled to run
+/*
+ * Calculate the delta between where we were compiled to run
   * at and where we were actually loaded at.  This can only be done
   * with a short local call on x86.  Nothing  else will tell us what
   * address we are running at.  The reserved chunk of the real-mode
   * data at 0x1e4 (defined as a scratch field) are used as the stack
   * for this calculation. Only 4 bytes are needed.
   */
-       leal (0x1e4+4)(%esi), %esp
-       call 1f
-1:     popl %ebp
-       subl $1b, %ebp
+       leal    (BP_scratch+4)(%esi), %esp
+       call    1f
+1:     popl    %ebp
+       subl    $1b, %ebp
  
-/* %ebp contains the address we are loaded at by the boot loader and %ebx
+/*
+ * %ebp contains the address we are loaded at by the boot loader and %ebx
   * contains the address where we should move the kernel image temporarily
   * for safe in-place decompression.
   */
  
  #ifdef CONFIG_RELOCATABLE
-       movl    %ebp, %ebx
-       addl    $(CONFIG_PHYSICAL_ALIGN - 1), %ebx
-       andl    $(~(CONFIG_PHYSICAL_ALIGN - 1)), %ebx
+       movl    %ebp, %ebx
+       movl    BP_kernel_alignment(%esi), %eax
+       decl    %eax
+       addl    %eax, %ebx
+       notl    %eax
+       andl    %eax, %ebx
  #else
-       movl $LOAD_PHYSICAL_ADDR, %ebx
+       movl    $LOAD_PHYSICAL_ADDR, %ebx
  #endif
  
-       /* Replace the compressed data size with the uncompressed size */
-       subl input_len(%ebp), %ebx
-       movl output_len(%ebp), %eax
-       addl %eax, %ebx
-       /* Add 8 bytes for every 32K input block */
-       shrl $12, %eax
-       addl %eax, %ebx
-       /* Add 32K + 18 bytes of extra slack */
-       addl $(32768 + 18), %ebx
-       /* Align on a 4K boundary */
-       addl $4095, %ebx
-       andl $~4095, %ebx
-
-/* Copy the compressed kernel to the end of our buffer
+       /* Target address to relocate to for decompression */
+       addl    $z_extract_offset, %ebx
+
+       /* Set up the stack */
+       leal    boot_stack_end(%ebx), %esp
+
+       /* Zero EFLAGS */
+       pushl   $0
+       popfl
+
+/*
+ * Copy the compressed kernel to the end of our buffer
   * where decompression in place becomes safe.
   */
-       pushl %esi
-       leal _end(%ebp), %esi
-       leal _end(%ebx), %edi
-       movl $(_end - startup_32), %ecx
+       pushl   %esi
+       leal    (_bss-4)(%ebp), %esi
+       leal    (_bss-4)(%ebx), %edi
+       movl    $(_bss - startup_32), %ecx
+       shrl    $2, %ecx
         std
-       rep
-       movsb
+       rep     movsl
         cld
-       popl %esi
-
-/* Compute the kernel start address.
- */
-#ifdef CONFIG_RELOCATABLE
-       addl    $(CONFIG_PHYSICAL_ALIGN - 1), %ebp
-       andl    $(~(CONFIG_PHYSICAL_ALIGN - 1)), %ebp
-#else
-       movl    $LOAD_PHYSICAL_ADDR, %ebp
-#endif
+       popl    %esi
  
  /*
   * Jump to the relocated address.
   */
-       leal relocated(%ebx), %eax
-       jmp *%eax
+       leal    relocated(%ebx), %eax
+       jmp     *%eax
  ENDPROC(startup_32)
  
-.section ".text"
+       .text
  relocated:
  
  /*
- * Clear BSS
- */
-       xorl %eax,%eax
-       leal _edata(%ebx),%edi
-       leal _end(%ebx), %ecx
-       subl %edi,%ecx
-       cld
-       rep
-       stosb
-
-/*
- * Setup the stack for the decompressor
+ * Clear BSS (stack is currently empty)
   */
-       leal boot_stack_end(%ebx), %esp
+       xorl    %eax, %eax
+       leal    _bss(%ebx), %edi
+       leal    _ebss(%ebx), %ecx
+       subl    %edi, %ecx
+       shrl    $2, %ecx
+       rep     stosl
  
  /*
   * Do the decompression, and jump to the new kernel..
   */
-       movl output_len(%ebx), %eax
-       pushl %eax
-                       # push arguments for decompress_kernel:
-       pushl %ebp      # output address
-       movl input_len(%ebx), %eax
-       pushl %eax      # input_len
-       leal input_data(%ebx), %eax
-       pushl %eax      # input_data
-       leal boot_heap(%ebx), %eax
-       pushl %eax      # heap area
-       pushl %esi      # real mode pointer
-       call decompress_kernel
-       addl $20, %esp
-       popl %ecx
+       leal    z_extract_offset_negative(%ebx), %ebp
+                               /* push arguments for decompress_kernel: */
+       pushl   %ebp            /* output address */
+       pushl   $z_input_len    /* input_len */
+       leal    input_data(%ebx), %eax
+       pushl   %eax            /* input_data */
+       leal    boot_heap(%ebx), %eax
+       pushl   %eax            /* heap area */
+       pushl   %esi            /* real mode pointer */
+       call    decompress_kernel
+       addl    $20, %esp
  
  #if CONFIG_RELOCATABLE
-/* Find the address of the relocations.
+/*
+ * Find the address of the relocations.
   */
-       movl %ebp, %edi
-       addl %ecx, %edi
+       leal    z_output_len(%ebp), %edi
  
-/* Calculate the delta between where vmlinux was compiled to run
+/*
+ * Calculate the delta between where vmlinux was compiled to run
   * and where it was actually loaded.
   */
-       movl %ebp, %ebx
-       subl $LOAD_PHYSICAL_ADDR, %ebx
-       jz   2f         /* Nothing to be done if loaded at compiled addr. */
+       movl    %ebp, %ebx
+       subl    $LOAD_PHYSICAL_ADDR, %ebx
+       jz      2f      /* Nothing to be done if loaded at compiled addr. */
  /*
   * Process relocations.
   */
  
-1:     subl $4, %edi
-       movl 0(%edi), %ecx
-       testl %ecx, %ecx
-       jz 2f
-       addl %ebx, -__PAGE_OFFSET(%ebx, %ecx)
-       jmp 1b
+1:     subl    $4, %edi
+       movl    (%edi), %ecx
+       testl   %ecx, %ecx
+       jz      2f
+       addl    %ebx, -__PAGE_OFFSET(%ebx, %ecx)
+       jmp     1b
  2:
  #endif
  
  /*
   * Jump to the decompressed kernel.
   */
-       xorl %ebx,%ebx
-       jmp *%ebp
+       xorl    %ebx, %ebx
+       jmp     *%ebp
  
-.bss
-/* Stack and heap for uncompression */
-.balign 4
+/*
+ * Stack and heap for uncompression
+ */
+       .bss
+       .balign 4
  boot_heap:
         .fill BOOT_HEAP_SIZE, 1, 0
  boot_stack:
diff --git a/arch/x86/boot/compressed/head_64.S b/arch/x86/boot/compressed/head_64.S

index ed4a8294800268c25667b5ede9bc42e2cdd229c9..f62c284db9eb5c94edf1df794b26e13ace0b4113 100644 (file)
--- a/arch/x86/boot/compressed/head_64.S
+++ b/arch/x86/boot/compressed/head_64.S
@@ -21,8 +21,8 @@
  /*
   * High loaded stuff by Hans Lermen & Werner Almesberger, Feb. 1996
   */
-.code32
-.text
+       .code32
+       .text
  
  #include <linux/linkage.h>
  #include <asm/segment.h>
@@ -33,12 +33,14 @@
  #include <asm/processor-flags.h>
  #include <asm/asm-offsets.h>
  
-.section ".text.head"
+       .section ".text.head"
         .code32
  ENTRY(startup_32)
         cld
-       /* test KEEP_SEGMENTS flag to see if the bootloader is asking
-        * us to not reload segments */
+       /*
+        * Test KEEP_SEGMENTS flag to see if the bootloader is asking
+        * us to not reload segments
+        */
         testb $(1<<6), BP_loadflags(%esi)
         jnz 1f
  
@@ -49,14 +51,15 @@ ENTRY(startup_32)
         movl    %eax, %ss
  1:
  
-/* Calculate the delta between where we were compiled to run
+/*
+ * Calculate the delta between where we were compiled to run
   * at and where we were actually loaded at.  This can only be done
   * with a short local call on x86.  Nothing  else will tell us what
   * address we are running at.  The reserved chunk of the real-mode
   * data at 0x1e4 (defined as a scratch field) are used as the stack
   * for this calculation. Only 4 bytes are needed.
   */
-       leal    (0x1e4+4)(%esi), %esp
+       leal    (BP_scratch+4)(%esi), %esp
         call    1f
  1:     popl    %ebp
         subl    $1b, %ebp
@@ -70,32 +73,28 @@ ENTRY(startup_32)
         testl   %eax, %eax
         jnz     no_longmode
  
-/* Compute the delta between where we were compiled to run at
+/*
+ * Compute the delta between where we were compiled to run at
   * and where the code will actually run at.
- */
-/* %ebp contains the address we are loaded at by the boot loader and %ebx
+ *
+ * %ebp contains the address we are loaded at by the boot loader and %ebx
   * contains the address where we should move the kernel image temporarily
   * for safe in-place decompression.
   */
  
  #ifdef CONFIG_RELOCATABLE
         movl    %ebp, %ebx
-       addl    $(PMD_PAGE_SIZE -1), %ebx
-       andl    $PMD_PAGE_MASK, %ebx
+       movl    BP_kernel_alignment(%esi), %eax
+       decl    %eax
+       addl    %eax, %ebx
+       notl    %eax
+       andl    %eax, %ebx
  #else
-       movl    $CONFIG_PHYSICAL_START, %ebx
+       movl    $LOAD_PHYSICAL_ADDR, %ebx
  #endif
  
-       /* Replace the compressed data size with the uncompressed size */
-       subl    input_len(%ebp), %ebx
-       movl    output_len(%ebp), %eax
-       addl    %eax, %ebx
-       /* Add 8 bytes for every 32K input block */
-       shrl    $12, %eax
-       addl    %eax, %ebx
-       /* Add 32K + 18 bytes of extra slack and align on a 4K boundary */
-       addl    $(32768 + 18 + 4095), %ebx
-       andl    $~4095, %ebx
+       /* Target address to relocate to for decompression */
+       addl    $z_extract_offset, %ebx
  
  /*
   * Prepare for entering 64 bit mode
@@ -114,7 +113,7 @@ ENTRY(startup_32)
   /*
    * Build early 4G boot pagetable
    */
-       /* Initialize Page tables to 0*/
+       /* Initialize Page tables to 0 */
         leal    pgtable(%ebx), %edi
         xorl    %eax, %eax
         movl    $((4096*6)/4), %ecx
@@ -155,7 +154,8 @@ ENTRY(startup_32)
         btsl    $_EFER_LME, %eax
         wrmsr
  
-       /* Setup for the jump to 64bit mode
+       /*
+        * Setup for the jump to 64bit mode
          *
          * When the jump is performend we will be in long mode but
          * in 32bit compatibility mode with EFER.LME = 1, CS.L = 0, CS.D = 1
@@ -184,7 +184,8 @@ no_longmode:
  
  #include "../../kernel/verify_cpu_64.S"
  
-       /* Be careful here startup_64 needs to be at a predictable
+       /*
+        * Be careful here startup_64 needs to be at a predictable
          * address so I can export it in an ELF header.  Bootloaders
          * should look at the ELF header to find this address, as
          * it may change in the future.
@@ -192,7 +193,8 @@ no_longmode:
         .code64
         .org 0x200
  ENTRY(startup_64)
-       /* We come here either from startup_32 or directly from a
+       /*
+        * We come here either from startup_32 or directly from a
          * 64bit bootloader.  If we come here from a bootloader we depend on
          * an identity mapped page table being provied that maps our
          * entire text+data+bss and hopefully all of memory.
@@ -209,50 +211,54 @@ ENTRY(startup_64)
         movl    $0x20, %eax
         ltr     %ax
  
-       /* Compute the decompressed kernel start address.  It is where
+       /*
+        * Compute the decompressed kernel start address.  It is where
          * we were loaded at aligned to a 2M boundary. %rbp contains the
          * decompressed kernel start address.
          *
          * If it is a relocatable kernel then decompress and run the kernel
          * from load address aligned to 2MB addr, otherwise decompress and
-        * run the kernel from CONFIG_PHYSICAL_START
+        * run the kernel from LOAD_PHYSICAL_ADDR
+        *
+        * We cannot rely on the calculation done in 32-bit mode, since we
+        * may have been invoked via the 64-bit entry point.
          */
  
         /* Start with the delta to where the kernel will run at. */
  #ifdef CONFIG_RELOCATABLE
         leaq    startup_32(%rip) /* - $startup_32 */, %rbp
-       addq    $(PMD_PAGE_SIZE - 1), %rbp
-       andq    $PMD_PAGE_MASK, %rbp
-       movq    %rbp, %rbx
+       movl    BP_kernel_alignment(%rsi), %eax
+       decl    %eax
+       addq    %rax, %rbp
+       notq    %rax
+       andq    %rax, %rbp
  #else
-       movq    $CONFIG_PHYSICAL_START, %rbp
-       movq    %rbp, %rbx
+       movq    $LOAD_PHYSICAL_ADDR, %rbp
  #endif
  
-       /* Replace the compressed data size with the uncompressed size */
-       movl    input_len(%rip), %eax
-       subq    %rax, %rbx
-       movl    output_len(%rip), %eax
-       addq    %rax, %rbx
-       /* Add 8 bytes for every 32K input block */
-       shrq    $12, %rax
-       addq    %rax, %rbx
-       /* Add 32K + 18 bytes of extra slack and align on a 4K boundary */
-       addq    $(32768 + 18 + 4095), %rbx
-       andq    $~4095, %rbx
-
-/* Copy the compressed kernel to the end of our buffer
+       /* Target address to relocate to for decompression */
+       leaq    z_extract_offset(%rbp), %rbx
+
+       /* Set up the stack */
+       leaq    boot_stack_end(%rbx), %rsp
+
+       /* Zero EFLAGS */
+       pushq   $0
+       popfq
+
+/*
+ * Copy the compressed kernel to the end of our buffer
   * where decompression in place becomes safe.
   */
-       leaq    _end_before_pgt(%rip), %r8
-       leaq    _end_before_pgt(%rbx), %r9
-       movq    $_end_before_pgt /* - $startup_32 */, %rcx
-1:     subq    $8, %r8
-       subq    $8, %r9
-       movq    0(%r8), %rax
-       movq    %rax, 0(%r9)
-       subq    $8, %rcx
-       jnz     1b
+       pushq   %rsi
+       leaq    (_bss-8)(%rip), %rsi
+       leaq    (_bss-8)(%rbx), %rdi
+       movq    $_bss /* - $startup_32 */, %rcx
+       shrq    $3, %rcx
+       std
+       rep     movsq
+       cld
+       popq    %rsi
  
  /*
   * Jump to the relocated address.
@@ -260,37 +266,28 @@ ENTRY(startup_64)
         leaq    relocated(%rbx), %rax
         jmp     *%rax
  
-.section ".text"
+       .text
  relocated:
  
  /*
- * Clear BSS
+ * Clear BSS (stack is currently empty)
   */
-       xorq    %rax, %rax
-       leaq    _edata(%rbx), %rdi
-       leaq    _end_before_pgt(%rbx), %rcx
+       xorl    %eax, %eax
+       leaq    _bss(%rip), %rdi
+       leaq    _ebss(%rip), %rcx
         subq    %rdi, %rcx
-       cld
-       rep
-       stosb
-
-       /* Setup the stack */
-       leaq    boot_stack_end(%rip), %rsp
-
-       /* zero EFLAGS after setting rsp */
-       pushq   $0
-       popfq
+       shrq    $3, %rcx
+       rep     stosq
  
  /*
   * Do the decompression, and jump to the new kernel..
   */
-       pushq   %rsi                    # Save the real mode argument
-       movq    %rsi, %rdi              # real mode address
-       leaq    boot_heap(%rip), %rsi   # malloc area for uncompression
-       leaq    input_data(%rip), %rdx  # input_data
-       movl    input_len(%rip), %eax
-       movq    %rax, %rcx              # input_len
-       movq    %rbp, %r8               # output
+       pushq   %rsi                    /* Save the real mode argument */
+       movq    %rsi, %rdi              /* real mode address */
+       leaq    boot_heap(%rip), %rsi   /* malloc area for uncompression */
+       leaq    input_data(%rip), %rdx  /* input_data */
+       movl    $z_input_len, %ecx      /* input_len */
+       movq    %rbp, %r8               /* output target address */
         call    decompress_kernel
         popq    %rsi
  
@@ -311,11 +308,21 @@ gdt:
         .quad   0x0000000000000000      /* TS continued */
  gdt_end:
  
-.bss
-/* Stack and heap for uncompression */
-.balign 4
+/*
+ * Stack and heap for uncompression
+ */
+       .bss
+       .balign 4
  boot_heap:
         .fill BOOT_HEAP_SIZE, 1, 0
  boot_stack:
         .fill BOOT_STACK_SIZE, 1, 0
  boot_stack_end:
+
+/*
+ * Space for page tables (not in .bss so not zeroed)
+ */
+       .section ".pgtable","a",@nobits
+       .balign 4096
+pgtable:
+       .fill 6*4096, 1, 0
diff --git a/arch/x86/boot/compressed/misc.c b/arch/x86/boot/compressed/misc.c

index e45be73684ffe3bd5559029eb86cf66ec00ff250..842b2a36174a2a2b6347851d7f1069878fb5cb4b 100644 (file)
--- a/arch/x86/boot/compressed/misc.c
+++ b/arch/x86/boot/compressed/misc.c
@@ -325,20 +325,18 @@ asmlinkage void decompress_kernel(void *rmode, memptr heap,
         free_mem_ptr     = heap;        /* Heap */
         free_mem_end_ptr = heap + BOOT_HEAP_SIZE;
  
+       if ((unsigned long)output & (MIN_KERNEL_ALIGN - 1))
+               error("Destination address inappropriately aligned");
  #ifdef CONFIG_X86_64
-       if ((unsigned long)output & (__KERNEL_ALIGN - 1))
-               error("Destination address not 2M aligned");
-       if ((unsigned long)output >= 0xffffffffffUL)
+       if (heap > 0x3fffffffffffUL)
                 error("Destination address too large");
  #else
-       if ((u32)output & (CONFIG_PHYSICAL_ALIGN - 1))
-               error("Destination address not CONFIG_PHYSICAL_ALIGN aligned");
         if (heap > ((-__PAGE_OFFSET-(512<<20)-1) & 0x7fffffff))
                 error("Destination address too large");
+#endif
  #ifndef CONFIG_RELOCATABLE
-       if ((u32)output != LOAD_PHYSICAL_ADDR)
+       if ((unsigned long)output != LOAD_PHYSICAL_ADDR)
                 error("Wrong destination address");
-#endif
  #endif
  
         if (!quiet)
diff --git a/arch/x86/boot/compressed/mkpiggy.c b/arch/x86/boot/compressed/mkpiggy.c

new file mode 100644 (file)

index 0000000..bcbd36c
--- /dev/null
+++ b/arch/x86/boot/compressed/mkpiggy.c
@@ -0,0 +1,97 @@
+/* ----------------------------------------------------------------------- *
+ *
+ *  Copyright (C) 2009 Intel Corporation. All rights reserved.
+ *
+ *  This program is free software; you can redistribute it and/or
+ *  modify it under the terms of the GNU General Public License version
+ *  2 as published by the Free Software Foundation.
+ *
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with this program; if not, write to the Free Software
+ *  Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
+ *  02110-1301, USA.
+ *
+ *  H. Peter Anvin <hpa@linux.intel.com>
+ *
+ * ----------------------------------------------------------------------- */
+
+/*
+ * Compute the desired load offset from a compressed program; outputs
+ * a small assembly wrapper with the appropriate symbols defined.
+ */
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+#include <inttypes.h>
+
+static uint32_t getle32(const void *p)
+{
+       const uint8_t *cp = p;
+
+       return (uint32_t)cp[0] + ((uint32_t)cp[1] << 8) +
+               ((uint32_t)cp[2] << 16) + ((uint32_t)cp[3] << 24);
+}
+
+int main(int argc, char *argv[])
+{
+       uint32_t olen;
+       long ilen;
+       unsigned long offs;
+       FILE *f;
+
+       if (argc < 2) {
+               fprintf(stderr, "Usage: %s compressed_file\n", argv[0]);
+               return 1;
+       }
+
+       /* Get the information for the compressed kernel image first */
+
+       f = fopen(argv[1], "r");
+       if (!f) {
+               perror(argv[1]);
+               return 1;
+       }
+
+
+       if (fseek(f, -4L, SEEK_END)) {
+               perror(argv[1]);
+       }
+       fread(&olen, sizeof olen, 1, f);
+       ilen = ftell(f);
+       olen = getle32(&olen);
+       fclose(f);
+
+       /*
+        * Now we have the input (compressed) and output (uncompressed)
+        * sizes, compute the necessary decompression offset...
+        */
+
+       offs = (olen > ilen) ? olen - ilen : 0;
+       offs += olen >> 12;     /* Add 8 bytes for each 32K block */
+       offs += 32*1024 + 18;   /* Add 32K + 18 bytes slack */
+       offs = (offs+4095) & ~4095; /* Round to a 4K boundary */
+
+       printf(".section \".rodata.compressed\",\"a\",@progbits\n");
+       printf(".globl z_input_len\n");
+       printf("z_input_len = %lu\n", ilen);
+       printf(".globl z_output_len\n");
+       printf("z_output_len = %lu\n", (unsigned long)olen);
+       printf(".globl z_extract_offset\n");
+       printf("z_extract_offset = 0x%lx\n", offs);
+       /* z_extract_offset_negative allows simplification of head_32.S */
+       printf(".globl z_extract_offset_negative\n");
+       printf("z_extract_offset_negative = -0x%lx\n", offs);
+
+       printf(".globl input_data, input_data_end\n");
+       printf("input_data:\n");
+       printf(".incbin \"%s\"\n", argv[1]);
+       printf("input_data_end:\n");
+
+       return 0;
+}
diff --git a/arch/x86/boot/compressed/vmlinux_64.lds b/arch/x86/boot/compressed/vmlinux.lds.S

similarity index 57%

rename from arch/x86/boot/compressed/vmlinux_64.lds

rename to arch/x86/boot/compressed/vmlinux.lds.S

index bef1ac891bce9987d2693cdc1ff601457af0d6c4..cc353e1b3ffd49185f2cf0683e21894462f0c1de 100644 (file)
--- a/arch/x86/boot/compressed/vmlinux_64.lds
+++ b/arch/x86/boot/compressed/vmlinux.lds.S
@@ -1,6 +1,17 @@
-OUTPUT_FORMAT("elf64-x86-64", "elf64-x86-64", "elf64-x86-64")
+OUTPUT_FORMAT(CONFIG_OUTPUT_FORMAT, CONFIG_OUTPUT_FORMAT, CONFIG_OUTPUT_FORMAT)
+
+#undef i386
+
+#include <asm/page_types.h>
+
+#ifdef CONFIG_X86_64
  OUTPUT_ARCH(i386:x86-64)
  ENTRY(startup_64)
+#else
+OUTPUT_ARCH(i386)
+ENTRY(startup_32)
+#endif
+
  SECTIONS
  {
         /* Be careful parts of head_64.S assume startup_32 is at
@@ -33,16 +44,22 @@ SECTIONS
                 *(.data.*)
                 _edata = . ;
         }
+       . = ALIGN(CONFIG_X86_L1_CACHE_BYTES);
         .bss : {
                 _bss = . ;
                 *(.bss)
                 *(.bss.*)
                 *(COMMON)
-               . = ALIGN(8);
-               _end_before_pgt = . ;
-               . = ALIGN(4096);
-               pgtable = . ;
-               . = . + 4096 * 6;
+               . = ALIGN(8);   /* For convenience during zeroing */
                 _ebss = .;
         }
+#ifdef CONFIG_X86_64
+       . = ALIGN(PAGE_SIZE);
+       .pgtable : {
+               _pgtable = . ;
+               *(.pgtable)
+               _epgtable = . ;
+       }
+#endif
+       _end = .;
  }
diff --git a/arch/x86/boot/compressed/vmlinux.scr b/arch/x86/boot/compressed/vmlinux.scr

deleted file mode 100644 (file)

index f02382a..0000000
--- a/arch/x86/boot/compressed/vmlinux.scr
+++ /dev/null
@@ -1,10 +0,0 @@
-SECTIONS
-{
-  .rodata.compressed : {
-       input_len = .;
-       LONG(input_data_end - input_data) input_data = .;
-       *(.data)
-       output_len = . - 4;
-       input_data_end = .;
-       }
-}
diff --git a/arch/x86/boot/compressed/vmlinux_32.lds b/arch/x86/boot/compressed/vmlinux_32.lds

deleted file mode 100644 (file)

index bb3c483..0000000
--- a/arch/x86/boot/compressed/vmlinux_32.lds
+++ /dev/null
@@ -1,43 +0,0 @@
-OUTPUT_FORMAT("elf32-i386", "elf32-i386", "elf32-i386")
-OUTPUT_ARCH(i386)
-ENTRY(startup_32)
-SECTIONS
-{
-       /* Be careful parts of head_32.S assume startup_32 is at
-        * address 0.
-        */
-       . = 0;
-       .text.head : {
-               _head = . ;
-               *(.text.head)
-               _ehead = . ;
-       }
-       .rodata.compressed : {
-               *(.rodata.compressed)
-       }
-       .text : {
-               _text = .;      /* Text */
-               *(.text)
-               *(.text.*)
-               _etext = . ;
-       }
-       .rodata : {
-               _rodata = . ;
-               *(.rodata)       /* read-only data */
-               *(.rodata.*)
-               _erodata = . ;
-       }
-       .data : {
-               _data = . ;
-               *(.data)
-               *(.data.*)
-               _edata = . ;
-       }
-       .bss : {
-               _bss = . ;
-               *(.bss)
-               *(.bss.*)
-               *(COMMON)
-               _end = . ;
-       }
-}
diff --git a/arch/x86/boot/edd.c b/arch/x86/boot/edd.c

index 1aae8f3e5ca1912b6eb60ad75e3c120af1110bed..c501a5b466f8495c26d3eff4de1cddb97613fca4 100644 (file)
--- a/arch/x86/boot/edd.c
+++ b/arch/x86/boot/edd.c
@@ -2,6 +2,7 @@
   *
   *   Copyright (C) 1991, 1992 Linus Torvalds
   *   Copyright 2007 rPath, Inc. - All Rights Reserved
+ *   Copyright 2009 Intel Corporation; author H. Peter Anvin
   *
   *   This file is part of the Linux kernel, and is made available under
   *   the terms of the GNU General Public License version 2.
@@ -22,17 +23,17 @@
   */
  static int read_mbr(u8 devno, void *buf)
  {
-       u16 ax, bx, cx, dx;
+       struct biosregs ireg, oreg;
  
-       ax = 0x0201;            /* Legacy Read, one sector */
-       cx = 0x0001;            /* Sector 0-0-1 */
-       dx = devno;
-       bx = (size_t)buf;
-       asm volatile("pushfl; stc; int $0x13; setc %%al; popfl"
-                    : "+a" (ax), "+c" (cx), "+d" (dx), "+b" (bx)
-                    : : "esi", "edi", "memory");
+       initregs(&ireg);
+       ireg.ax = 0x0201;               /* Legacy Read, one sector */
+       ireg.cx = 0x0001;               /* Sector 0-0-1 */
+       ireg.dl = devno;
+       ireg.bx = (size_t)buf;
  
-       return -(u8)ax;         /* 0 or -1 */
+       intcall(0x13, &ireg, &oreg);
+
+       return -(oreg.eflags & X86_EFLAGS_CF); /* 0 or -1 */
  }
  
  static u32 read_mbr_sig(u8 devno, struct edd_info *ei, u32 *mbrsig)
@@ -72,56 +73,46 @@ static u32 read_mbr_sig(u8 devno, struct edd_info *ei, u32 *mbrsig)
  
  static int get_edd_info(u8 devno, struct edd_info *ei)
  {
-       u16 ax, bx, cx, dx, di;
+       struct biosregs ireg, oreg;
  
         memset(ei, 0, sizeof *ei);
  
         /* Check Extensions Present */
  
-       ax = 0x4100;
-       bx = EDDMAGIC1;
-       dx = devno;
-       asm("pushfl; stc; int $0x13; setc %%al; popfl"
-           : "+a" (ax), "+b" (bx), "=c" (cx), "+d" (dx)
-           : : "esi", "edi");
+       initregs(&ireg);
+       ireg.ah = 0x41;
+       ireg.bx = EDDMAGIC1;
+       ireg.dl = devno;
+       intcall(0x13, &ireg, &oreg);
  
-       if ((u8)ax)
+       if (oreg.eflags & X86_EFLAGS_CF)
                 return -1;      /* No extended information */
  
-       if (bx != EDDMAGIC2)
+       if (oreg.bx != EDDMAGIC2)
                 return -1;
  
         ei->device  = devno;
-       ei->version = ax >> 8;  /* EDD version number */
-       ei->interface_support = cx; /* EDD functionality subsets */
+       ei->version = oreg.ah;           /* EDD version number */
+       ei->interface_support = oreg.cx; /* EDD functionality subsets */
  
         /* Extended Get Device Parameters */
  
         ei->params.length = sizeof(ei->params);
-       ax = 0x4800;
-       dx = devno;
-       asm("pushfl; int $0x13; popfl"
-           : "+a" (ax), "+d" (dx), "=m" (ei->params)
-           : "S" (&ei->params)
-           : "ebx", "ecx", "edi");
+       ireg.ah = 0x48;
+       ireg.si = (size_t)&ei->params;
+       intcall(0x13, &ireg, &oreg);
  
         /* Get legacy CHS parameters */
  
         /* Ralf Brown recommends setting ES:DI to 0:0 */
-       ax = 0x0800;
-       dx = devno;
-       di = 0;
-       asm("pushw %%es; "
-           "movw %%di,%%es; "
-           "pushfl; stc; int $0x13; setc %%al; popfl; "
-           "popw %%es"
-           : "+a" (ax), "=b" (bx), "=c" (cx), "+d" (dx), "+D" (di)
-           : : "esi");
-
-       if ((u8)ax == 0) {
-               ei->legacy_max_cylinder = (cx >> 8) + ((cx & 0xc0) << 2);
-               ei->legacy_max_head = dx >> 8;
-               ei->legacy_sectors_per_track = cx & 0x3f;
+       ireg.ah = 0x08;
+       ireg.es = 0;
+       intcall(0x13, &ireg, &oreg);
+
+       if (!(oreg.eflags & X86_EFLAGS_CF)) {
+               ei->legacy_max_cylinder = oreg.ch + ((oreg.cl & 0xc0) << 2);
+               ei->legacy_max_head = oreg.dh;
+               ei->legacy_sectors_per_track = oreg.cl & 0x3f;
         }
  
         return 0;
diff --git a/arch/x86/boot/header.S b/arch/x86/boot/header.S

index 5d84d1c74e4c6d2a84666c7b7125641b00a1dcad..b31cc54b46410f89b4120702b1d1b018a79e204d 100644 (file)
--- a/arch/x86/boot/header.S
+++ b/arch/x86/boot/header.S
@@ -22,7 +22,8 @@
  #include <asm/page_types.h>
  #include <asm/setup.h>
  #include "boot.h"
-#include "offsets.h"
+#include "voffset.h"
+#include "zoffset.h"
  
  BOOTSEG                = 0x07C0                /* original address of boot-sector */
  SYSSEG         = 0x1000                /* historical load address >> 4 */
@@ -115,7 +116,7 @@ _start:
         # Part 2 of the header, from the old setup.S
  
                 .ascii  "HdrS"          # header signature
-               .word   0x0209          # header version number (>= 0x0105)
+               .word   0x020a          # header version number (>= 0x0105)
                                         # or else old loadlin-1.5 will fail)
                 .globl realmode_swtch
  realmode_swtch:        .word   0, 0            # default_switch, SETUPSEG
@@ -168,7 +169,11 @@ heap_end_ptr:      .word   _end+STACK_SIZE-512
                                         # end of setup code can be used by setup
                                         # for local heap purposes.
  
-pad1:          .word   0
+ext_loader_ver:
+               .byte   0               # Extended boot loader version
+ext_loader_type:
+               .byte   0               # Extended boot loader type
+
  cmd_line_ptr:  .long   0               # (Header version 0x0202 or later)
                                         # If nonzero, a 32-bit pointer
                                         # to the kernel command line.
@@ -200,7 +205,7 @@ relocatable_kernel:    .byte 1
  #else
  relocatable_kernel:    .byte 0
  #endif
-pad2:                  .byte 0
+min_alignment:         .byte MIN_KERNEL_ALIGN_LG2      # minimum alignment
  pad3:                  .word 0
  
  cmdline_size:   .long   COMMAND_LINE_SIZE-1     #length of the command line,
@@ -212,16 +217,27 @@ hardware_subarch: .long 0                 # subarchitecture, added with 2.07
  
  hardware_subarch_data: .quad 0
  
-payload_offset:                .long input_data
-payload_length:                .long input_data_end-input_data
+payload_offset:                .long ZO_input_data
+payload_length:                .long ZO_z_input_len
  
  setup_data:            .quad 0                 # 64-bit physical pointer to
                                                 # single linked list of
                                                 # struct setup_data
  
+pref_address:          .quad LOAD_PHYSICAL_ADDR        # preferred load addr
+
+#define ZO_INIT_SIZE   (ZO__end - ZO_startup_32 + ZO_z_extract_offset)
+#define VO_INIT_SIZE   (VO__end - VO__text)
+#if ZO_INIT_SIZE > VO_INIT_SIZE
+#define INIT_SIZE ZO_INIT_SIZE
+#else
+#define INIT_SIZE VO_INIT_SIZE
+#endif
+init_size:             .long INIT_SIZE         # kernel initialization size
+
  # End of setup header #####################################################
  
-       .section ".inittext", "ax"
+       .section ".entrytext", "ax"
  start_of_setup:
  #ifdef SAFE_RESET_DISK_CONTROLLER
  # Reset the disk controller.
diff --git a/arch/x86/boot/main.c b/arch/x86/boot/main.c

index 58f0415d3ae09261d55c11d12855e3bc75fc4bdf..140172b895bd32c32ca2819ef776718f77df3103 100644 (file)
--- a/arch/x86/boot/main.c
+++ b/arch/x86/boot/main.c
@@ -2,6 +2,7 @@
   *
   *   Copyright (C) 1991, 1992 Linus Torvalds
   *   Copyright 2007 rPath, Inc. - All Rights Reserved
+ *   Copyright 2009 Intel Corporation; author H. Peter Anvin
   *
   *   This file is part of the Linux kernel, and is made available under
   *   the terms of the GNU General Public License version 2.
@@ -61,11 +62,10 @@ static void copy_boot_params(void)
   */
  static void keyboard_set_repeat(void)
  {
-       u16 ax = 0x0305;
-       u16 bx = 0;
-       asm volatile("int $0x16"
-                    : "+a" (ax), "+b" (bx)
-                    : : "ecx", "edx", "esi", "edi");
+       struct biosregs ireg;
+       initregs(&ireg);
+       ireg.ax = 0x0305;
+       intcall(0x16, &ireg, NULL);
  }
  
  /*
@@ -73,18 +73,22 @@ static void keyboard_set_repeat(void)
   */
  static void query_ist(void)
  {
+       struct biosregs ireg, oreg;
+
         /* Some older BIOSes apparently crash on this call, so filter
            it from machines too old to have SpeedStep at all. */
         if (cpu.level < 6)
                 return;
  
-       asm("int $0x15"
-           : "=a" (boot_params.ist_info.signature),
-             "=b" (boot_params.ist_info.command),
-             "=c" (boot_params.ist_info.event),
-             "=d" (boot_params.ist_info.perf_level)
-           : "a" (0x0000e980),  /* IST Support */
-             "d" (0x47534943)); /* Request value */
+       initregs(&ireg);
+       ireg.ax  = 0xe980;       /* IST Support */
+       ireg.edx = 0x47534943;   /* Request value */
+       intcall(0x15, &ireg, &oreg);
+
+       boot_params.ist_info.signature  = oreg.eax;
+       boot_params.ist_info.command    = oreg.ebx;
+       boot_params.ist_info.event      = oreg.ecx;
+       boot_params.ist_info.perf_level = oreg.edx;
  }
  
  /*
@@ -93,13 +97,12 @@ static void query_ist(void)
  static void set_bios_mode(void)
  {
  #ifdef CONFIG_X86_64
-       u32 eax, ebx;
+       struct biosregs ireg;
  
-       eax = 0xec00;
-       ebx = 2;
-       asm volatile("int $0x15"
-                    : "+a" (eax), "+b" (ebx)
-                    : : "ecx", "edx", "esi", "edi");
+       initregs(&ireg);
+       ireg.ax = 0xec00;
+       ireg.bx = 2;
+       intcall(0x15, &ireg, NULL);
  #endif
  }
  
diff --git a/arch/x86/boot/mca.c b/arch/x86/boot/mca.c

index 911eaae5d696427c98fdad7f344a14bdc0340efc..a95a531148ef9562ca408a9f451914748159b71e 100644 (file)
--- a/arch/x86/boot/mca.c
+++ b/arch/x86/boot/mca.c
@@ -2,6 +2,7 @@
   *
   *   Copyright (C) 1991, 1992 Linus Torvalds
   *   Copyright 2007 rPath, Inc. - All Rights Reserved
+ *   Copyright 2009 Intel Corporation; author H. Peter Anvin
   *
   *   This file is part of the Linux kernel, and is made available under
   *   the terms of the GNU General Public License version 2.
@@ -16,26 +17,22 @@
  
  int query_mca(void)
  {
-       u8 err;
-       u16 es, bx, len;
-
-       asm("pushw %%es ; "
-           "int $0x15 ; "
-           "setc %0 ; "
-           "movw %%es, %1 ; "
-           "popw %%es"
-           : "=acd" (err), "=acdSD" (es), "=b" (bx)
-           : "a" (0xc000));
-
-       if (err)
+       struct biosregs ireg, oreg;
+       u16 len;
+
+       initregs(&ireg);
+       ireg.ah = 0xc0;
+       intcall(0x15, &ireg, &oreg);
+
+       if (oreg.eflags & X86_EFLAGS_CF)
                 return -1;      /* No MCA present */
  
-       set_fs(es);
-       len = rdfs16(bx);
+       set_fs(oreg.es);
+       len = rdfs16(oreg.bx);
  
         if (len > sizeof(boot_params.sys_desc_table))
                 len = sizeof(boot_params.sys_desc_table);
  
-       copy_from_fs(&boot_params.sys_desc_table, bx, len);
+       copy_from_fs(&boot_params.sys_desc_table, oreg.bx, len);
         return 0;
  }
diff --git a/arch/x86/boot/memory.c b/arch/x86/boot/memory.c

index 74b3d2ba84e90a39bb29f166278ce47bcf56437c..cae3feb1035e3da73826b518de4d47c15ad4e58d 100644 (file)
--- a/arch/x86/boot/memory.c
+++ b/arch/x86/boot/memory.c
@@ -20,12 +20,16 @@
  static int detect_memory_e820(void)
  {
         int count = 0;
-       u32 next = 0;
-       u32 size, id, edi;
-       u8 err;
+       struct biosregs ireg, oreg;
         struct e820entry *desc = boot_params.e820_map;
         static struct e820entry buf; /* static so it is zeroed */
  
+       initregs(&ireg);
+       ireg.ax  = 0xe820;
+       ireg.cx  = sizeof buf;
+       ireg.edx = SMAP;
+       ireg.di  = (size_t)&buf;
+
         /*
          * Note: at least one BIOS is known which assumes that the
          * buffer pointed to by one e820 call is the same one as
@@ -41,22 +45,13 @@ static int detect_memory_e820(void)
          */
  
         do {
-               size = sizeof buf;
-
-               /* Important: %edx and %esi are clobbered by some BIOSes,
-                  so they must be either used for the error output
-                  or explicitly marked clobbered.  Given that, assume there
-                  is something out there clobbering %ebp and %edi, too. */
-               asm("pushl %%ebp; int $0x15; popl %%ebp; setc %0"
-                   : "=d" (err), "+b" (next), "=a" (id), "+c" (size),
-                     "=D" (edi), "+m" (buf)
-                   : "D" (&buf), "d" (SMAP), "a" (0xe820)
-                   : "esi");
+               intcall(0x15, &ireg, &oreg);
+               ireg.ebx = oreg.ebx; /* for next iteration... */
  
                 /* BIOSes which terminate the chain with CF = 1 as opposed
                    to %ebx = 0 don't always report the SMAP signature on
                    the final, failing, probe. */
-               if (err)
+               if (oreg.eflags & X86_EFLAGS_CF)
                         break;
  
                 /* Some BIOSes stop returning SMAP in the middle of
@@ -64,60 +59,64 @@ static int detect_memory_e820(void)
                    screwed up the map at that point, we might have a
                    partial map, the full map, or complete garbage, so
                    just return failure. */
-               if (id != SMAP) {
+               if (oreg.eax != SMAP) {
                         count = 0;
                         break;
                 }
  
                 *desc++ = buf;
                 count++;
-       } while (next && count < ARRAY_SIZE(boot_params.e820_map));
+       } while (ireg.ebx && count < ARRAY_SIZE(boot_params.e820_map));
  
         return boot_params.e820_entries = count;
  }
  
  static int detect_memory_e801(void)
  {
-       u16 ax, bx, cx, dx;
-       u8 err;
+       struct biosregs ireg, oreg;
  
-       bx = cx = dx = 0;
-       ax = 0xe801;
-       asm("stc; int $0x15; setc %0"
-           : "=m" (err), "+a" (ax), "+b" (bx), "+c" (cx), "+d" (dx));
+       initregs(&ireg);
+       ireg.ax = 0xe801;
+       intcall(0x15, &ireg, &oreg);
  
-       if (err)
+       if (oreg.eflags & X86_EFLAGS_CF)
                 return -1;
  
         /* Do we really need to do this? */
-       if (cx || dx) {
-               ax = cx;
-               bx = dx;
+       if (oreg.cx || oreg.dx) {
+               oreg.ax = oreg.cx;
+               oreg.bx = oreg.dx;
         }
  
-       if (ax > 15*1024)
+       if (oreg.ax > 15*1024) {
                 return -1;      /* Bogus! */
-
-       /* This ignores memory above 16MB if we have a memory hole
-          there.  If someone actually finds a machine with a memory
-          hole at 16MB and no support for 0E820h they should probably
-          generate a fake e820 map. */
-       boot_params.alt_mem_k = (ax == 15*1024) ? (dx << 6)+ax : ax;
+       } else if (oreg.ax == 15*1024) {
+               boot_params.alt_mem_k = (oreg.dx << 6) + oreg.ax;
+       } else {
+               /*
+                * This ignores memory above 16MB if we have a memory
+                * hole there.  If someone actually finds a machine
+                * with a memory hole at 16MB and no support for
+                * 0E820h they should probably generate a fake e820
+                * map.
+                */
+               boot_params.alt_mem_k = oreg.ax;
+       }
  
         return 0;
  }
  
  static int detect_memory_88(void)
  {
-       u16 ax;
-       u8 err;
+       struct biosregs ireg, oreg;
  
-       ax = 0x8800;
-       asm("stc; int $0x15; setc %0" : "=bcdm" (err), "+a" (ax));
+       initregs(&ireg);
+       ireg.ah = 0x88;
+       intcall(0x15, &ireg, &oreg);
  
-       boot_params.screen_info.ext_mem_k = ax;
+       boot_params.screen_info.ext_mem_k = oreg.ax;
  
-       return -err;
+       return -(oreg.eflags & X86_EFLAGS_CF); /* 0 or -1 */
  }
  
  int detect_memory(void)
diff --git a/arch/x86/boot/regs.c b/arch/x86/boot/regs.c

new file mode 100644 (file)

index 0000000..958019b
--- /dev/null
+++ b/arch/x86/boot/regs.c
@@ -0,0 +1,29 @@
+/* -----------------------------------------------------------------------
+ *
+ *   Copyright 2009 Intel Corporation; author H. Peter Anvin
+ *
+ *   This file is part of the Linux kernel, and is made available under
+ *   the terms of the GNU General Public License version 2 or (at your
+ *   option) any later version; incorporated herein by reference.
+ *
+ * ----------------------------------------------------------------------- */
+
+/*
+ * Simple helper function for initializing a register set.
+ *
+ * Note that this sets EFLAGS_CF in the input register set; this
+ * makes it easier to catch functions which do nothing but don't
+ * explicitly set CF.
+ */
+
+#include "boot.h"
+
+void initregs(struct biosregs *reg)
+{
+       memset(reg, 0, sizeof *reg);
+       reg->eflags |= X86_EFLAGS_CF;
+       reg->ds = ds();
+       reg->es = ds();
+       reg->fs = fs();
+       reg->gs = gs();
+}
diff --git a/arch/x86/boot/setup.ld b/arch/x86/boot/setup.ld

index bb8dc2de796936c6a21d0b1bfbd934ca8a3e5877..0f6ec455a2b13a58151861b2e8d307776d866742 100644 (file)
--- a/arch/x86/boot/setup.ld
+++ b/arch/x86/boot/setup.ld
@@ -15,8 +15,11 @@ SECTIONS
  
         . = 497;
         .header         : { *(.header) }
+       .entrytext      : { *(.entrytext) }
         .inittext       : { *(.inittext) }
         .initdata       : { *(.initdata) }
+       __end_init = .;
+
         .text           : { *(.text) }
         .text32         : { *(.text32) }
  
@@ -52,4 +55,7 @@ SECTIONS
  
         . = ASSERT(_end <= 0x8000, "Setup too big!");
         . = ASSERT(hdr == 0x1f1, "The setup header has the wrong offset!");
+       /* Necessary for the very-old-loader check to work... */
+       . = ASSERT(__end_init <= 5*512, "init sections too big!");
+
  }
diff --git a/arch/x86/boot/tty.c b/arch/x86/boot/tty.c

index 7e8e8b25f5f6c89defd0df4064a61e3e16a4ea06..01ec69c901c7a595e88748f24880a5104af553fe 100644 (file)
--- a/arch/x86/boot/tty.c
+++ b/arch/x86/boot/tty.c
@@ -2,6 +2,7 @@
   *
   *   Copyright (C) 1991, 1992 Linus Torvalds
   *   Copyright 2007 rPath, Inc. - All Rights Reserved
+ *   Copyright 2009 Intel Corporation; author H. Peter Anvin
   *
   *   This file is part of the Linux kernel, and is made available under
   *   the terms of the GNU General Public License version 2.
@@ -22,24 +23,23 @@
  
  void __attribute__((section(".inittext"))) putchar(int ch)
  {
-       unsigned char c = ch;
+       struct biosregs ireg;
  
-       if (c == '\n')
+       if (ch == '\n')
                 putchar('\r');  /* \n -> \r\n */
  
-       /* int $0x10 is known to have bugs involving touching registers
-          it shouldn't.  Be extra conservative... */
-       asm volatile("pushal; pushw %%ds; int $0x10; popw %%ds; popal"
-                    : : "b" (0x0007), "c" (0x0001), "a" (0x0e00|ch));
+       initregs(&ireg);
+       ireg.bx = 0x0007;
+       ireg.cx = 0x0001;
+       ireg.ah = 0x0e;
+       ireg.al = ch;
+       intcall(0x10, &ireg, NULL);
  }
  
  void __attribute__((section(".inittext"))) puts(const char *str)
  {
-       int n = 0;
-       while (*str) {
+       while (*str)
                 putchar(*str++);
-               n++;
-       }
  }
  
  /*
@@ -49,14 +49,13 @@ void __attribute__((section(".inittext"))) puts(const char *str)
  
  static u8 gettime(void)
  {
-       u16 ax = 0x0200;
-       u16 cx, dx;
+       struct biosregs ireg, oreg;
  
-       asm volatile("int $0x1a"
-                    : "+a" (ax), "=c" (cx), "=d" (dx)
-                    : : "ebx", "esi", "edi");
+       initregs(&ireg);
+       ireg.ah = 0x02;
+       intcall(0x1a, &ireg, &oreg);
  
-       return dx >> 8;
+       return oreg.dh;
  }
  
  /*
@@ -64,19 +63,24 @@ static u8 gettime(void)
   */
  int getchar(void)
  {
-       u16 ax = 0;
-       asm volatile("int $0x16" : "+a" (ax));
+       struct biosregs ireg, oreg;
+
+       initregs(&ireg);
+       /* ireg.ah = 0x00; */
+       intcall(0x16, &ireg, &oreg);
  
-       return ax & 0xff;
+       return oreg.al;
  }
  
  static int kbd_pending(void)
  {
-       u8 pending;
-       asm volatile("int $0x16; setnz %0"
-                    : "=qm" (pending)
-                    : "a" (0x0100));
-       return pending;
+       struct biosregs ireg, oreg;
+
+       initregs(&ireg);
+       ireg.ah = 0x01;
+       intcall(0x16, &ireg, &oreg);
+
+       return !(oreg.eflags & X86_EFLAGS_ZF);
  }
  
  void kbd_flush(void)
diff --git a/arch/x86/boot/video-bios.c b/arch/x86/boot/video-bios.c

index 3fa979c9c363a5dbe6f9ce6d754636ed301d01b1..d660be4923634ebfc0160b468a569289c02bce90 100644 (file)
--- a/arch/x86/boot/video-bios.c
+++ b/arch/x86/boot/video-bios.c
@@ -2,6 +2,7 @@
   *
   *   Copyright (C) 1991, 1992 Linus Torvalds
   *   Copyright 2007 rPath, Inc. - All Rights Reserved
+ *   Copyright 2009 Intel Corporation; author H. Peter Anvin
   *
   *   This file is part of the Linux kernel, and is made available under
   *   the terms of the GNU General Public License version 2.
@@ -29,21 +30,21 @@ static int bios_set_mode(struct mode_info *mi)
  
  static int set_bios_mode(u8 mode)
  {
-       u16 ax;
+       struct biosregs ireg, oreg;
         u8 new_mode;
  
-       ax = mode;              /* AH=0x00 Set Video Mode */
-       asm volatile(INT10
-                    : "+a" (ax)
-                    : : "ebx", "ecx", "edx", "esi", "edi");
+       initregs(&ireg);
+       ireg.al = mode;         /* AH=0x00 Set Video Mode */
+       intcall(0x10, &ireg, NULL);
  
-       ax = 0x0f00;            /* Get Current Video Mode */
-       asm volatile(INT10
-                    : "+a" (ax)
-                    : : "ebx", "ecx", "edx", "esi", "edi");
+
+       ireg.ah = 0x0f;         /* Get Current Video Mode */
+       intcall(0x10, &ireg, &oreg);
  
         do_restore = 1;         /* Assume video contents were lost */
-       new_mode = ax & 0x7f;   /* Not all BIOSes are clean with the top bit */
+
+       /* Not all BIOSes are clean with the top bit */
+       new_mode = ireg.al & 0x7f;
  
         if (new_mode == mode)
                 return 0;       /* Mode change OK */
@@ -53,10 +54,8 @@ static int set_bios_mode(u8 mode)
                 /* Mode setting failed, but we didn't end up where we
                    started.  That's bad.  Try to revert to the original
                    video mode. */
-               ax = boot_params.screen_info.orig_video_mode;
-               asm volatile(INT10
-                            : "+a" (ax)
-                            : : "ebx", "ecx", "edx", "esi", "edi");
+               ireg.ax = boot_params.screen_info.orig_video_mode;
+               intcall(0x10, &ireg, NULL);
         }
  #endif
         return -1;
diff --git a/arch/x86/boot/video-vesa.c b/arch/x86/boot/video-vesa.c

index 4a58c8ce3f6960534931409905103aa7465aa9d6..c700147d6ffb24b72fbbf43ff954d4cfd268f137 100644 (file)
--- a/arch/x86/boot/video-vesa.c
+++ b/arch/x86/boot/video-vesa.c
@@ -2,6 +2,7 @@
   *
   *   Copyright (C) 1991, 1992 Linus Torvalds
   *   Copyright 2007 rPath, Inc. - All Rights Reserved
+ *   Copyright 2009 Intel Corporation; author H. Peter Anvin
   *
   *   This file is part of the Linux kernel, and is made available under
   *   the terms of the GNU General Public License version 2.
@@ -31,7 +32,7 @@ static inline void vesa_store_mode_params_graphics(void) {}
  static int vesa_probe(void)
  {
  #if defined(CONFIG_VIDEO_VESA) || defined(CONFIG_FIRMWARE_EDID)
-       u16 ax, cx, di;
+       struct biosregs ireg, oreg;
         u16 mode;
         addr_t mode_ptr;
         struct mode_info *mi;
@@ -39,13 +40,12 @@ static int vesa_probe(void)
  
         video_vesa.modes = GET_HEAP(struct mode_info, 0);
  
-       ax = 0x4f00;
-       di = (size_t)&vginfo;
-       asm(INT10
-           : "+a" (ax), "+D" (di), "=m" (vginfo)
-           : : "ebx", "ecx", "edx", "esi");
+       initregs(&ireg);
+       ireg.ax = 0x4f00;
+       ireg.di = (size_t)&vginfo;
+       intcall(0x10, &ireg, &oreg);
  
-       if (ax != 0x004f ||
+       if (ireg.ax != 0x004f ||
             vginfo.signature != VESA_MAGIC ||
             vginfo.version < 0x0102)
                 return 0;       /* Not present */
@@ -65,14 +65,12 @@ static int vesa_probe(void)
  
                 memset(&vminfo, 0, sizeof vminfo); /* Just in case... */
  
-               ax = 0x4f01;
-               cx = mode;
-               di = (size_t)&vminfo;
-               asm(INT10
-                   : "+a" (ax), "+c" (cx), "+D" (di), "=m" (vminfo)
-                   : : "ebx", "edx", "esi");
+               ireg.ax = 0x4f01;
+               ireg.cx = mode;
+               ireg.di = (size_t)&vminfo;
+               intcall(0x10, &ireg, &oreg);
  
-               if (ax != 0x004f)
+               if (ireg.ax != 0x004f)
                         continue;
  
                 if ((vminfo.mode_attr & 0x15) == 0x05) {
@@ -111,20 +109,19 @@ static int vesa_probe(void)
  
  static int vesa_set_mode(struct mode_info *mode)
  {
-       u16 ax, bx, cx, di;
+       struct biosregs ireg, oreg;
         int is_graphic;
         u16 vesa_mode = mode->mode - VIDEO_FIRST_VESA;
  
         memset(&vminfo, 0, sizeof vminfo); /* Just in case... */
  
-       ax = 0x4f01;
-       cx = vesa_mode;
-       di = (size_t)&vminfo;
-       asm(INT10
-           : "+a" (ax), "+c" (cx), "+D" (di), "=m" (vminfo)
-           : : "ebx", "edx", "esi");
+       initregs(&ireg);
+       ireg.ax = 0x4f01;
+       ireg.cx = vesa_mode;
+       ireg.di = (size_t)&vminfo;
+       intcall(0x10, &ireg, &oreg);
  
-       if (ax != 0x004f)
+       if (oreg.ax != 0x004f)
                 return -1;
  
         if ((vminfo.mode_attr & 0x15) == 0x05) {
@@ -141,14 +138,12 @@ static int vesa_set_mode(struct mode_info *mode)
         }
  
  
-       ax = 0x4f02;
-       bx = vesa_mode;
-       di = 0;
-       asm volatile(INT10
-                    : "+a" (ax), "+b" (bx), "+D" (di)
-                    : : "ecx", "edx", "esi");
+       initregs(&ireg);
+       ireg.ax = 0x4f02;
+       ireg.bx = vesa_mode;
+       intcall(0x10, &ireg, &oreg);
  
-       if (ax != 0x004f)
+       if (oreg.ax != 0x004f)
                 return -1;
  
         graphic_mode = is_graphic;
@@ -171,50 +166,45 @@ static int vesa_set_mode(struct mode_info *mode)
  /* Switch DAC to 8-bit mode */
  static void vesa_dac_set_8bits(void)
  {
+       struct biosregs ireg, oreg;
         u8 dac_size = 6;
  
         /* If possible, switch the DAC to 8-bit mode */
         if (vginfo.capabilities & 1) {
-               u16 ax, bx;
-
-               ax = 0x4f08;
-               bx = 0x0800;
-               asm volatile(INT10
-                            : "+a" (ax), "+b" (bx)
-                            : : "ecx", "edx", "esi", "edi");
-
-               if (ax == 0x004f)
-                       dac_size = bx >> 8;
+               initregs(&ireg);
+               ireg.ax = 0x4f08;
+               ireg.bh = 0x08;
+               intcall(0x10, &ireg, &oreg);
+               if (oreg.ax == 0x004f)
+                       dac_size = oreg.bh;
         }
  
         /* Set the color sizes to the DAC size, and offsets to 0 */
-       boot_params.screen_info.red_size = dac_size;
+       boot_params.screen_info.red_size   = dac_size;
         boot_params.screen_info.green_size = dac_size;
-       boot_params.screen_info.blue_size = dac_size;
-       boot_params.screen_info.rsvd_size = dac_size;
+       boot_params.screen_info.blue_size  = dac_size;
+       boot_params.screen_info.rsvd_size  = dac_size;
  
-       boot_params.screen_info.red_pos = 0;
-       boot_params.screen_info.green_pos = 0;
-       boot_params.screen_info.blue_pos = 0;
-       boot_params.screen_info.rsvd_pos = 0;
+       boot_params.screen_info.red_pos    = 0;
+       boot_params.screen_info.green_pos  = 0;
+       boot_params.screen_info.blue_pos   = 0;
+       boot_params.screen_info.rsvd_pos   = 0;
  }
  
  /* Save the VESA protected mode info */
  static void vesa_store_pm_info(void)
  {
-       u16 ax, bx, di, es;
+       struct biosregs ireg, oreg;
  
-       ax = 0x4f0a;
-       bx = di = 0;
-       asm("pushw %%es; "INT10"; movw %%es,%0; popw %%es"
-           : "=d" (es), "+a" (ax), "+b" (bx), "+D" (di)
-           : : "ecx", "esi");
+       initregs(&ireg);
+       ireg.ax = 0x4f0a;
+       intcall(0x10, &ireg, &oreg);
  
-       if (ax != 0x004f)
+       if (oreg.ax != 0x004f)
                 return;
  
-       boot_params.screen_info.vesapm_seg = es;
-       boot_params.screen_info.vesapm_off = di;
+       boot_params.screen_info.vesapm_seg = oreg.es;
+       boot_params.screen_info.vesapm_off = oreg.di;
  }
  
  /*
@@ -252,7 +242,7 @@ static void vesa_store_mode_params_graphics(void)
  void vesa_store_edid(void)
  {
  #ifdef CONFIG_FIRMWARE_EDID
-       u16 ax, bx, cx, dx, di;
+       struct biosregs ireg, oreg;
  
         /* Apparently used as a nonsense token... */
         memset(&boot_params.edid_info, 0x13, sizeof boot_params.edid_info);
@@ -260,33 +250,26 @@ void vesa_store_edid(void)
         if (vginfo.version < 0x0200)
                 return;         /* EDID requires VBE 2.0+ */
  
-       ax = 0x4f15;            /* VBE DDC */
-       bx = 0x0000;            /* Report DDC capabilities */
-       cx = 0;                 /* Controller 0 */
-       di = 0;                 /* ES:DI must be 0 by spec */
-
-       /* Note: The VBE DDC spec is different from the main VESA spec;
-          we genuinely have to assume all registers are destroyed here. */
-
-       asm("pushw %%es; movw %2,%%es; "INT10"; popw %%es"
-           : "+a" (ax), "+b" (bx), "+c" (cx), "+D" (di)
-           : : "esi", "edx");
+       initregs(&ireg);
+       ireg.ax = 0x4f15;               /* VBE DDC */
+       /* ireg.bx = 0x0000; */         /* Report DDC capabilities */
+       /* ireg.cx = 0; */              /* Controller 0 */
+       ireg.es = 0;                    /* ES:DI must be 0 by spec */
+       intcall(0x10, &ireg, &oreg);
  
-       if (ax != 0x004f)
+       if (oreg.ax != 0x004f)
                 return;         /* No EDID */
  
         /* BH = time in seconds to transfer EDD information */
         /* BL = DDC level supported */
  
-       ax = 0x4f15;            /* VBE DDC */
-       bx = 0x0001;            /* Read EDID */
-       cx = 0;                 /* Controller 0 */
-       dx = 0;                 /* EDID block number */
-       di =(size_t) &boot_params.edid_info; /* (ES:)Pointer to block */
-       asm(INT10
-           : "+a" (ax), "+b" (bx), "+d" (dx), "=m" (boot_params.edid_info),
-             "+c" (cx), "+D" (di)
-           : : "esi");
+       ireg.ax = 0x4f15;               /* VBE DDC */
+       ireg.bx = 0x0001;               /* Read EDID */
+       /* ireg.cx = 0; */              /* Controller 0 */
+       /* ireg.dx = 0; */              /* EDID block number */
+       ireg.es = ds();
+       ireg.di =(size_t)&boot_params.edid_info; /* (ES:)Pointer to block */
+       intcall(0x10, &ireg, &oreg);
  #endif /* CONFIG_FIRMWARE_EDID */
  }
  
diff --git a/arch/x86/boot/video-vga.c b/arch/x86/boot/video-vga.c

index 9e0587a3776868e73b62b7def725997b6ffb7699..8f8d827e254d0b8cb392bf7fe95103059ac5ca89 100644 (file)
--- a/arch/x86/boot/video-vga.c
+++ b/arch/x86/boot/video-vga.c
@@ -2,6 +2,7 @@
   *
   *   Copyright (C) 1991, 1992 Linus Torvalds
   *   Copyright 2007 rPath, Inc. - All Rights Reserved
+ *   Copyright 2009 Intel Corporation; author H. Peter Anvin
   *
   *   This file is part of the Linux kernel, and is made available under
   *   the terms of the GNU General Public License version 2.
@@ -39,30 +40,30 @@ static __videocard video_vga;
  /* Set basic 80x25 mode */
  static u8 vga_set_basic_mode(void)
  {
+       struct biosregs ireg, oreg;
         u16 ax;
         u8 rows;
         u8 mode;
  
+       initregs(&ireg);
+
  #ifdef CONFIG_VIDEO_400_HACK
         if (adapter >= ADAPTER_VGA) {
-               asm volatile(INT10
-                            : : "a" (0x1202), "b" (0x0030)
-                            : "ecx", "edx", "esi", "edi");
+               ireg.ax = 0x1202;
+               ireg.bx = 0x0030;
+               intcall(0x10, &ireg, NULL);
         }
  #endif
  
         ax = 0x0f00;
-       asm volatile(INT10
-                    : "+a" (ax)
-                    : : "ebx", "ecx", "edx", "esi", "edi");
-
-       mode = (u8)ax;
+       intcall(0x10, &ireg, &oreg);
+       mode = oreg.al;
  
         set_fs(0);
         rows = rdfs8(0x484);    /* rows minus one */
  
  #ifndef CONFIG_VIDEO_400_HACK
-       if ((ax == 0x5003 || ax == 0x5007) &&
+       if ((oreg.ax == 0x5003 || oreg.ax == 0x5007) &&
             (rows == 0 || rows == 24))
                 return mode;
  #endif
@@ -71,10 +72,8 @@ static u8 vga_set_basic_mode(void)
                 mode = 3;
  
         /* Set the mode */
-       ax = mode;
-       asm volatile(INT10
-                    : "+a" (ax)
-                    : : "ebx", "ecx", "edx", "esi", "edi");
+       ireg.ax = mode;         /* AH=0: set mode */
+       intcall(0x10, &ireg, NULL);
         do_restore = 1;
         return mode;
  }
@@ -82,43 +81,69 @@ static u8 vga_set_basic_mode(void)
  static void vga_set_8font(void)
  {
         /* Set 8x8 font - 80x43 on EGA, 80x50 on VGA */
+       struct biosregs ireg;
+
+       initregs(&ireg);
  
         /* Set 8x8 font */
-       asm volatile(INT10 : : "a" (0x1112), "b" (0));
+       ireg.ax = 0x1112;
+       /* ireg.bl = 0; */
+       intcall(0x10, &ireg, NULL);
  
         /* Use alternate print screen */
-       asm volatile(INT10 : : "a" (0x1200), "b" (0x20));
+       ireg.ax = 0x1200;
+       ireg.bl = 0x20;
+       intcall(0x10, &ireg, NULL);
  
         /* Turn off cursor emulation */
-       asm volatile(INT10 : : "a" (0x1201), "b" (0x34));
+       ireg.ax = 0x1201;
+       ireg.bl = 0x34;
+       intcall(0x10, &ireg, NULL);
  
         /* Cursor is scan lines 6-7 */
-       asm volatile(INT10 : : "a" (0x0100), "c" (0x0607));
+       ireg.ax = 0x0100;
+       ireg.cx = 0x0607;
+       intcall(0x10, &ireg, NULL);
  }
  
  static void vga_set_14font(void)
  {
         /* Set 9x14 font - 80x28 on VGA */
+       struct biosregs ireg;
+
+       initregs(&ireg);
  
         /* Set 9x14 font */
-       asm volatile(INT10 : : "a" (0x1111), "b" (0));
+       ireg.ax = 0x1111;
+       /* ireg.bl = 0; */
+       intcall(0x10, &ireg, NULL);
  
         /* Turn off cursor emulation */
-       asm volatile(INT10 : : "a" (0x1201), "b" (0x34));
+       ireg.ax = 0x1201;
+       ireg.bl = 0x34;
+       intcall(0x10, &ireg, NULL);
  
         /* Cursor is scan lines 11-12 */
-       asm volatile(INT10 : : "a" (0x0100), "c" (0x0b0c));
+       ireg.ax = 0x0100;
+       ireg.cx = 0x0b0c;
+       intcall(0x10, &ireg, NULL);
  }
  
  static void vga_set_80x43(void)
  {
         /* Set 80x43 mode on VGA (not EGA) */
+       struct biosregs ireg;
+
+       initregs(&ireg);
  
         /* Set 350 scans */
-       asm volatile(INT10 : : "a" (0x1201), "b" (0x30));
+       ireg.ax = 0x1201;
+       ireg.bl = 0x30;
+       intcall(0x10, &ireg, NULL);
  
         /* Reset video mode */
-       asm volatile(INT10 : : "a" (0x0003));
+       ireg.ax = 0x0003;
+       intcall(0x10, &ireg, NULL);
  
         vga_set_8font();
  }
@@ -225,8 +250,6 @@ static int vga_set_mode(struct mode_info *mode)
   */
  static int vga_probe(void)
  {
-       u16 ega_bx;
-
         static const char *card_name[] = {
                 "CGA/MDA/HGC", "EGA", "VGA"
         };
@@ -240,26 +263,26 @@ static int vga_probe(void)
                 sizeof(ega_modes)/sizeof(struct mode_info),
                 sizeof(vga_modes)/sizeof(struct mode_info),
         };
-       u8 vga_flag;
  
-       asm(INT10
-           : "=b" (ega_bx)
-           : "a" (0x1200), "b" (0x10) /* Check EGA/VGA */
-           : "ecx", "edx", "esi", "edi");
+       struct biosregs ireg, oreg;
+
+       initregs(&ireg);
+
+       ireg.ax = 0x1200;
+       ireg.bl = 0x10;         /* Check EGA/VGA */
+       intcall(0x10, &ireg, &oreg);
  
  #ifndef _WAKEUP
-       boot_params.screen_info.orig_video_ega_bx = ega_bx;
+       boot_params.screen_info.orig_video_ega_bx = oreg.bx;
  #endif
  
         /* If we have MDA/CGA/HGC then BL will be unchanged at 0x10 */
-       if ((u8)ega_bx != 0x10) {
+       if (oreg.bl != 0x10) {
                 /* EGA/VGA */
-               asm(INT10
-                   : "=a" (vga_flag)
-                   : "a" (0x1a00)
-                   : "ebx", "ecx", "edx", "esi", "edi");
+               ireg.ax = 0x1a00;
+               intcall(0x10, &ireg, &oreg);
  
-               if (vga_flag == 0x1a) {
+               if (oreg.al == 0x1a) {
                         adapter = ADAPTER_VGA;
  #ifndef _WAKEUP
                         boot_params.screen_info.orig_video_isVGA = 1;
diff --git a/arch/x86/boot/video.c b/arch/x86/boot/video.c

index 3bef2c1febe936bd01f1b8d7d6b7d1eb5b1f9bb6..bad728b76fc2acff40b1e5052ae20b0e08ea5dc1 100644 (file)
--- a/arch/x86/boot/video.c
+++ b/arch/x86/boot/video.c
@@ -2,6 +2,7 @@
   *
   *   Copyright (C) 1991, 1992 Linus Torvalds
   *   Copyright 2007 rPath, Inc. - All Rights Reserved
+ *   Copyright 2009 Intel Corporation; author H. Peter Anvin
   *
   *   This file is part of the Linux kernel, and is made available under
   *   the terms of the GNU General Public License version 2.
@@ -18,33 +19,29 @@
  
  static void store_cursor_position(void)
  {
-       u16 curpos;
-       u16 ax, bx;
+       struct biosregs ireg, oreg;
  
-       ax = 0x0300;
-       bx = 0;
-       asm(INT10
-           : "=d" (curpos), "+a" (ax), "+b" (bx)
-           : : "ecx", "esi", "edi");
+       initregs(&ireg);
+       ireg.ah = 0x03;
+       intcall(0x10, &ireg, &oreg);
  
-       boot_params.screen_info.orig_x = curpos;
-       boot_params.screen_info.orig_y = curpos >> 8;
+       boot_params.screen_info.orig_x = oreg.dl;
+       boot_params.screen_info.orig_y = oreg.dh;
  }
  
  static void store_video_mode(void)
  {
-       u16 ax, page;
+       struct biosregs ireg, oreg;
  
         /* N.B.: the saving of the video page here is a bit silly,
            since we pretty much assume page 0 everywhere. */
-       ax = 0x0f00;
-       asm(INT10
-           : "+a" (ax), "=b" (page)
-           : : "ecx", "edx", "esi", "edi");
+       initregs(&ireg);
+       ireg.ah = 0x0f;
+       intcall(0x10, &ireg, &oreg);
  
         /* Not all BIOSes are clean with respect to the top bit */
-       boot_params.screen_info.orig_video_mode = ax & 0x7f;
-       boot_params.screen_info.orig_video_page = page >> 8;
+       boot_params.screen_info.orig_video_mode = oreg.al & 0x7f;
+       boot_params.screen_info.orig_video_page = oreg.bh;
  }
  
  /*
@@ -257,7 +254,7 @@ static void restore_screen(void)
         int y;
         addr_t dst = 0;
         u16 *src = saved.data;
-       u16 ax, bx, dx;
+       struct biosregs ireg;
  
         if (graphic_mode)
                 return;         /* Can't restore onto a graphic mode */
@@ -296,12 +293,11 @@ static void restore_screen(void)
         }
  
         /* Restore cursor position */
-       ax = 0x0200;            /* Set cursor position */
-       bx = 0;                 /* Page number (<< 8) */
-       dx = (saved.cury << 8)+saved.curx;
-       asm volatile(INT10
-                    : "+a" (ax), "+b" (bx), "+d" (dx)
-                    : : "ecx", "esi", "edi");
+       initregs(&ireg);
+       ireg.ah = 0x02;         /* Set cursor position */
+       ireg.dh = saved.cury;
+       ireg.dl = saved.curx;
+       intcall(0x10, &ireg, NULL);
  }
  #else
  #define save_screen()          ((void)0)
diff --git a/arch/x86/boot/video.h b/arch/x86/boot/video.h

index ee63f5d14461517ef96b89a6c644ac6c3a60a2f6..5bb174a997fc61a3609a0a41500192f4475e676a 100644 (file)
--- a/arch/x86/boot/video.h
+++ b/arch/x86/boot/video.h
@@ -112,20 +112,6 @@ extern int force_x, force_y;       /* Don't query the BIOS for cols/rows */
  extern int do_restore;         /* Restore screen contents */
  extern int graphic_mode;       /* Graphics mode with linear frame buffer */
  
-/*
- * int $0x10 is notorious for touching registers it shouldn't.
- * gcc doesn't like %ebp being clobbered, so define it as a push/pop
- * sequence here.
- *
- * A number of systems, including the original PC can clobber %bp in
- * certain circumstances, like when scrolling.  There exists at least
- * one Trident video card which could clobber DS under a set of
- * circumstances that we are unlikely to encounter (scrolling when
- * using an extended graphics mode of more than 800x600 pixels), but
- * it's cheap insurance to deal with that here.
- */
-#define INT10 "pushl %%ebp; pushw %%ds; int $0x10; popw %%ds; popl %%ebp"
-
  /* Accessing VGA indexed registers */
  static inline u8 in_idx(u16 port, u8 index)
  {
diff --git a/arch/x86/configs/i386_defconfig b/arch/x86/configs/i386_defconfig

index 235b81d0f6f2b1083c06cc46210df33141eb2b4b..edb992ebef92e2d95a8ce5d52d6f4ff33de91529 100644 (file)
--- a/arch/x86/configs/i386_defconfig
+++ b/arch/x86/configs/i386_defconfig
@@ -1,12 +1,13 @@
  #
  # Automatically generated make config: don't edit
-# Linux kernel version: 2.6.29-rc4
-# Tue Feb 24 15:50:58 2009
+# Linux kernel version: 2.6.30-rc2
+# Mon May 11 16:21:55 2009
  #
  # CONFIG_64BIT is not set
  CONFIG_X86_32=y
  # CONFIG_X86_64 is not set
  CONFIG_X86=y
+CONFIG_OUTPUT_FORMAT="elf32-i386"
  CONFIG_ARCH_DEFCONFIG="arch/x86/configs/i386_defconfig"
  CONFIG_GENERIC_TIME=y
  CONFIG_GENERIC_CMOS_UPDATE=y
@@ -33,6 +34,7 @@ CONFIG_ARCH_HAS_CPU_RELAX=y
  CONFIG_ARCH_HAS_DEFAULT_IDLE=y
  CONFIG_ARCH_HAS_CACHE_LINE_SIZE=y
  CONFIG_HAVE_SETUP_PER_CPU_AREA=y
+CONFIG_HAVE_DYNAMIC_PER_CPU_AREA=y
  # CONFIG_HAVE_CPUMASK_OF_CPU_MAP is not set
  CONFIG_ARCH_HIBERNATION_POSSIBLE=y
  CONFIG_ARCH_SUSPEND_POSSIBLE=y
@@ -40,15 +42,16 @@ CONFIG_ARCH_SUSPEND_POSSIBLE=y
  CONFIG_ARCH_POPULATES_NODE_MAP=y
  # CONFIG_AUDIT_ARCH is not set
  CONFIG_ARCH_SUPPORTS_OPTIMIZED_INLINING=y
+CONFIG_ARCH_SUPPORTS_DEBUG_PAGEALLOC=y
  CONFIG_GENERIC_HARDIRQS=y
+CONFIG_GENERIC_HARDIRQS_NO__DO_IRQ=y
  CONFIG_GENERIC_IRQ_PROBE=y
  CONFIG_GENERIC_PENDING_IRQ=y
-CONFIG_X86_SMP=y
  CONFIG_USE_GENERIC_SMP_HELPERS=y
  CONFIG_X86_32_SMP=y
  CONFIG_X86_HT=y
-CONFIG_X86_BIOS_REBOOT=y
  CONFIG_X86_TRAMPOLINE=y
+CONFIG_X86_32_LAZY_GS=y
  CONFIG_KTIME_SCALAR=y
  CONFIG_DEFCONFIG_LIST="/lib/modules/$UNAME_RELEASE/.config"
  
@@ -60,10 +63,17 @@ CONFIG_LOCK_KERNEL=y
  CONFIG_INIT_ENV_ARG_LIMIT=32
  CONFIG_LOCALVERSION=""
  # CONFIG_LOCALVERSION_AUTO is not set
+CONFIG_HAVE_KERNEL_GZIP=y
+CONFIG_HAVE_KERNEL_BZIP2=y
+CONFIG_HAVE_KERNEL_LZMA=y
+CONFIG_KERNEL_GZIP=y
+# CONFIG_KERNEL_BZIP2 is not set
+# CONFIG_KERNEL_LZMA is not set
  CONFIG_SWAP=y
  CONFIG_SYSVIPC=y
  CONFIG_SYSVIPC_SYSCTL=y
  CONFIG_POSIX_MQUEUE=y
+CONFIG_POSIX_MQUEUE_SYSCTL=y
  CONFIG_BSD_PROCESS_ACCT=y
  # CONFIG_BSD_PROCESS_ACCT_V3 is not set
  CONFIG_TASKSTATS=y
@@ -113,23 +123,26 @@ CONFIG_PID_NS=y
  CONFIG_NET_NS=y
  CONFIG_BLK_DEV_INITRD=y
  CONFIG_INITRAMFS_SOURCE=""
+CONFIG_RD_GZIP=y
+CONFIG_RD_BZIP2=y
+CONFIG_RD_LZMA=y
  CONFIG_CC_OPTIMIZE_FOR_SIZE=y
  CONFIG_SYSCTL=y
+CONFIG_ANON_INODES=y
  # CONFIG_EMBEDDED is not set
  CONFIG_UID16=y
  CONFIG_SYSCTL_SYSCALL=y
  CONFIG_KALLSYMS=y
  CONFIG_KALLSYMS_ALL=y
  CONFIG_KALLSYMS_EXTRA_PASS=y
+# CONFIG_STRIP_ASM_SYMS is not set
  CONFIG_HOTPLUG=y
  CONFIG_PRINTK=y
  CONFIG_BUG=y
  CONFIG_ELF_CORE=y
  CONFIG_PCSPKR_PLATFORM=y
-# CONFIG_COMPAT_BRK is not set
  CONFIG_BASE_FULL=y
  CONFIG_FUTEX=y
-CONFIG_ANON_INODES=y
  CONFIG_EPOLL=y
  CONFIG_SIGNALFD=y
  CONFIG_TIMERFD=y
@@ -139,6 +152,7 @@ CONFIG_AIO=y
  CONFIG_VM_EVENT_COUNTERS=y
  CONFIG_PCI_QUIRKS=y
  CONFIG_SLUB_DEBUG=y
+# CONFIG_COMPAT_BRK is not set
  # CONFIG_SLAB is not set
  CONFIG_SLUB=y
  # CONFIG_SLOB is not set
@@ -154,6 +168,8 @@ CONFIG_HAVE_IOREMAP_PROT=y
  CONFIG_HAVE_KPROBES=y
  CONFIG_HAVE_KRETPROBES=y
  CONFIG_HAVE_ARCH_TRACEHOOK=y
+CONFIG_HAVE_DMA_API_DEBUG=y
+# CONFIG_SLOW_WORK is not set
  CONFIG_HAVE_GENERIC_DMA_COHERENT=y
  CONFIG_SLABINFO=y
  CONFIG_RT_MUTEXES=y
@@ -167,7 +183,6 @@ CONFIG_MODULE_FORCE_UNLOAD=y
  CONFIG_STOP_MACHINE=y
  CONFIG_BLOCK=y
  # CONFIG_LBD is not set
-CONFIG_BLK_DEV_IO_TRACE=y
  CONFIG_BLK_DEV_BSG=y
  # CONFIG_BLK_DEV_INTEGRITY is not set
  
@@ -194,12 +209,12 @@ CONFIG_HIGH_RES_TIMERS=y
  CONFIG_GENERIC_CLOCKEVENTS_BUILD=y
  CONFIG_SMP=y
  CONFIG_SPARSE_IRQ=y
-CONFIG_X86_FIND_SMP_CONFIG=y
  CONFIG_X86_MPPARSE=y
+# CONFIG_X86_BIGSMP is not set
+CONFIG_X86_EXTENDED_PLATFORM=y
  # CONFIG_X86_ELAN is not set
-# CONFIG_X86_GENERICARCH is not set
-# CONFIG_X86_VSMP is not set
  # CONFIG_X86_RDC321X is not set
+# CONFIG_X86_32_NON_STANDARD is not set
  CONFIG_SCHED_OMIT_FRAME_POINTER=y
  # CONFIG_PARAVIRT_GUEST is not set
  # CONFIG_MEMTEST is not set
@@ -230,8 +245,10 @@ CONFIG_M686=y
  # CONFIG_GENERIC_CPU is not set
  CONFIG_X86_GENERIC=y
  CONFIG_X86_CPU=y
+CONFIG_X86_L1_CACHE_BYTES=64
+CONFIG_X86_INTERNODE_CACHE_BYTES=64
  CONFIG_X86_CMPXCHG=y
-CONFIG_X86_L1_CACHE_SHIFT=7
+CONFIG_X86_L1_CACHE_SHIFT=5
  CONFIG_X86_XADD=y
  # CONFIG_X86_PPRO_FENCE is not set
  CONFIG_X86_WP_WORKS_OK=y
@@ -247,7 +264,7 @@ CONFIG_X86_DEBUGCTLMSR=y
  CONFIG_CPU_SUP_INTEL=y
  CONFIG_CPU_SUP_CYRIX_32=y
  CONFIG_CPU_SUP_AMD=y
-CONFIG_CPU_SUP_CENTAUR_32=y
+CONFIG_CPU_SUP_CENTAUR=y
  CONFIG_CPU_SUP_TRANSMETA_32=y
  CONFIG_CPU_SUP_UMC_32=y
  CONFIG_X86_DS=y
@@ -279,6 +296,7 @@ CONFIG_MICROCODE_AMD=y
  CONFIG_MICROCODE_OLD_INTERFACE=y
  CONFIG_X86_MSR=y
  CONFIG_X86_CPUID=y
+# CONFIG_X86_CPU_DEBUG is not set
  # CONFIG_NOHIGHMEM is not set
  CONFIG_HIGHMEM4G=y
  # CONFIG_HIGHMEM64G is not set
@@ -302,6 +320,8 @@ CONFIG_ZONE_DMA_FLAG=1
  CONFIG_BOUNCE=y
  CONFIG_VIRT_TO_BUS=y
  CONFIG_UNEVICTABLE_LRU=y
+CONFIG_HAVE_MLOCK=y
+CONFIG_HAVE_MLOCKED_PAGE_BIT=y
  CONFIG_HIGHPTE=y
  CONFIG_X86_CHECK_BIOS_CORRUPTION=y
  CONFIG_X86_BOOTPARAM_MEMORY_CORRUPTION_CHECK=y
@@ -312,6 +332,7 @@ CONFIG_MTRR=y
  CONFIG_X86_PAT=y
  CONFIG_EFI=y
  CONFIG_SECCOMP=y
+# CONFIG_CC_STACKPROTECTOR is not set
  # CONFIG_HZ_100 is not set
  # CONFIG_HZ_250 is not set
  # CONFIG_HZ_300 is not set
@@ -322,8 +343,9 @@ CONFIG_KEXEC=y
  CONFIG_CRASH_DUMP=y
  # CONFIG_KEXEC_JUMP is not set
  CONFIG_PHYSICAL_START=0x1000000
-# CONFIG_RELOCATABLE is not set
-CONFIG_PHYSICAL_ALIGN=0x200000
+CONFIG_RELOCATABLE=y
+CONFIG_X86_NEED_RELOCS=y
+CONFIG_PHYSICAL_ALIGN=0x1000000
  CONFIG_HOTPLUG_CPU=y
  # CONFIG_COMPAT_VDSO is not set
  # CONFIG_CMDLINE_BOOL is not set
@@ -363,7 +385,6 @@ CONFIG_ACPI_THERMAL=y
  CONFIG_ACPI_BLACKLIST_YEAR=0
  # CONFIG_ACPI_DEBUG is not set
  # CONFIG_ACPI_PCI_SLOT is not set
-CONFIG_ACPI_SYSTEM=y
  CONFIG_X86_PM_TIMER=y
  CONFIG_ACPI_CONTAINER=y
  # CONFIG_ACPI_SBS is not set
@@ -425,6 +446,7 @@ CONFIG_PCI_BIOS=y
  CONFIG_PCI_DIRECT=y
  CONFIG_PCI_MMCONFIG=y
  CONFIG_PCI_DOMAINS=y
+# CONFIG_DMAR is not set
  CONFIG_PCIEPORTBUS=y
  # CONFIG_HOTPLUG_PCI_PCIE is not set
  CONFIG_PCIEAER=y
@@ -435,6 +457,7 @@ CONFIG_PCI_MSI=y
  # CONFIG_PCI_DEBUG is not set
  # CONFIG_PCI_STUB is not set
  CONFIG_HT_IRQ=y
+# CONFIG_PCI_IOV is not set
  CONFIG_ISA_DMA_API=y
  # CONFIG_ISA is not set
  # CONFIG_MCA is not set
@@ -481,7 +504,6 @@ CONFIG_NET=y
  #
  # Networking options
  #
-CONFIG_COMPAT_NET_DEV_OPS=y
  CONFIG_PACKET=y
  CONFIG_PACKET_MMAP=y
  CONFIG_UNIX=y
@@ -639,6 +661,7 @@ CONFIG_LLC=y
  # CONFIG_LAPB is not set
  # CONFIG_ECONET is not set
  # CONFIG_WAN_ROUTER is not set
+# CONFIG_PHONET is not set
  CONFIG_NET_SCHED=y
  
  #
@@ -696,6 +719,7 @@ CONFIG_NET_SCH_FIFO=y
  #
  # CONFIG_NET_PKTGEN is not set
  # CONFIG_NET_TCPPROBE is not set
+# CONFIG_NET_DROP_MONITOR is not set
  CONFIG_HAMRADIO=y
  
  #
@@ -706,12 +730,10 @@ CONFIG_HAMRADIO=y
  # CONFIG_IRDA is not set
  # CONFIG_BT is not set
  # CONFIG_AF_RXRPC is not set
-# CONFIG_PHONET is not set
  CONFIG_FIB_RULES=y
  CONFIG_WIRELESS=y
  CONFIG_CFG80211=y
  # CONFIG_CFG80211_REG_DEBUG is not set
-CONFIG_NL80211=y
  CONFIG_WIRELESS_OLD_REGULATORY=y
  CONFIG_WIRELESS_EXT=y
  CONFIG_WIRELESS_EXT_SYSFS=y
@@ -789,6 +811,7 @@ CONFIG_MISC_DEVICES=y
  # CONFIG_ICS932S401 is not set
  # CONFIG_ENCLOSURE_SERVICES is not set
  # CONFIG_HP_ILO is not set
+# CONFIG_ISL29003 is not set
  # CONFIG_C2PORT is not set
  
  #
@@ -842,6 +865,7 @@ CONFIG_SCSI_SPI_ATTRS=y
  # CONFIG_SCSI_LOWLEVEL is not set
  # CONFIG_SCSI_LOWLEVEL_PCMCIA is not set
  # CONFIG_SCSI_DH is not set
+# CONFIG_SCSI_OSD_INITIATOR is not set
  CONFIG_ATA=y
  # CONFIG_ATA_NONSTANDARD is not set
  CONFIG_ATA_ACPI=y
@@ -940,6 +964,7 @@ CONFIG_DM_ZERO=y
  CONFIG_MACINTOSH_DRIVERS=y
  CONFIG_MAC_EMUMOUSEBTN=y
  CONFIG_NETDEVICES=y
+CONFIG_COMPAT_NET_DEV_OPS=y
  # CONFIG_IFB is not set
  # CONFIG_DUMMY is not set
  # CONFIG_BONDING is not set
@@ -977,6 +1002,8 @@ CONFIG_MII=y
  CONFIG_NET_VENDOR_3COM=y
  # CONFIG_VORTEX is not set
  # CONFIG_TYPHOON is not set
+# CONFIG_ETHOC is not set
+# CONFIG_DNET is not set
  CONFIG_NET_TULIP=y
  # CONFIG_DE2104X is not set
  # CONFIG_TULIP is not set
@@ -1026,6 +1053,7 @@ CONFIG_E1000=y
  CONFIG_E1000E=y
  # CONFIG_IP1000 is not set
  # CONFIG_IGB is not set
+# CONFIG_IGBVF is not set
  # CONFIG_NS83820 is not set
  # CONFIG_HAMACHI is not set
  # CONFIG_YELLOWFIN is not set
@@ -1040,6 +1068,7 @@ CONFIG_BNX2=y
  # CONFIG_QLA3XXX is not set
  # CONFIG_ATL1 is not set
  # CONFIG_ATL1E is not set
+# CONFIG_ATL1C is not set
  # CONFIG_JME is not set
  CONFIG_NETDEV_10000=y
  # CONFIG_CHELSIO_T1 is not set
@@ -1049,6 +1078,7 @@ CONFIG_CHELSIO_T3_DEPENDS=y
  # CONFIG_IXGBE is not set
  # CONFIG_IXGB is not set
  # CONFIG_S2IO is not set
+# CONFIG_VXGE is not set
  # CONFIG_MYRI10GE is not set
  # CONFIG_NETXEN_NIC is not set
  # CONFIG_NIU is not set
@@ -1058,6 +1088,7 @@ CONFIG_CHELSIO_T3_DEPENDS=y
  # CONFIG_BNX2X is not set
  # CONFIG_QLGE is not set
  # CONFIG_SFC is not set
+# CONFIG_BE2NET is not set
  CONFIG_TR=y
  # CONFIG_IBMOL is not set
  # CONFIG_IBMLS is not set
@@ -1073,8 +1104,8 @@ CONFIG_WLAN_80211=y
  # CONFIG_LIBERTAS is not set
  # CONFIG_LIBERTAS_THINFIRM is not set
  # CONFIG_AIRO is not set
-# CONFIG_HERMES is not set
  # CONFIG_ATMEL is not set
+# CONFIG_AT76C50X_USB is not set
  # CONFIG_AIRO_CS is not set
  # CONFIG_PCMCIA_WL3501 is not set
  # CONFIG_PRISM54 is not set
@@ -1084,21 +1115,21 @@ CONFIG_WLAN_80211=y
  # CONFIG_RTL8187 is not set
  # CONFIG_ADM8211 is not set
  # CONFIG_MAC80211_HWSIM is not set
+# CONFIG_MWL8K is not set
  # CONFIG_P54_COMMON is not set
  CONFIG_ATH5K=y
  # CONFIG_ATH5K_DEBUG is not set
  # CONFIG_ATH9K is not set
+# CONFIG_AR9170_USB is not set
  # CONFIG_IPW2100 is not set
  # CONFIG_IPW2200 is not set
-# CONFIG_IWLCORE is not set
-# CONFIG_IWLWIFI_LEDS is not set
-# CONFIG_IWLAGN is not set
-# CONFIG_IWL3945 is not set
+# CONFIG_IWLWIFI is not set
  # CONFIG_HOSTAP is not set
  # CONFIG_B43 is not set
  # CONFIG_B43LEGACY is not set
  # CONFIG_ZD1211RW is not set
  # CONFIG_RT2X00 is not set
+# CONFIG_HERMES is not set
  
  #
  # Enable WiMAX (Networking options) to see the WiMAX drivers
@@ -1209,6 +1240,8 @@ CONFIG_INPUT_TABLET=y
  # CONFIG_TABLET_USB_KBTAB is not set
  # CONFIG_TABLET_USB_WACOM is not set
  CONFIG_INPUT_TOUCHSCREEN=y
+# CONFIG_TOUCHSCREEN_AD7879_I2C is not set
+# CONFIG_TOUCHSCREEN_AD7879 is not set
  # CONFIG_TOUCHSCREEN_FUJITSU is not set
  # CONFIG_TOUCHSCREEN_GUNZE is not set
  # CONFIG_TOUCHSCREEN_ELO is not set
@@ -1303,6 +1336,7 @@ CONFIG_UNIX98_PTYS=y
  # CONFIG_LEGACY_PTYS is not set
  # CONFIG_IPMI_HANDLER is not set
  CONFIG_HW_RANDOM=y
+# CONFIG_HW_RANDOM_TIMERIOMEM is not set
  CONFIG_HW_RANDOM_INTEL=y
  CONFIG_HW_RANDOM_AMD=y
  CONFIG_HW_RANDOM_GEODE=y
@@ -1390,7 +1424,6 @@ CONFIG_I2C_I801=y
  # CONFIG_SENSORS_PCF8574 is not set
  # CONFIG_PCF8575 is not set
  # CONFIG_SENSORS_PCA9539 is not set
-# CONFIG_SENSORS_PCF8591 is not set
  # CONFIG_SENSORS_MAX6875 is not set
  # CONFIG_SENSORS_TSL2550 is not set
  # CONFIG_I2C_DEBUG_CORE is not set
@@ -1424,6 +1457,7 @@ CONFIG_HWMON=y
  # CONFIG_SENSORS_ADT7475 is not set
  # CONFIG_SENSORS_K8TEMP is not set
  # CONFIG_SENSORS_ASB100 is not set
+# CONFIG_SENSORS_ATK0110 is not set
  # CONFIG_SENSORS_ATXP1 is not set
  # CONFIG_SENSORS_DS1621 is not set
  # CONFIG_SENSORS_I5K_AMB is not set
@@ -1433,6 +1467,7 @@ CONFIG_HWMON=y
  # CONFIG_SENSORS_FSCHER is not set
  # CONFIG_SENSORS_FSCPOS is not set
  # CONFIG_SENSORS_FSCHMD is not set
+# CONFIG_SENSORS_G760A is not set
  # CONFIG_SENSORS_GL518SM is not set
  # CONFIG_SENSORS_GL520SM is not set
  # CONFIG_SENSORS_CORETEMP is not set
@@ -1448,11 +1483,14 @@ CONFIG_HWMON=y
  # CONFIG_SENSORS_LM90 is not set
  # CONFIG_SENSORS_LM92 is not set
  # CONFIG_SENSORS_LM93 is not set
+# CONFIG_SENSORS_LTC4215 is not set
  # CONFIG_SENSORS_LTC4245 is not set
+# CONFIG_SENSORS_LM95241 is not set
  # CONFIG_SENSORS_MAX1619 is not set
  # CONFIG_SENSORS_MAX6650 is not set
  # CONFIG_SENSORS_PC87360 is not set
  # CONFIG_SENSORS_PC87427 is not set
+# CONFIG_SENSORS_PCF8591 is not set
  # CONFIG_SENSORS_SIS5595 is not set
  # CONFIG_SENSORS_DME1737 is not set
  # CONFIG_SENSORS_SMSC47M1 is not set
@@ -1643,7 +1681,6 @@ CONFIG_FB_EFI=y
  # CONFIG_FB_3DFX is not set
  # CONFIG_FB_VOODOO1 is not set
  # CONFIG_FB_VT8623 is not set
-# CONFIG_FB_CYBLA is not set
  # CONFIG_FB_TRIDENT is not set
  # CONFIG_FB_ARK is not set
  # CONFIG_FB_PM3 is not set
@@ -1652,6 +1689,7 @@ CONFIG_FB_EFI=y
  # CONFIG_FB_VIRTUAL is not set
  # CONFIG_FB_METRONOME is not set
  # CONFIG_FB_MB862XX is not set
+# CONFIG_FB_BROADSHEET is not set
  CONFIG_BACKLIGHT_LCD_SUPPORT=y
  # CONFIG_LCD_CLASS_DEVICE is not set
  CONFIG_BACKLIGHT_CLASS_DEVICE=y
@@ -1738,6 +1776,8 @@ CONFIG_SND_PCI=y
  # CONFIG_SND_INDIGO is not set
  # CONFIG_SND_INDIGOIO is not set
  # CONFIG_SND_INDIGODJ is not set
+# CONFIG_SND_INDIGOIOX is not set
+# CONFIG_SND_INDIGODJX is not set
  # CONFIG_SND_EMU10K1 is not set
  # CONFIG_SND_EMU10K1X is not set
  # CONFIG_SND_ENS1370 is not set
@@ -1811,15 +1851,17 @@ CONFIG_USB_HIDDEV=y
  #
  # Special HID drivers
  #
-CONFIG_HID_COMPAT=y
  CONFIG_HID_A4TECH=y
  CONFIG_HID_APPLE=y
  CONFIG_HID_BELKIN=y
  CONFIG_HID_CHERRY=y
  CONFIG_HID_CHICONY=y
  CONFIG_HID_CYPRESS=y
+# CONFIG_DRAGONRISE_FF is not set
  CONFIG_HID_EZKEY=y
+CONFIG_HID_KYE=y
  CONFIG_HID_GYRATION=y
+CONFIG_HID_KENSINGTON=y
  CONFIG_HID_LOGITECH=y
  CONFIG_LOGITECH_FF=y
  # CONFIG_LOGIRUMBLEPAD2_FF is not set
@@ -1885,11 +1927,11 @@ CONFIG_USB_PRINTER=y
  # CONFIG_USB_TMC is not set
  
  #
-# NOTE: USB_STORAGE depends on SCSI but BLK_DEV_SD may also be needed;
+# NOTE: USB_STORAGE depends on SCSI but BLK_DEV_SD may
  #
  
  #
-# see USB_STORAGE Help for more information
+# also be needed; see USB_STORAGE Help for more info
  #
  CONFIG_USB_STORAGE=y
  # CONFIG_USB_STORAGE_DEBUG is not set
@@ -1931,7 +1973,6 @@ CONFIG_USB_LIBUSUAL=y
  # CONFIG_USB_LED is not set
  # CONFIG_USB_CYPRESS_CY7C63 is not set
  # CONFIG_USB_CYTHERM is not set
-# CONFIG_USB_PHIDGET is not set
  # CONFIG_USB_IDMOUSE is not set
  # CONFIG_USB_FTDI_ELAN is not set
  # CONFIG_USB_APPLEDISPLAY is not set
@@ -1947,6 +1988,7 @@ CONFIG_USB_LIBUSUAL=y
  #
  # OTG and related infrastructure
  #
+# CONFIG_NOP_USB_XCEIV is not set
  # CONFIG_UWB is not set
  # CONFIG_MMC is not set
  # CONFIG_MEMSTICK is not set
@@ -1958,8 +2000,10 @@ CONFIG_LEDS_CLASS=y
  #
  # CONFIG_LEDS_ALIX2 is not set
  # CONFIG_LEDS_PCA9532 is not set
+# CONFIG_LEDS_LP5521 is not set
  # CONFIG_LEDS_CLEVO_MAIL is not set
  # CONFIG_LEDS_PCA955X is not set
+# CONFIG_LEDS_BD2802 is not set
  
  #
  # LED Triggers
@@ -1969,6 +2013,10 @@ CONFIG_LEDS_TRIGGERS=y
  # CONFIG_LEDS_TRIGGER_HEARTBEAT is not set
  # CONFIG_LEDS_TRIGGER_BACKLIGHT is not set
  # CONFIG_LEDS_TRIGGER_DEFAULT_ON is not set
+
+#
+# iptables trigger is under Netfilter config (LED target)
+#
  # CONFIG_ACCESSIBILITY is not set
  # CONFIG_INFINIBAND is not set
  CONFIG_EDAC=y
@@ -2037,6 +2085,7 @@ CONFIG_DMADEVICES=y
  # DMA Devices
  #
  # CONFIG_INTEL_IOATDMA is not set
+# CONFIG_AUXDISPLAY is not set
  # CONFIG_UIO is not set
  # CONFIG_STAGING is not set
  CONFIG_X86_PLATFORM_DEVICES=y
@@ -2071,6 +2120,7 @@ CONFIG_DMIID=y
  #
  # CONFIG_EXT2_FS is not set
  CONFIG_EXT3_FS=y
+# CONFIG_EXT3_DEFAULTS_TO_ORDERED is not set
  CONFIG_EXT3_FS_XATTR=y
  CONFIG_EXT3_FS_POSIX_ACL=y
  CONFIG_EXT3_FS_SECURITY=y
@@ -2100,6 +2150,11 @@ CONFIG_AUTOFS4_FS=y
  # CONFIG_FUSE_FS is not set
  CONFIG_GENERIC_ACL=y
  
+#
+# Caches
+#
+# CONFIG_FSCACHE is not set
+
  #
  # CD-ROM/DVD Filesystems
  #
@@ -2151,6 +2206,7 @@ CONFIG_MISC_FILESYSTEMS=y
  # CONFIG_ROMFS_FS is not set
  # CONFIG_SYSV_FS is not set
  # CONFIG_UFS_FS is not set
+# CONFIG_NILFS2_FS is not set
  CONFIG_NETWORK_FILESYSTEMS=y
  CONFIG_NFS_FS=y
  CONFIG_NFS_V3=y
@@ -2164,7 +2220,6 @@ CONFIG_NFS_ACL_SUPPORT=y
  CONFIG_NFS_COMMON=y
  CONFIG_SUNRPC=y
  CONFIG_SUNRPC_GSS=y
-# CONFIG_SUNRPC_REGISTER_V4 is not set
  CONFIG_RPCSEC_GSS_KRB5=y
  # CONFIG_RPCSEC_GSS_SPKM3 is not set
  # CONFIG_SMB_FS is not set
@@ -2251,6 +2306,7 @@ CONFIG_DEBUG_FS=y
  CONFIG_DEBUG_KERNEL=y
  # CONFIG_DEBUG_SHIRQ is not set
  # CONFIG_DETECT_SOFTLOCKUP is not set
+# CONFIG_DETECT_HUNG_TASK is not set
  # CONFIG_SCHED_DEBUG is not set
  CONFIG_SCHEDSTATS=y
  CONFIG_TIMER_STATS=y
@@ -2266,6 +2322,7 @@ CONFIG_TIMER_STATS=y
  # CONFIG_LOCK_STAT is not set
  # CONFIG_DEBUG_SPINLOCK_SLEEP is not set
  # CONFIG_DEBUG_LOCKING_API_SELFTESTS is not set
+CONFIG_STACKTRACE=y
  # CONFIG_DEBUG_KOBJECT is not set
  # CONFIG_DEBUG_HIGHMEM is not set
  CONFIG_DEBUG_BUGVERBOSE=y
@@ -2289,13 +2346,19 @@ CONFIG_FRAME_POINTER=y
  # CONFIG_FAULT_INJECTION is not set
  # CONFIG_LATENCYTOP is not set
  CONFIG_SYSCTL_SYSCALL_CHECK=y
+# CONFIG_DEBUG_PAGEALLOC is not set
  CONFIG_USER_STACKTRACE_SUPPORT=y
+CONFIG_NOP_TRACER=y
  CONFIG_HAVE_FUNCTION_TRACER=y
  CONFIG_HAVE_FUNCTION_GRAPH_TRACER=y
  CONFIG_HAVE_FUNCTION_TRACE_MCOUNT_TEST=y
  CONFIG_HAVE_DYNAMIC_FTRACE=y
  CONFIG_HAVE_FTRACE_MCOUNT_RECORD=y
  CONFIG_HAVE_HW_BRANCH_TRACER=y
+CONFIG_HAVE_FTRACE_SYSCALLS=y
+CONFIG_RING_BUFFER=y
+CONFIG_TRACING=y
+CONFIG_TRACING_SUPPORT=y
  
  #
  # Tracers
@@ -2305,13 +2368,21 @@ CONFIG_HAVE_HW_BRANCH_TRACER=y
  # CONFIG_SYSPROF_TRACER is not set
  # CONFIG_SCHED_TRACER is not set
  # CONFIG_CONTEXT_SWITCH_TRACER is not set
+# CONFIG_EVENT_TRACER is not set
+# CONFIG_FTRACE_SYSCALLS is not set
  # CONFIG_BOOT_TRACER is not set
  # CONFIG_TRACE_BRANCH_PROFILING is not set
  # CONFIG_POWER_TRACER is not set
  # CONFIG_STACK_TRACER is not set
  # CONFIG_HW_BRANCH_TRACER is not set
+# CONFIG_KMEMTRACE is not set
+# CONFIG_WORKQUEUE_TRACER is not set
+CONFIG_BLK_DEV_IO_TRACE=y
+# CONFIG_FTRACE_STARTUP_TEST is not set
+# CONFIG_MMIOTRACE is not set
  CONFIG_PROVIDE_OHCI1394_DMA_INIT=y
-# CONFIG_DYNAMIC_PRINTK_DEBUG is not set
+# CONFIG_DYNAMIC_DEBUG is not set
+# CONFIG_DMA_API_DEBUG is not set
  # CONFIG_SAMPLES is not set
  CONFIG_HAVE_ARCH_KGDB=y
  # CONFIG_KGDB is not set
@@ -2321,7 +2392,6 @@ CONFIG_EARLY_PRINTK=y
  CONFIG_EARLY_PRINTK_DBGP=y
  CONFIG_DEBUG_STACKOVERFLOW=y
  CONFIG_DEBUG_STACK_USAGE=y
-# CONFIG_DEBUG_PAGEALLOC is not set
  # CONFIG_DEBUG_PER_CPU_MAPS is not set
  # CONFIG_X86_PTDUMP is not set
  CONFIG_DEBUG_RODATA=y
@@ -2329,7 +2399,7 @@ CONFIG_DEBUG_RODATA=y
  CONFIG_DEBUG_NX_TEST=m
  # CONFIG_4KSTACKS is not set
  CONFIG_DOUBLEFAULT=y
-# CONFIG_MMIOTRACE is not set
+CONFIG_HAVE_MMIOTRACE_SUPPORT=y
  CONFIG_IO_DELAY_TYPE_0X80=0
  CONFIG_IO_DELAY_TYPE_0XED=1
  CONFIG_IO_DELAY_TYPE_UDELAY=2
@@ -2365,6 +2435,8 @@ CONFIG_SECURITY_SELINUX_AVC_STATS=y
  CONFIG_SECURITY_SELINUX_CHECKREQPROT_VALUE=1
  # CONFIG_SECURITY_SELINUX_POLICYDB_VERSION_MAX is not set
  # CONFIG_SECURITY_SMACK is not set
+# CONFIG_SECURITY_TOMOYO is not set
+# CONFIG_IMA is not set
  CONFIG_CRYPTO=y
  
  #
@@ -2380,10 +2452,12 @@ CONFIG_CRYPTO_BLKCIPHER2=y
  CONFIG_CRYPTO_HASH=y
  CONFIG_CRYPTO_HASH2=y
  CONFIG_CRYPTO_RNG2=y
+CONFIG_CRYPTO_PCOMP=y
  CONFIG_CRYPTO_MANAGER=y
  CONFIG_CRYPTO_MANAGER2=y
  # CONFIG_CRYPTO_GF128MUL is not set
  # CONFIG_CRYPTO_NULL is not set
+CONFIG_CRYPTO_WORKQUEUE=y
  # CONFIG_CRYPTO_CRYPTD is not set
  CONFIG_CRYPTO_AUTHENC=y
  # CONFIG_CRYPTO_TEST is not set
@@ -2456,6 +2530,7 @@ CONFIG_CRYPTO_DES=y
  # Compression
  #
  # CONFIG_CRYPTO_DEFLATE is not set
+# CONFIG_CRYPTO_ZLIB is not set
  # CONFIG_CRYPTO_LZO is not set
  
  #
@@ -2467,11 +2542,13 @@ CONFIG_CRYPTO_HW=y
  # CONFIG_CRYPTO_DEV_GEODE is not set
  # CONFIG_CRYPTO_DEV_HIFN_795X is not set
  CONFIG_HAVE_KVM=y
+CONFIG_HAVE_KVM_IRQCHIP=y
  CONFIG_VIRTUALIZATION=y
  # CONFIG_KVM is not set
  # CONFIG_LGUEST is not set
  # CONFIG_VIRTIO_PCI is not set
  # CONFIG_VIRTIO_BALLOON is not set
+CONFIG_BINARY_PRINTF=y
  
  #
  # Library routines
@@ -2489,7 +2566,10 @@ CONFIG_CRC32=y
  # CONFIG_LIBCRC32C is not set
  CONFIG_AUDIT_GENERIC=y
  CONFIG_ZLIB_INFLATE=y
-CONFIG_PLIST=y
+CONFIG_DECOMPRESS_GZIP=y
+CONFIG_DECOMPRESS_BZIP2=y
+CONFIG_DECOMPRESS_LZMA=y
  CONFIG_HAS_IOMEM=y
  CONFIG_HAS_IOPORT=y
  CONFIG_HAS_DMA=y
+CONFIG_NLATTR=y
diff --git a/arch/x86/configs/x86_64_defconfig b/arch/x86/configs/x86_64_defconfig

index 9fe5d212ab4cc8291e0c55ebdc619a57c85f2d64..cee1dd2e69b2e173c5218365312e73332652337f 100644 (file)
--- a/arch/x86/configs/x86_64_defconfig
+++ b/arch/x86/configs/x86_64_defconfig
@@ -1,12 +1,13 @@
  #
  # Automatically generated make config: don't edit
-# Linux kernel version: 2.6.29-rc4
-# Tue Feb 24 15:44:16 2009
+# Linux kernel version: 2.6.30-rc2
+# Mon May 11 16:22:00 2009
  #
  CONFIG_64BIT=y
  # CONFIG_X86_32 is not set
  CONFIG_X86_64=y
  CONFIG_X86=y
+CONFIG_OUTPUT_FORMAT="elf64-x86-64"
  CONFIG_ARCH_DEFCONFIG="arch/x86/configs/x86_64_defconfig"
  CONFIG_GENERIC_TIME=y
  CONFIG_GENERIC_CMOS_UPDATE=y
@@ -34,6 +35,7 @@ CONFIG_ARCH_HAS_CPU_RELAX=y
  CONFIG_ARCH_HAS_DEFAULT_IDLE=y
  CONFIG_ARCH_HAS_CACHE_LINE_SIZE=y
  CONFIG_HAVE_SETUP_PER_CPU_AREA=y
+CONFIG_HAVE_DYNAMIC_PER_CPU_AREA=y
  CONFIG_HAVE_CPUMASK_OF_CPU_MAP=y
  CONFIG_ARCH_HIBERNATION_POSSIBLE=y
  CONFIG_ARCH_SUSPEND_POSSIBLE=y
@@ -41,14 +43,14 @@ CONFIG_ZONE_DMA32=y
  CONFIG_ARCH_POPULATES_NODE_MAP=y
  CONFIG_AUDIT_ARCH=y
  CONFIG_ARCH_SUPPORTS_OPTIMIZED_INLINING=y
+CONFIG_ARCH_SUPPORTS_DEBUG_PAGEALLOC=y
  CONFIG_GENERIC_HARDIRQS=y
+CONFIG_GENERIC_HARDIRQS_NO__DO_IRQ=y
  CONFIG_GENERIC_IRQ_PROBE=y
  CONFIG_GENERIC_PENDING_IRQ=y
-CONFIG_X86_SMP=y
  CONFIG_USE_GENERIC_SMP_HELPERS=y
  CONFIG_X86_64_SMP=y
  CONFIG_X86_HT=y
-CONFIG_X86_BIOS_REBOOT=y
  CONFIG_X86_TRAMPOLINE=y
  # CONFIG_KTIME_SCALAR is not set
  CONFIG_DEFCONFIG_LIST="/lib/modules/$UNAME_RELEASE/.config"
@@ -61,10 +63,17 @@ CONFIG_LOCK_KERNEL=y
  CONFIG_INIT_ENV_ARG_LIMIT=32
  CONFIG_LOCALVERSION=""
  # CONFIG_LOCALVERSION_AUTO is not set
+CONFIG_HAVE_KERNEL_GZIP=y
+CONFIG_HAVE_KERNEL_BZIP2=y
+CONFIG_HAVE_KERNEL_LZMA=y
+CONFIG_KERNEL_GZIP=y
+# CONFIG_KERNEL_BZIP2 is not set
+# CONFIG_KERNEL_LZMA is not set
  CONFIG_SWAP=y
  CONFIG_SYSVIPC=y
  CONFIG_SYSVIPC_SYSCTL=y
  CONFIG_POSIX_MQUEUE=y
+CONFIG_POSIX_MQUEUE_SYSCTL=y
  CONFIG_BSD_PROCESS_ACCT=y
  # CONFIG_BSD_PROCESS_ACCT_V3 is not set
  CONFIG_TASKSTATS=y
@@ -114,23 +123,26 @@ CONFIG_PID_NS=y
  CONFIG_NET_NS=y
  CONFIG_BLK_DEV_INITRD=y
  CONFIG_INITRAMFS_SOURCE=""
+CONFIG_RD_GZIP=y
+CONFIG_RD_BZIP2=y
+CONFIG_RD_LZMA=y
  CONFIG_CC_OPTIMIZE_FOR_SIZE=y
  CONFIG_SYSCTL=y
+CONFIG_ANON_INODES=y
  # CONFIG_EMBEDDED is not set
  CONFIG_UID16=y
  CONFIG_SYSCTL_SYSCALL=y
  CONFIG_KALLSYMS=y
  CONFIG_KALLSYMS_ALL=y
  CONFIG_KALLSYMS_EXTRA_PASS=y
+# CONFIG_STRIP_ASM_SYMS is not set
  CONFIG_HOTPLUG=y
  CONFIG_PRINTK=y
  CONFIG_BUG=y
  CONFIG_ELF_CORE=y
  CONFIG_PCSPKR_PLATFORM=y
-# CONFIG_COMPAT_BRK is not set
  CONFIG_BASE_FULL=y
  CONFIG_FUTEX=y
-CONFIG_ANON_INODES=y
  CONFIG_EPOLL=y
  CONFIG_SIGNALFD=y
  CONFIG_TIMERFD=y
@@ -140,6 +152,7 @@ CONFIG_AIO=y
  CONFIG_VM_EVENT_COUNTERS=y
  CONFIG_PCI_QUIRKS=y
  CONFIG_SLUB_DEBUG=y
+# CONFIG_COMPAT_BRK is not set
  # CONFIG_SLAB is not set
  CONFIG_SLUB=y
  # CONFIG_SLOB is not set
@@ -155,6 +168,8 @@ CONFIG_HAVE_IOREMAP_PROT=y
  CONFIG_HAVE_KPROBES=y
  CONFIG_HAVE_KRETPROBES=y
  CONFIG_HAVE_ARCH_TRACEHOOK=y
+CONFIG_HAVE_DMA_API_DEBUG=y
+# CONFIG_SLOW_WORK is not set
  # CONFIG_HAVE_GENERIC_DMA_COHERENT is not set
  CONFIG_SLABINFO=y
  CONFIG_RT_MUTEXES=y
@@ -167,7 +182,6 @@ CONFIG_MODULE_FORCE_UNLOAD=y
  # CONFIG_MODULE_SRCVERSION_ALL is not set
  CONFIG_STOP_MACHINE=y
  CONFIG_BLOCK=y
-CONFIG_BLK_DEV_IO_TRACE=y
  CONFIG_BLK_DEV_BSG=y
  # CONFIG_BLK_DEV_INTEGRITY is not set
  CONFIG_BLOCK_COMPAT=y
@@ -195,12 +209,10 @@ CONFIG_HIGH_RES_TIMERS=y
  CONFIG_GENERIC_CLOCKEVENTS_BUILD=y
  CONFIG_SMP=y
  CONFIG_SPARSE_IRQ=y
-# CONFIG_NUMA_MIGRATE_IRQ_DESC is not set
-CONFIG_X86_FIND_SMP_CONFIG=y
  CONFIG_X86_MPPARSE=y
-# CONFIG_X86_ELAN is not set
-# CONFIG_X86_GENERICARCH is not set
+CONFIG_X86_EXTENDED_PLATFORM=y
  # CONFIG_X86_VSMP is not set
+# CONFIG_X86_UV is not set
  CONFIG_SCHED_OMIT_FRAME_POINTER=y
  # CONFIG_PARAVIRT_GUEST is not set
  # CONFIG_MEMTEST is not set
@@ -230,10 +242,10 @@ CONFIG_SCHED_OMIT_FRAME_POINTER=y
  # CONFIG_MCORE2 is not set
  CONFIG_GENERIC_CPU=y
  CONFIG_X86_CPU=y
-CONFIG_X86_L1_CACHE_BYTES=128
-CONFIG_X86_INTERNODE_CACHE_BYTES=128
+CONFIG_X86_L1_CACHE_BYTES=64
+CONFIG_X86_INTERNODE_CACHE_BYTES=64
  CONFIG_X86_CMPXCHG=y
-CONFIG_X86_L1_CACHE_SHIFT=7
+CONFIG_X86_L1_CACHE_SHIFT=6
  CONFIG_X86_WP_WORKS_OK=y
  CONFIG_X86_TSC=y
  CONFIG_X86_CMPXCHG64=y
@@ -242,7 +254,7 @@ CONFIG_X86_MINIMUM_CPU_FAMILY=64
  CONFIG_X86_DEBUGCTLMSR=y
  CONFIG_CPU_SUP_INTEL=y
  CONFIG_CPU_SUP_AMD=y
-CONFIG_CPU_SUP_CENTAUR_64=y
+CONFIG_CPU_SUP_CENTAUR=y
  CONFIG_X86_DS=y
  CONFIG_X86_PTRACE_BTS=y
  CONFIG_HPET_TIMER=y
@@ -269,6 +281,7 @@ CONFIG_X86_REROUTE_FOR_BROKEN_BOOT_IRQS=y
  CONFIG_X86_MCE=y
  CONFIG_X86_MCE_INTEL=y
  CONFIG_X86_MCE_AMD=y
+CONFIG_X86_MCE_THRESHOLD=y
  # CONFIG_I8K is not set
  CONFIG_MICROCODE=y
  CONFIG_MICROCODE_INTEL=y
@@ -276,6 +289,7 @@ CONFIG_MICROCODE_AMD=y
  CONFIG_MICROCODE_OLD_INTERFACE=y
  CONFIG_X86_MSR=y
  CONFIG_X86_CPUID=y
+# CONFIG_X86_CPU_DEBUG is not set
  CONFIG_ARCH_PHYS_ADDR_T_64BIT=y
  CONFIG_DIRECT_GBPAGES=y
  CONFIG_NUMA=y
@@ -309,6 +323,8 @@ CONFIG_ZONE_DMA_FLAG=1
  CONFIG_BOUNCE=y
  CONFIG_VIRT_TO_BUS=y
  CONFIG_UNEVICTABLE_LRU=y
+CONFIG_HAVE_MLOCK=y
+CONFIG_HAVE_MLOCKED_PAGE_BIT=y
  CONFIG_X86_CHECK_BIOS_CORRUPTION=y
  CONFIG_X86_BOOTPARAM_MEMORY_CORRUPTION_CHECK=y
  CONFIG_X86_RESERVE_LOW_64K=y
@@ -317,6 +333,7 @@ CONFIG_MTRR=y
  CONFIG_X86_PAT=y
  CONFIG_EFI=y
  CONFIG_SECCOMP=y
+# CONFIG_CC_STACKPROTECTOR is not set
  # CONFIG_HZ_100 is not set
  # CONFIG_HZ_250 is not set
  # CONFIG_HZ_300 is not set
@@ -325,9 +342,10 @@ CONFIG_HZ=1000
  CONFIG_SCHED_HRTICK=y
  CONFIG_KEXEC=y
  CONFIG_CRASH_DUMP=y
+# CONFIG_KEXEC_JUMP is not set
  CONFIG_PHYSICAL_START=0x1000000
-# CONFIG_RELOCATABLE is not set
-CONFIG_PHYSICAL_ALIGN=0x200000
+CONFIG_RELOCATABLE=y
+CONFIG_PHYSICAL_ALIGN=0x1000000
  CONFIG_HOTPLUG_CPU=y
  # CONFIG_COMPAT_VDSO is not set
  # CONFIG_CMDLINE_BOOL is not set
@@ -370,7 +388,6 @@ CONFIG_ACPI_NUMA=y
  CONFIG_ACPI_BLACKLIST_YEAR=0
  # CONFIG_ACPI_DEBUG is not set
  # CONFIG_ACPI_PCI_SLOT is not set
-CONFIG_ACPI_SYSTEM=y
  CONFIG_X86_PM_TIMER=y
  CONFIG_ACPI_CONTAINER=y
  # CONFIG_ACPI_SBS is not set
@@ -436,6 +453,7 @@ CONFIG_PCI_MSI=y
  # CONFIG_PCI_DEBUG is not set
  # CONFIG_PCI_STUB is not set
  CONFIG_HT_IRQ=y
+# CONFIG_PCI_IOV is not set
  CONFIG_ISA_DMA_API=y
  CONFIG_K8_NB=y
  CONFIG_PCCARD=y
@@ -481,7 +499,6 @@ CONFIG_NET=y
  #
  # Networking options
  #
-CONFIG_COMPAT_NET_DEV_OPS=y
  CONFIG_PACKET=y
  CONFIG_PACKET_MMAP=y
  CONFIG_UNIX=y
@@ -639,6 +656,7 @@ CONFIG_LLC=y
  # CONFIG_LAPB is not set
  # CONFIG_ECONET is not set
  # CONFIG_WAN_ROUTER is not set
+# CONFIG_PHONET is not set
  CONFIG_NET_SCHED=y
  
  #
@@ -696,6 +714,7 @@ CONFIG_NET_SCH_FIFO=y
  #
  # CONFIG_NET_PKTGEN is not set
  # CONFIG_NET_TCPPROBE is not set
+# CONFIG_NET_DROP_MONITOR is not set
  CONFIG_HAMRADIO=y
  
  #
@@ -706,12 +725,10 @@ CONFIG_HAMRADIO=y
  # CONFIG_IRDA is not set
  # CONFIG_BT is not set
  # CONFIG_AF_RXRPC is not set
-# CONFIG_PHONET is not set
  CONFIG_FIB_RULES=y
  CONFIG_WIRELESS=y
  CONFIG_CFG80211=y
  # CONFIG_CFG80211_REG_DEBUG is not set
-CONFIG_NL80211=y
  CONFIG_WIRELESS_OLD_REGULATORY=y
  CONFIG_WIRELESS_EXT=y
  CONFIG_WIRELESS_EXT_SYSFS=y
@@ -788,9 +805,8 @@ CONFIG_MISC_DEVICES=y
  # CONFIG_TIFM_CORE is not set
  # CONFIG_ICS932S401 is not set
  # CONFIG_ENCLOSURE_SERVICES is not set
-# CONFIG_SGI_XP is not set
  # CONFIG_HP_ILO is not set
-# CONFIG_SGI_GRU is not set
+# CONFIG_ISL29003 is not set
  # CONFIG_C2PORT is not set
  
  #
@@ -844,6 +860,7 @@ CONFIG_SCSI_SPI_ATTRS=y
  # CONFIG_SCSI_LOWLEVEL is not set
  # CONFIG_SCSI_LOWLEVEL_PCMCIA is not set
  # CONFIG_SCSI_DH is not set
+# CONFIG_SCSI_OSD_INITIATOR is not set
  CONFIG_ATA=y
  # CONFIG_ATA_NONSTANDARD is not set
  CONFIG_ATA_ACPI=y
@@ -940,6 +957,7 @@ CONFIG_DM_ZERO=y
  CONFIG_MACINTOSH_DRIVERS=y
  CONFIG_MAC_EMUMOUSEBTN=y
  CONFIG_NETDEVICES=y
+CONFIG_COMPAT_NET_DEV_OPS=y
  # CONFIG_IFB is not set
  # CONFIG_DUMMY is not set
  # CONFIG_BONDING is not set
@@ -977,6 +995,8 @@ CONFIG_MII=y
  CONFIG_NET_VENDOR_3COM=y
  # CONFIG_VORTEX is not set
  # CONFIG_TYPHOON is not set
+# CONFIG_ETHOC is not set
+# CONFIG_DNET is not set
  CONFIG_NET_TULIP=y
  # CONFIG_DE2104X is not set
  # CONFIG_TULIP is not set
@@ -1026,6 +1046,7 @@ CONFIG_E1000=y
  # CONFIG_E1000E is not set
  # CONFIG_IP1000 is not set
  # CONFIG_IGB is not set
+# CONFIG_IGBVF is not set
  # CONFIG_NS83820 is not set
  # CONFIG_HAMACHI is not set
  # CONFIG_YELLOWFIN is not set
@@ -1040,6 +1061,7 @@ CONFIG_TIGON3=y
  # CONFIG_QLA3XXX is not set
  # CONFIG_ATL1 is not set
  # CONFIG_ATL1E is not set
+# CONFIG_ATL1C is not set
  # CONFIG_JME is not set
  CONFIG_NETDEV_10000=y
  # CONFIG_CHELSIO_T1 is not set
@@ -1049,6 +1071,7 @@ CONFIG_CHELSIO_T3_DEPENDS=y
  # CONFIG_IXGBE is not set
  # CONFIG_IXGB is not set
  # CONFIG_S2IO is not set
+# CONFIG_VXGE is not set
  # CONFIG_MYRI10GE is not set
  # CONFIG_NETXEN_NIC is not set
  # CONFIG_NIU is not set
@@ -1058,6 +1081,7 @@ CONFIG_CHELSIO_T3_DEPENDS=y
  # CONFIG_BNX2X is not set
  # CONFIG_QLGE is not set
  # CONFIG_SFC is not set
+# CONFIG_BE2NET is not set
  CONFIG_TR=y
  # CONFIG_IBMOL is not set
  # CONFIG_3C359 is not set
@@ -1072,8 +1096,8 @@ CONFIG_WLAN_80211=y
  # CONFIG_LIBERTAS is not set
  # CONFIG_LIBERTAS_THINFIRM is not set
  # CONFIG_AIRO is not set
-# CONFIG_HERMES is not set
  # CONFIG_ATMEL is not set
+# CONFIG_AT76C50X_USB is not set
  # CONFIG_AIRO_CS is not set
  # CONFIG_PCMCIA_WL3501 is not set
  # CONFIG_PRISM54 is not set
@@ -1083,21 +1107,21 @@ CONFIG_WLAN_80211=y
  # CONFIG_RTL8187 is not set
  # CONFIG_ADM8211 is not set
  # CONFIG_MAC80211_HWSIM is not set
+# CONFIG_MWL8K is not set
  # CONFIG_P54_COMMON is not set
  CONFIG_ATH5K=y
  # CONFIG_ATH5K_DEBUG is not set
  # CONFIG_ATH9K is not set
+# CONFIG_AR9170_USB is not set
  # CONFIG_IPW2100 is not set
  # CONFIG_IPW2200 is not set
-# CONFIG_IWLCORE is not set
-# CONFIG_IWLWIFI_LEDS is not set
-# CONFIG_IWLAGN is not set
-# CONFIG_IWL3945 is not set
+# CONFIG_IWLWIFI is not set
  # CONFIG_HOSTAP is not set
  # CONFIG_B43 is not set
  # CONFIG_B43LEGACY is not set
  # CONFIG_ZD1211RW is not set
  # CONFIG_RT2X00 is not set
+# CONFIG_HERMES is not set
  
  #
  # Enable WiMAX (Networking options) to see the WiMAX drivers
@@ -1208,6 +1232,8 @@ CONFIG_INPUT_TABLET=y
  # CONFIG_TABLET_USB_KBTAB is not set
  # CONFIG_TABLET_USB_WACOM is not set
  CONFIG_INPUT_TOUCHSCREEN=y
+# CONFIG_TOUCHSCREEN_AD7879_I2C is not set
+# CONFIG_TOUCHSCREEN_AD7879 is not set
  # CONFIG_TOUCHSCREEN_FUJITSU is not set
  # CONFIG_TOUCHSCREEN_GUNZE is not set
  # CONFIG_TOUCHSCREEN_ELO is not set
@@ -1301,6 +1327,7 @@ CONFIG_UNIX98_PTYS=y
  # CONFIG_LEGACY_PTYS is not set
  # CONFIG_IPMI_HANDLER is not set
  CONFIG_HW_RANDOM=y
+# CONFIG_HW_RANDOM_TIMERIOMEM is not set
  # CONFIG_HW_RANDOM_INTEL is not set
  # CONFIG_HW_RANDOM_AMD is not set
  CONFIG_NVRAM=y
@@ -1382,7 +1409,6 @@ CONFIG_I2C_I801=y
  # CONFIG_SENSORS_PCF8574 is not set
  # CONFIG_PCF8575 is not set
  # CONFIG_SENSORS_PCA9539 is not set
-# CONFIG_SENSORS_PCF8591 is not set
  # CONFIG_SENSORS_MAX6875 is not set
  # CONFIG_SENSORS_TSL2550 is not set
  # CONFIG_I2C_DEBUG_CORE is not set
@@ -1416,6 +1442,7 @@ CONFIG_HWMON=y
  # CONFIG_SENSORS_ADT7475 is not set
  # CONFIG_SENSORS_K8TEMP is not set
  # CONFIG_SENSORS_ASB100 is not set
+# CONFIG_SENSORS_ATK0110 is not set
  # CONFIG_SENSORS_ATXP1 is not set
  # CONFIG_SENSORS_DS1621 is not set
  # CONFIG_SENSORS_I5K_AMB is not set
@@ -1425,6 +1452,7 @@ CONFIG_HWMON=y
  # CONFIG_SENSORS_FSCHER is not set
  # CONFIG_SENSORS_FSCPOS is not set
  # CONFIG_SENSORS_FSCHMD is not set
+# CONFIG_SENSORS_G760A is not set
  # CONFIG_SENSORS_GL518SM is not set
  # CONFIG_SENSORS_GL520SM is not set
  # CONFIG_SENSORS_CORETEMP is not set
@@ -1440,11 +1468,14 @@ CONFIG_HWMON=y
  # CONFIG_SENSORS_LM90 is not set
  # CONFIG_SENSORS_LM92 is not set
  # CONFIG_SENSORS_LM93 is not set
+# CONFIG_SENSORS_LTC4215 is not set
  # CONFIG_SENSORS_LTC4245 is not set
+# CONFIG_SENSORS_LM95241 is not set
  # CONFIG_SENSORS_MAX1619 is not set
  # CONFIG_SENSORS_MAX6650 is not set
  # CONFIG_SENSORS_PC87360 is not set
  # CONFIG_SENSORS_PC87427 is not set
+# CONFIG_SENSORS_PCF8591 is not set
  # CONFIG_SENSORS_SIS5595 is not set
  # CONFIG_SENSORS_DME1737 is not set
  # CONFIG_SENSORS_SMSC47M1 is not set
@@ -1635,6 +1666,7 @@ CONFIG_FB_EFI=y
  # CONFIG_FB_VIRTUAL is not set
  # CONFIG_FB_METRONOME is not set
  # CONFIG_FB_MB862XX is not set
+# CONFIG_FB_BROADSHEET is not set
  CONFIG_BACKLIGHT_LCD_SUPPORT=y
  # CONFIG_LCD_CLASS_DEVICE is not set
  CONFIG_BACKLIGHT_CLASS_DEVICE=y
@@ -1720,6 +1752,8 @@ CONFIG_SND_PCI=y
  # CONFIG_SND_INDIGO is not set
  # CONFIG_SND_INDIGOIO is not set
  # CONFIG_SND_INDIGODJ is not set
+# CONFIG_SND_INDIGOIOX is not set
+# CONFIG_SND_INDIGODJX is not set
  # CONFIG_SND_EMU10K1 is not set
  # CONFIG_SND_EMU10K1X is not set
  # CONFIG_SND_ENS1370 is not set
@@ -1792,15 +1826,17 @@ CONFIG_USB_HIDDEV=y
  #
  # Special HID drivers
  #
-CONFIG_HID_COMPAT=y
  CONFIG_HID_A4TECH=y
  CONFIG_HID_APPLE=y
  CONFIG_HID_BELKIN=y
  CONFIG_HID_CHERRY=y
  CONFIG_HID_CHICONY=y
  CONFIG_HID_CYPRESS=y
+# CONFIG_DRAGONRISE_FF is not set
  CONFIG_HID_EZKEY=y
+CONFIG_HID_KYE=y
  CONFIG_HID_GYRATION=y
+CONFIG_HID_KENSINGTON=y
  CONFIG_HID_LOGITECH=y
  CONFIG_LOGITECH_FF=y
  # CONFIG_LOGIRUMBLEPAD2_FF is not set
@@ -1866,11 +1902,11 @@ CONFIG_USB_PRINTER=y
  # CONFIG_USB_TMC is not set
  
  #
-# NOTE: USB_STORAGE depends on SCSI but BLK_DEV_SD may also be needed;
+# NOTE: USB_STORAGE depends on SCSI but BLK_DEV_SD may
  #
  
  #
-# see USB_STORAGE Help for more information
+# also be needed; see USB_STORAGE Help for more info
  #
  CONFIG_USB_STORAGE=y
  # CONFIG_USB_STORAGE_DEBUG is not set
@@ -1912,7 +1948,6 @@ CONFIG_USB_LIBUSUAL=y
  # CONFIG_USB_LED is not set
  # CONFIG_USB_CYPRESS_CY7C63 is not set
  # CONFIG_USB_CYTHERM is not set
-# CONFIG_USB_PHIDGET is not set
  # CONFIG_USB_IDMOUSE is not set
  # CONFIG_USB_FTDI_ELAN is not set
  # CONFIG_USB_APPLEDISPLAY is not set
@@ -1928,6 +1963,7 @@ CONFIG_USB_LIBUSUAL=y
  #
  # OTG and related infrastructure
  #
+# CONFIG_NOP_USB_XCEIV is not set
  # CONFIG_UWB is not set
  # CONFIG_MMC is not set
  # CONFIG_MEMSTICK is not set
@@ -1939,8 +1975,10 @@ CONFIG_LEDS_CLASS=y
  #
  # CONFIG_LEDS_ALIX2 is not set
  # CONFIG_LEDS_PCA9532 is not set
+# CONFIG_LEDS_LP5521 is not set
  # CONFIG_LEDS_CLEVO_MAIL is not set
  # CONFIG_LEDS_PCA955X is not set
+# CONFIG_LEDS_BD2802 is not set
  
  #
  # LED Triggers
@@ -1950,6 +1988,10 @@ CONFIG_LEDS_TRIGGERS=y
  # CONFIG_LEDS_TRIGGER_HEARTBEAT is not set
  # CONFIG_LEDS_TRIGGER_BACKLIGHT is not set
  # CONFIG_LEDS_TRIGGER_DEFAULT_ON is not set
+
+#
+# iptables trigger is under Netfilter config (LED target)
+#
  # CONFIG_ACCESSIBILITY is not set
  # CONFIG_INFINIBAND is not set
  CONFIG_EDAC=y
@@ -2018,6 +2060,7 @@ CONFIG_DMADEVICES=y
  # DMA Devices
  #
  # CONFIG_INTEL_IOATDMA is not set
+# CONFIG_AUXDISPLAY is not set
  # CONFIG_UIO is not set
  # CONFIG_STAGING is not set
  CONFIG_X86_PLATFORM_DEVICES=y
@@ -2051,6 +2094,7 @@ CONFIG_DMIID=y
  #
  # CONFIG_EXT2_FS is not set
  CONFIG_EXT3_FS=y
+# CONFIG_EXT3_DEFAULTS_TO_ORDERED is not set
  CONFIG_EXT3_FS_XATTR=y
  CONFIG_EXT3_FS_POSIX_ACL=y
  CONFIG_EXT3_FS_SECURITY=y
@@ -2081,6 +2125,11 @@ CONFIG_AUTOFS4_FS=y
  # CONFIG_FUSE_FS is not set
  CONFIG_GENERIC_ACL=y
  
+#
+# Caches
+#
+# CONFIG_FSCACHE is not set
+
  #
  # CD-ROM/DVD Filesystems
  #
@@ -2132,6 +2181,7 @@ CONFIG_MISC_FILESYSTEMS=y
  # CONFIG_ROMFS_FS is not set
  # CONFIG_SYSV_FS is not set
  # CONFIG_UFS_FS is not set
+# CONFIG_NILFS2_FS is not set
  CONFIG_NETWORK_FILESYSTEMS=y
  CONFIG_NFS_FS=y
  CONFIG_NFS_V3=y
@@ -2145,7 +2195,6 @@ CONFIG_NFS_ACL_SUPPORT=y
  CONFIG_NFS_COMMON=y
  CONFIG_SUNRPC=y
  CONFIG_SUNRPC_GSS=y
-# CONFIG_SUNRPC_REGISTER_V4 is not set
  CONFIG_RPCSEC_GSS_KRB5=y
  # CONFIG_RPCSEC_GSS_SPKM3 is not set
  # CONFIG_SMB_FS is not set
@@ -2232,6 +2281,7 @@ CONFIG_DEBUG_FS=y
  CONFIG_DEBUG_KERNEL=y
  # CONFIG_DEBUG_SHIRQ is not set
  # CONFIG_DETECT_SOFTLOCKUP is not set
+# CONFIG_DETECT_HUNG_TASK is not set
  # CONFIG_SCHED_DEBUG is not set
  CONFIG_SCHEDSTATS=y
  CONFIG_TIMER_STATS=y
@@ -2247,6 +2297,7 @@ CONFIG_TIMER_STATS=y
  # CONFIG_LOCK_STAT is not set
  # CONFIG_DEBUG_SPINLOCK_SLEEP is not set
  # CONFIG_DEBUG_LOCKING_API_SELFTESTS is not set
+CONFIG_STACKTRACE=y
  # CONFIG_DEBUG_KOBJECT is not set
  CONFIG_DEBUG_BUGVERBOSE=y
  # CONFIG_DEBUG_INFO is not set
@@ -2269,13 +2320,19 @@ CONFIG_FRAME_POINTER=y
  # CONFIG_FAULT_INJECTION is not set
  # CONFIG_LATENCYTOP is not set
  CONFIG_SYSCTL_SYSCALL_CHECK=y
+# CONFIG_DEBUG_PAGEALLOC is not set
  CONFIG_USER_STACKTRACE_SUPPORT=y
+CONFIG_NOP_TRACER=y
  CONFIG_HAVE_FUNCTION_TRACER=y
  CONFIG_HAVE_FUNCTION_GRAPH_TRACER=y
  CONFIG_HAVE_FUNCTION_TRACE_MCOUNT_TEST=y
  CONFIG_HAVE_DYNAMIC_FTRACE=y
  CONFIG_HAVE_FTRACE_MCOUNT_RECORD=y
  CONFIG_HAVE_HW_BRANCH_TRACER=y
+CONFIG_HAVE_FTRACE_SYSCALLS=y
+CONFIG_RING_BUFFER=y
+CONFIG_TRACING=y
+CONFIG_TRACING_SUPPORT=y
  
  #
  # Tracers
@@ -2285,13 +2342,21 @@ CONFIG_HAVE_HW_BRANCH_TRACER=y
  # CONFIG_SYSPROF_TRACER is not set
  # CONFIG_SCHED_TRACER is not set
  # CONFIG_CONTEXT_SWITCH_TRACER is not set
+# CONFIG_EVENT_TRACER is not set
+# CONFIG_FTRACE_SYSCALLS is not set
  # CONFIG_BOOT_TRACER is not set
  # CONFIG_TRACE_BRANCH_PROFILING is not set
  # CONFIG_POWER_TRACER is not set
  # CONFIG_STACK_TRACER is not set
  # CONFIG_HW_BRANCH_TRACER is not set
+# CONFIG_KMEMTRACE is not set
+# CONFIG_WORKQUEUE_TRACER is not set
+CONFIG_BLK_DEV_IO_TRACE=y
+# CONFIG_FTRACE_STARTUP_TEST is not set
+# CONFIG_MMIOTRACE is not set
  CONFIG_PROVIDE_OHCI1394_DMA_INIT=y
-# CONFIG_DYNAMIC_PRINTK_DEBUG is not set
+# CONFIG_DYNAMIC_DEBUG is not set
+# CONFIG_DMA_API_DEBUG is not set
  # CONFIG_SAMPLES is not set
  CONFIG_HAVE_ARCH_KGDB=y
  # CONFIG_KGDB is not set
@@ -2301,14 +2366,13 @@ CONFIG_EARLY_PRINTK=y
  CONFIG_EARLY_PRINTK_DBGP=y
  CONFIG_DEBUG_STACKOVERFLOW=y
  CONFIG_DEBUG_STACK_USAGE=y
-# CONFIG_DEBUG_PAGEALLOC is not set
  # CONFIG_DEBUG_PER_CPU_MAPS is not set
  # CONFIG_X86_PTDUMP is not set
  CONFIG_DEBUG_RODATA=y
  # CONFIG_DEBUG_RODATA_TEST is not set
  CONFIG_DEBUG_NX_TEST=m
  # CONFIG_IOMMU_DEBUG is not set
-# CONFIG_MMIOTRACE is not set
+CONFIG_HAVE_MMIOTRACE_SUPPORT=y
  CONFIG_IO_DELAY_TYPE_0X80=0
  CONFIG_IO_DELAY_TYPE_0XED=1
  CONFIG_IO_DELAY_TYPE_UDELAY=2
@@ -2344,6 +2408,8 @@ CONFIG_SECURITY_SELINUX_AVC_STATS=y
  CONFIG_SECURITY_SELINUX_CHECKREQPROT_VALUE=1
  # CONFIG_SECURITY_SELINUX_POLICYDB_VERSION_MAX is not set
  # CONFIG_SECURITY_SMACK is not set
+# CONFIG_SECURITY_TOMOYO is not set
+# CONFIG_IMA is not set
  CONFIG_CRYPTO=y
  
  #
@@ -2359,10 +2425,12 @@ CONFIG_CRYPTO_BLKCIPHER2=y
  CONFIG_CRYPTO_HASH=y
  CONFIG_CRYPTO_HASH2=y
  CONFIG_CRYPTO_RNG2=y
+CONFIG_CRYPTO_PCOMP=y
  CONFIG_CRYPTO_MANAGER=y
  CONFIG_CRYPTO_MANAGER2=y
  # CONFIG_CRYPTO_GF128MUL is not set
  # CONFIG_CRYPTO_NULL is not set
+CONFIG_CRYPTO_WORKQUEUE=y
  # CONFIG_CRYPTO_CRYPTD is not set
  CONFIG_CRYPTO_AUTHENC=y
  # CONFIG_CRYPTO_TEST is not set
@@ -2414,6 +2482,7 @@ CONFIG_CRYPTO_SHA1=y
  #
  CONFIG_CRYPTO_AES=y
  # CONFIG_CRYPTO_AES_X86_64 is not set
+# CONFIG_CRYPTO_AES_NI_INTEL is not set
  # CONFIG_CRYPTO_ANUBIS is not set
  CONFIG_CRYPTO_ARC4=y
  # CONFIG_CRYPTO_BLOWFISH is not set
@@ -2435,6 +2504,7 @@ CONFIG_CRYPTO_DES=y
  # Compression
  #
  # CONFIG_CRYPTO_DEFLATE is not set
+# CONFIG_CRYPTO_ZLIB is not set
  # CONFIG_CRYPTO_LZO is not set
  
  #
@@ -2444,10 +2514,12 @@ CONFIG_CRYPTO_DES=y
  CONFIG_CRYPTO_HW=y
  # CONFIG_CRYPTO_DEV_HIFN_795X is not set
  CONFIG_HAVE_KVM=y
+CONFIG_HAVE_KVM_IRQCHIP=y
  CONFIG_VIRTUALIZATION=y
  # CONFIG_KVM is not set
  # CONFIG_VIRTIO_PCI is not set
  # CONFIG_VIRTIO_BALLOON is not set
+CONFIG_BINARY_PRINTF=y
  
  #
  # Library routines
@@ -2464,7 +2536,10 @@ CONFIG_CRC32=y
  # CONFIG_CRC7 is not set
  # CONFIG_LIBCRC32C is not set
  CONFIG_ZLIB_INFLATE=y
-CONFIG_PLIST=y
+CONFIG_DECOMPRESS_GZIP=y
+CONFIG_DECOMPRESS_BZIP2=y
+CONFIG_DECOMPRESS_LZMA=y
  CONFIG_HAS_IOMEM=y
  CONFIG_HAS_IOPORT=y
  CONFIG_HAS_DMA=y
+CONFIG_NLATTR=y
diff --git a/arch/x86/ia32/ia32entry.S b/arch/x86/ia32/ia32entry.S

index a505202086e8741a916e696ccfb664d3f3e0f8ef..e590261ba059871def263d43e7c174aee25d5753 100644 (file)
--- a/arch/x86/ia32/ia32entry.S
+++ b/arch/x86/ia32/ia32entry.S
@@ -825,9 +825,11 @@ ia32_sys_call_table:
         .quad compat_sys_signalfd4
         .quad sys_eventfd2
         .quad sys_epoll_create1
-       .quad sys_dup3                  /* 330 */
+       .quad sys_dup3                          /* 330 */
         .quad sys_pipe2
         .quad sys_inotify_init1
         .quad compat_sys_preadv
         .quad compat_sys_pwritev
+       .quad compat_sys_rt_tgsigqueueinfo      /* 335 */
+       .quad sys_perf_counter_open
  ia32_syscall_end:
diff --git a/arch/x86/include/asm/alternative.h b/arch/x86/include/asm/alternative.h

index f6aa18eadf71717d9e86c53ac3719776fa035969..1a37bcdc8606c0ccefaa73026c2188a8235b1a24 100644 (file)
--- a/arch/x86/include/asm/alternative.h
+++ b/arch/x86/include/asm/alternative.h
@@ -3,6 +3,7 @@
  
  #include <linux/types.h>
  #include <linux/stddef.h>
+#include <linux/stringify.h>
  #include <asm/asm.h>
  
  /*
@@ -74,6 +75,22 @@ static inline void alternatives_smp_switch(int smp) {}
  
  const unsigned char *const *find_nop_table(void);
  
+/* alternative assembly primitive: */
+#define ALTERNATIVE(oldinstr, newinstr, feature)                       \
+                                                                       \
+      "661:\n\t" oldinstr "\n662:\n"                                   \
+      ".section .altinstructions,\"a\"\n"                              \
+      _ASM_ALIGN "\n"                                                  \
+      _ASM_PTR "661b\n"                                /* label           */   \
+      _ASM_PTR "663f\n"                                /* new instruction */   \
+      "         .byte " __stringify(feature) "\n"      /* feature bit     */   \
+      "         .byte 662b-661b\n"                     /* sourcelen       */   \
+      "         .byte 664f-663f\n"                     /* replacementlen  */   \
+      ".previous\n"                                                    \
+      ".section .altinstr_replacement, \"ax\"\n"                       \
+      "663:\n\t" newinstr "\n664:\n"           /* replacement     */   \
+      ".previous"
+
  /*
   * Alternative instructions for different CPU types or capabilities.
   *
@@ -87,18 +104,7 @@ const unsigned char *const *find_nop_table(void);
   * without volatile and memory clobber.
   */
  #define alternative(oldinstr, newinstr, feature)                       \
-       asm volatile ("661:\n\t" oldinstr "\n662:\n"                    \
-                     ".section .altinstructions,\"a\"\n"               \
-                     _ASM_ALIGN "\n"                                   \
-                     _ASM_PTR "661b\n"         /* label */             \
-                     _ASM_PTR "663f\n"         /* new instruction */   \
-                     "  .byte %c0\n"           /* feature bit */       \
-                     "  .byte 662b-661b\n"     /* sourcelen */         \
-                     "  .byte 664f-663f\n"     /* replacementlen */    \
-                     ".previous\n"                                     \
-                     ".section .altinstr_replacement,\"ax\"\n"         \
-                     "663:\n\t" newinstr "\n664:\n"  /* replacement */ \
-                     ".previous" :: "i" (feature) : "memory")
+       asm volatile (ALTERNATIVE(oldinstr, newinstr, feature) : : : "memory")
  
  /*
   * Alternative inline assembly with input.
@@ -109,35 +115,16 @@ const unsigned char *const *find_nop_table(void);
   * Best is to use constraints that are fixed size (like (%1) ... "r")
   * If you use variable sized constraints like "m" or "g" in the
   * replacement make sure to pad to the worst case length.
+ * Leaving an unused argument 0 to keep API compatibility.
   */
  #define alternative_input(oldinstr, newinstr, feature, input...)       \
-       asm volatile ("661:\n\t" oldinstr "\n662:\n"                    \
-                     ".section .altinstructions,\"a\"\n"               \
-                     _ASM_ALIGN "\n"                                   \
-                     _ASM_PTR "661b\n"         /* label */             \
-                     _ASM_PTR "663f\n"         /* new instruction */   \
-                     "  .byte %c0\n"           /* feature bit */       \
-                     "  .byte 662b-661b\n"     /* sourcelen */         \
-                     "  .byte 664f-663f\n"     /* replacementlen */    \
-                     ".previous\n"                                     \
-                     ".section .altinstr_replacement,\"ax\"\n"         \
-                     "663:\n\t" newinstr "\n664:\n"  /* replacement */ \
-                     ".previous" :: "i" (feature), ##input)
+       asm volatile (ALTERNATIVE(oldinstr, newinstr, feature)          \
+               : : "i" (0), ## input)
  
  /* Like alternative_input, but with a single output argument */
  #define alternative_io(oldinstr, newinstr, feature, output, input...)  \
-       asm volatile ("661:\n\t" oldinstr "\n662:\n"                    \
-                     ".section .altinstructions,\"a\"\n"               \
-                     _ASM_ALIGN "\n"                                   \
-                     _ASM_PTR "661b\n"         /* label */             \
-                     _ASM_PTR "663f\n"         /* new instruction */   \
-                     "  .byte %c[feat]\n"      /* feature bit */       \
-                     "  .byte 662b-661b\n"     /* sourcelen */         \
-                     "  .byte 664f-663f\n"     /* replacementlen */    \
-                     ".previous\n"                                     \
-                     ".section .altinstr_replacement,\"ax\"\n"         \
-                     "663:\n\t" newinstr "\n664:\n"  /* replacement */ \
-                     ".previous" : output : [feat] "i" (feature), ##input)
+       asm volatile (ALTERNATIVE(oldinstr, newinstr, feature)          \
+               : output : "i" (0), ## input)
  
  /*
   * use this macro(s) if you need more than one output parameter
diff --git a/arch/x86/include/asm/amd_iommu.h b/arch/x86/include/asm/amd_iommu.h

index f712344329bc7a6929a38f8e0e968bd1166e4c9e..262e02820049aa8cc92dca18e83afdcbb829a71c 100644 (file)
--- a/arch/x86/include/asm/amd_iommu.h
+++ b/arch/x86/include/asm/amd_iommu.h
@@ -27,6 +27,8 @@ extern int amd_iommu_init(void);
  extern int amd_iommu_init_dma_ops(void);
  extern void amd_iommu_detect(void);
  extern irqreturn_t amd_iommu_int_handler(int irq, void *data);
+extern void amd_iommu_flush_all_domains(void);
+extern void amd_iommu_flush_all_devices(void);
  #else
  static inline int amd_iommu_init(void) { return -ENODEV; }
  static inline void amd_iommu_detect(void) { }
diff --git a/arch/x86/include/asm/amd_iommu_types.h b/arch/x86/include/asm/amd_iommu_types.h

index 95c8cd9d22b54401fc8de9cf734512126b34a05f..0c878caaa0a23cda2a83a0ab9bb9ef3620bf8fad 100644 (file)
--- a/arch/x86/include/asm/amd_iommu_types.h
+++ b/arch/x86/include/asm/amd_iommu_types.h
@@ -194,6 +194,27 @@
  #define PD_DMA_OPS_MASK                (1UL << 0) /* domain used for dma_ops */
  #define PD_DEFAULT_MASK                (1UL << 1) /* domain is a default dma_ops
                                               domain for an IOMMU */
+extern bool amd_iommu_dump;
+#define DUMP_printk(format, arg...)                                    \
+       do {                                                            \
+               if (amd_iommu_dump)                                             \
+                       printk(KERN_INFO "AMD IOMMU: " format, ## arg); \
+       } while(0);
+
+/*
+ * Make iterating over all IOMMUs easier
+ */
+#define for_each_iommu(iommu) \
+       list_for_each_entry((iommu), &amd_iommu_list, list)
+#define for_each_iommu_safe(iommu, next) \
+       list_for_each_entry_safe((iommu), (next), &amd_iommu_list, list)
+
+#define APERTURE_RANGE_SHIFT   27      /* 128 MB */
+#define APERTURE_RANGE_SIZE    (1ULL << APERTURE_RANGE_SHIFT)
+#define APERTURE_RANGE_PAGES   (APERTURE_RANGE_SIZE >> PAGE_SHIFT)
+#define APERTURE_MAX_RANGES    32      /* allows 4GB of DMA address space */
+#define APERTURE_RANGE_INDEX(a)        ((a) >> APERTURE_RANGE_SHIFT)
+#define APERTURE_PAGE_INDEX(a) (((a) >> 21) & 0x3fULL)
  
  /*
   * This structure contains generic data for  IOMMU protection domains
@@ -209,6 +230,26 @@ struct protection_domain {
         void *priv;             /* private data */
  };
  
+/*
+ * For dynamic growth the aperture size is split into ranges of 128MB of
+ * DMA address space each. This struct represents one such range.
+ */
+struct aperture_range {
+
+       /* address allocation bitmap */
+       unsigned long *bitmap;
+
+       /*
+        * Array of PTE pages for the aperture. In this array we save all the
+        * leaf pages of the domain page table used for the aperture. This way
+        * we don't need to walk the page table to find a specific PTE. We can
+        * just calculate its address in constant time.
+        */
+       u64 *pte_pages[64];
+
+       unsigned long offset;
+};
+
  /*
   * Data container for a dma_ops specific protection domain
   */
@@ -222,18 +263,10 @@ struct dma_ops_domain {
         unsigned long aperture_size;
  
         /* address we start to search for free addresses */
-       unsigned long next_bit;
-
-       /* address allocation bitmap */
-       unsigned long *bitmap;
+       unsigned long next_address;
  
-       /*
-        * Array of PTE pages for the aperture. In this array we save all the
-        * leaf pages of the domain page table used for the aperture. This way
-        * we don't need to walk the page table to find a specific PTE. We can
-        * just calculate its address in constant time.
-        */
-       u64 **pte_pages;
+       /* address space relevant data */
+       struct aperture_range *aperture[APERTURE_MAX_RANGES];
  
         /* This will be set to true when TLB needs to be flushed */
         bool need_flush;
diff --git a/arch/x86/include/asm/apic.h b/arch/x86/include/asm/apic.h

index 42f2f83774224fb7897bf848fc6b84c4bce87380..bb7d47925847c585ee7e9e66f6f2e8652b5ab753 100644 (file)
--- a/arch/x86/include/asm/apic.h
+++ b/arch/x86/include/asm/apic.h
@@ -107,8 +107,7 @@ extern u32 native_safe_apic_wait_icr_idle(void);
  extern void native_apic_icr_write(u32 low, u32 id);
  extern u64 native_apic_icr_read(void);
  
-#define EIM_8BIT_APIC_ID       0
-#define EIM_32BIT_APIC_ID      1
+extern int x2apic_mode;
  
  #ifdef CONFIG_X86_X2APIC
  /*
@@ -166,10 +165,9 @@ static inline u64 native_x2apic_icr_read(void)
         return val;
  }
  
-extern int x2apic, x2apic_phys;
+extern int x2apic_phys;
  extern void check_x2apic(void);
  extern void enable_x2apic(void);
-extern void enable_IR_x2apic(void);
  extern void x2apic_icr_write(u32 low, u32 id);
  static inline int x2apic_enabled(void)
  {
@@ -183,6 +181,8 @@ static inline int x2apic_enabled(void)
                 return 1;
         return 0;
  }
+
+#define x2apic_supported()     (cpu_has_x2apic)
  #else
  static inline void check_x2apic(void)
  {
@@ -190,28 +190,20 @@ static inline void check_x2apic(void)
  static inline void enable_x2apic(void)
  {
  }
-static inline void enable_IR_x2apic(void)
-{
-}
  static inline int x2apic_enabled(void)
  {
         return 0;
  }
  
-#define        x2apic  0
-
+#define        x2apic_preenabled 0
+#define        x2apic_supported()      0
  #endif
  
-extern int get_physical_broadcast(void);
+extern void enable_IR_x2apic(void);
  
-#ifdef CONFIG_X86_X2APIC
-static inline void ack_x2APIC_irq(void)
-{
-       /* Docs say use 0 for future compatibility */
-       native_apic_msr_write(APIC_EOI, 0);
-}
-#endif
+extern int get_physical_broadcast(void);
  
+extern void apic_disable(void);
  extern int lapic_get_maxlvt(void);
  extern void clear_local_APIC(void);
  extern void connect_bsp_APIC(void);
@@ -252,7 +244,7 @@ static inline void lapic_shutdown(void) { }
  #define local_apic_timer_c2_ok         1
  static inline void init_apic_mappings(void) { }
  static inline void disable_local_APIC(void) { }
-
+static inline void apic_disable(void) { }
  #endif /* !CONFIG_X86_LOCAL_APIC */
  
  #ifdef CONFIG_X86_64
@@ -410,7 +402,7 @@ static inline unsigned default_get_apic_id(unsigned long x)
  {
         unsigned int ver = GET_APIC_VERSION(apic_read(APIC_LVR));
  
-       if (APIC_XAPIC(ver))
+       if (APIC_XAPIC(ver) || boot_cpu_has(X86_FEATURE_EXTD_APICID))
                 return (x >> 24) & 0xFF;
         else
                 return (x >> 24) & 0x0F;
@@ -478,6 +470,9 @@ static inline unsigned int read_apic_id(void)
  extern void default_setup_apic_routing(void);
  
  #ifdef CONFIG_X86_32
+
+extern struct apic apic_default;
+
  /*
   * Set up the logical destination ID.
   *
diff --git a/arch/x86/include/asm/apicdef.h b/arch/x86/include/asm/apicdef.h

index bc9514fb3b13f70d75b11f037546eca8e43d07fd..7ddb36ab933b97b7757c0df30732d31d40404c19 100644 (file)
--- a/arch/x86/include/asm/apicdef.h
+++ b/arch/x86/include/asm/apicdef.h
@@ -22,6 +22,7 @@
  #  define      APIC_INTEGRATED(x)      (1)
  #endif
  #define                APIC_XAPIC(x)           ((x) >= 0x14)
+#define                APIC_EXT_SPACE(x)       ((x) & 0x80000000)
  #define        APIC_TASKPRI    0x80
  #define                APIC_TPRI_MASK          0xFFu
  #define        APIC_ARBPRI     0x90
@@ -116,7 +117,9 @@
  #define                APIC_TDR_DIV_32         0x8
  #define                APIC_TDR_DIV_64         0x9
  #define                APIC_TDR_DIV_128        0xA
-#define        APIC_EILVT0     0x500
+#define        APIC_EFEAT      0x400
+#define        APIC_ECTRL      0x410
+#define APIC_EILVTn(n) (0x500 + 0x10 * n)
  #define                APIC_EILVT_NR_AMD_K8    1       /* # of extended interrupts */
  #define                APIC_EILVT_NR_AMD_10H   4
  #define                APIC_EILVT_LVTOFF(x)    (((x) >> 4) & 0xF)
@@ -125,9 +128,6 @@
  #define                APIC_EILVT_MSG_NMI      0x4
  #define                APIC_EILVT_MSG_EXT      0x7
  #define                APIC_EILVT_MASKED       (1 << 16)
-#define        APIC_EILVT1     0x510
-#define        APIC_EILVT2     0x520
-#define        APIC_EILVT3     0x530
  
  #define APIC_BASE (fix_to_virt(FIX_APIC_BASE))
  #define APIC_BASE_MSR  0x800
diff --git a/arch/x86/include/asm/atomic_32.h b/arch/x86/include/asm/atomic_32.h

index 85b46fba4229cc0334e05d5d9c5e66deff3fa69f..aff9f1fcdcd7e2be9d86049fc7f9522bafd4b35d 100644 (file)
--- a/arch/x86/include/asm/atomic_32.h
+++ b/arch/x86/include/asm/atomic_32.h
@@ -247,5 +247,241 @@ static inline int atomic_add_unless(atomic_t *v, int a, int u)
  #define smp_mb__before_atomic_inc()    barrier()
  #define smp_mb__after_atomic_inc()     barrier()
  
+/* An 64bit atomic type */
+
+typedef struct {
+       unsigned long long counter;
+} atomic64_t;
+
+#define ATOMIC64_INIT(val)     { (val) }
+
+/**
+ * atomic64_read - read atomic64 variable
+ * @v: pointer of type atomic64_t
+ *
+ * Atomically reads the value of @v.
+ * Doesn't imply a read memory barrier.
+ */
+#define __atomic64_read(ptr)           ((ptr)->counter)
+
+static inline unsigned long long
+cmpxchg8b(unsigned long long *ptr, unsigned long long old, unsigned long long new)
+{
+       asm volatile(
+
+               LOCK_PREFIX "cmpxchg8b (%[ptr])\n"
+
+                    :          "=A" (old)
+
+                    : [ptr]    "D" (ptr),
+                               "A" (old),
+                               "b" (ll_low(new)),
+                               "c" (ll_high(new))
+
+                    : "memory");
+
+       return old;
+}
+
+static inline unsigned long long
+atomic64_cmpxchg(atomic64_t *ptr, unsigned long long old_val,
+                unsigned long long new_val)
+{
+       return cmpxchg8b(&ptr->counter, old_val, new_val);
+}
+
+/**
+ * atomic64_xchg - xchg atomic64 variable
+ * @ptr:      pointer to type atomic64_t
+ * @new_val:  value to assign
+ * @old_val:  old value that was there
+ *
+ * Atomically xchgs the value of @ptr to @new_val and returns
+ * the old value.
+ */
+
+static inline unsigned long long
+atomic64_xchg(atomic64_t *ptr, unsigned long long new_val)
+{
+       unsigned long long old_val;
+
+       do {
+               old_val = atomic_read(ptr);
+       } while (atomic64_cmpxchg(ptr, old_val, new_val) != old_val);
+
+       return old_val;
+}
+
+/**
+ * atomic64_set - set atomic64 variable
+ * @ptr:      pointer to type atomic64_t
+ * @new_val:  value to assign
+ *
+ * Atomically sets the value of @ptr to @new_val.
+ */
+static inline void atomic64_set(atomic64_t *ptr, unsigned long long new_val)
+{
+       atomic64_xchg(ptr, new_val);
+}
+
+/**
+ * atomic64_read - read atomic64 variable
+ * @ptr:      pointer to type atomic64_t
+ *
+ * Atomically reads the value of @ptr and returns it.
+ */
+static inline unsigned long long atomic64_read(atomic64_t *ptr)
+{
+       unsigned long long curr_val;
+
+       do {
+               curr_val = __atomic64_read(ptr);
+       } while (atomic64_cmpxchg(ptr, curr_val, curr_val) != curr_val);
+
+       return curr_val;
+}
+
+/**
+ * atomic64_add_return - add and return
+ * @delta: integer value to add
+ * @ptr:   pointer to type atomic64_t
+ *
+ * Atomically adds @delta to @ptr and returns @delta + *@ptr
+ */
+static inline unsigned long long
+atomic64_add_return(unsigned long long delta, atomic64_t *ptr)
+{
+       unsigned long long old_val, new_val;
+
+       do {
+               old_val = atomic_read(ptr);
+               new_val = old_val + delta;
+
+       } while (atomic64_cmpxchg(ptr, old_val, new_val) != old_val);
+
+       return new_val;
+}
+
+static inline long atomic64_sub_return(unsigned long long delta, atomic64_t *ptr)
+{
+       return atomic64_add_return(-delta, ptr);
+}
+
+static inline long atomic64_inc_return(atomic64_t *ptr)
+{
+       return atomic64_add_return(1, ptr);
+}
+
+static inline long atomic64_dec_return(atomic64_t *ptr)
+{
+       return atomic64_sub_return(1, ptr);
+}
+
+/**
+ * atomic64_add - add integer to atomic64 variable
+ * @delta: integer value to add
+ * @ptr:   pointer to type atomic64_t
+ *
+ * Atomically adds @delta to @ptr.
+ */
+static inline void atomic64_add(unsigned long long delta, atomic64_t *ptr)
+{
+       atomic64_add_return(delta, ptr);
+}
+
+/**
+ * atomic64_sub - subtract the atomic64 variable
+ * @delta: integer value to subtract
+ * @ptr:   pointer to type atomic64_t
+ *
+ * Atomically subtracts @delta from @ptr.
+ */
+static inline void atomic64_sub(unsigned long long delta, atomic64_t *ptr)
+{
+       atomic64_add(-delta, ptr);
+}
+
+/**
+ * atomic64_sub_and_test - subtract value from variable and test result
+ * @delta: integer value to subtract
+ * @ptr:   pointer to type atomic64_t
+ *
+ * Atomically subtracts @delta from @ptr and returns
+ * true if the result is zero, or false for all
+ * other cases.
+ */
+static inline int
+atomic64_sub_and_test(unsigned long long delta, atomic64_t *ptr)
+{
+       unsigned long long old_val = atomic64_sub_return(delta, ptr);
+
+       return old_val == 0;
+}
+
+/**
+ * atomic64_inc - increment atomic64 variable
+ * @ptr: pointer to type atomic64_t
+ *
+ * Atomically increments @ptr by 1.
+ */
+static inline void atomic64_inc(atomic64_t *ptr)
+{
+       atomic64_add(1, ptr);
+}
+
+/**
+ * atomic64_dec - decrement atomic64 variable
+ * @ptr: pointer to type atomic64_t
+ *
+ * Atomically decrements @ptr by 1.
+ */
+static inline void atomic64_dec(atomic64_t *ptr)
+{
+       atomic64_sub(1, ptr);
+}
+
+/**
+ * atomic64_dec_and_test - decrement and test
+ * @ptr: pointer to type atomic64_t
+ *
+ * Atomically decrements @ptr by 1 and
+ * returns true if the result is 0, or false for all other
+ * cases.
+ */
+static inline int atomic64_dec_and_test(atomic64_t *ptr)
+{
+       return atomic64_sub_and_test(1, ptr);
+}
+
+/**
+ * atomic64_inc_and_test - increment and test
+ * @ptr: pointer to type atomic64_t
+ *
+ * Atomically increments @ptr by 1
+ * and returns true if the result is zero, or false for all
+ * other cases.
+ */
+static inline int atomic64_inc_and_test(atomic64_t *ptr)
+{
+       return atomic64_sub_and_test(-1, ptr);
+}
+
+/**
+ * atomic64_add_negative - add and test if negative
+ * @delta: integer value to add
+ * @ptr:   pointer to type atomic64_t
+ *
+ * Atomically adds @delta to @ptr and returns true
+ * if the result is negative, or false when
+ * result is greater than or equal to zero.
+ */
+static inline int
+atomic64_add_negative(unsigned long long delta, atomic64_t *ptr)
+{
+       long long old_val = atomic64_add_return(delta, ptr);
+
+       return old_val < 0;
+}
+
  #include <asm-generic/atomic.h>
  #endif /* _ASM_X86_ATOMIC_32_H */
diff --git a/arch/x86/include/asm/boot.h b/arch/x86/include/asm/boot.h

index 6ba23dd9fc9216de96b9cb52d1954b9bb7ae2b99..418e632d4a801aa8e86f565afeae59659d6500a0 100644 (file)
--- a/arch/x86/include/asm/boot.h
+++ b/arch/x86/include/asm/boot.h
@@ -8,11 +8,26 @@
  
  #ifdef __KERNEL__
  
+#include <asm/page_types.h>
+
  /* Physical address where kernel should be loaded. */
  #define LOAD_PHYSICAL_ADDR ((CONFIG_PHYSICAL_START \
                                 + (CONFIG_PHYSICAL_ALIGN - 1)) \
                                 & ~(CONFIG_PHYSICAL_ALIGN - 1))
  
+/* Minimum kernel alignment, as a power of two */
+#ifdef CONFIG_x86_64
+#define MIN_KERNEL_ALIGN_LG2   PMD_SHIFT
+#else
+#define MIN_KERNEL_ALIGN_LG2   (PAGE_SHIFT+1)
+#endif
+#define MIN_KERNEL_ALIGN       (_AC(1, UL) << MIN_KERNEL_ALIGN_LG2)
+
+#if (CONFIG_PHYSICAL_ALIGN & (CONFIG_PHYSICAL_ALIGN-1)) || \
+       (CONFIG_PHYSICAL_ALIGN < (_AC(1, UL) << MIN_KERNEL_ALIGN_LG2))
+#error "Invalid value for CONFIG_PHYSICAL_ALIGN"
+#endif
+
  #ifdef CONFIG_KERNEL_BZIP2
  #define BOOT_HEAP_SIZE             0x400000
  #else /* !CONFIG_KERNEL_BZIP2 */
diff --git a/arch/x86/include/asm/bootparam.h b/arch/x86/include/asm/bootparam.h

index 433adaebf9b631f108399a3d27f660ff61a0369b..1724e8de317c59518daebdb305f75b23bb4fcaf6 100644 (file)
--- a/arch/x86/include/asm/bootparam.h
+++ b/arch/x86/include/asm/bootparam.h
@@ -50,7 +50,8 @@ struct setup_header {
         __u32   ramdisk_size;
         __u32   bootsect_kludge;
         __u16   heap_end_ptr;
-       __u16   _pad1;
+       __u8    ext_loader_ver;
+       __u8    ext_loader_type;
         __u32   cmd_line_ptr;
         __u32   initrd_addr_max;
         __u32   kernel_alignment;
diff --git a/arch/x86/include/asm/cpu_debug.h b/arch/x86/include/asm/cpu_debug.h

index 222802029fa6c43b504959401236d559991deba0..d96c1ee3a95cce1ec7fdf48a1fe5269bba58d32f 100644 (file)
--- a/arch/x86/include/asm/cpu_debug.h
+++ b/arch/x86/include/asm/cpu_debug.h
@@ -86,105 +86,7 @@ enum cpu_file_bit {
         CPU_VALUE_BIT,                          /* value                */
  };
  
-#define        CPU_FILE_VALUE                  (1 << CPU_VALUE_BIT)
-
-/*
- * DisplayFamily_DisplayModel  Processor Families/Processor Number Series
- * --------------------------  ------------------------------------------
- * 05_01, 05_02, 05_04         Pentium, Pentium with MMX
- *
- * 06_01                       Pentium Pro
- * 06_03, 06_05                        Pentium II Xeon, Pentium II
- * 06_07, 06_08, 06_0A, 06_0B  Pentium III Xeon, Pentum III
- *
- * 06_09, 060D                 Pentium M
- *
- * 06_0E                       Core Duo, Core Solo
- *
- * 06_0F                       Xeon 3000, 3200, 5100, 5300, 7300 series,
- *                             Core 2 Quad, Core 2 Extreme, Core 2 Duo,
- *                             Pentium dual-core
- * 06_17                       Xeon 5200, 5400 series, Core 2 Quad Q9650
- *
- * 06_1C                       Atom
- *
- * 0F_00, 0F_01, 0F_02         Xeon, Xeon MP, Pentium 4
- * 0F_03, 0F_04                        Xeon, Xeon MP, Pentium 4, Pentium D
- *
- * 0F_06                       Xeon 7100, 5000 Series, Xeon MP,
- *                             Pentium 4, Pentium D
- */
-
-/* Register processors bits */
-enum cpu_processor_bit {
-       CPU_NONE,
-/* Intel */
-       CPU_INTEL_PENTIUM_BIT,
-       CPU_INTEL_P6_BIT,
-       CPU_INTEL_PENTIUM_M_BIT,
-       CPU_INTEL_CORE_BIT,
-       CPU_INTEL_CORE2_BIT,
-       CPU_INTEL_ATOM_BIT,
-       CPU_INTEL_XEON_P4_BIT,
-       CPU_INTEL_XEON_MP_BIT,
-/* AMD */
-       CPU_AMD_K6_BIT,
-       CPU_AMD_K7_BIT,
-       CPU_AMD_K8_BIT,
-       CPU_AMD_0F_BIT,
-       CPU_AMD_10_BIT,
-       CPU_AMD_11_BIT,
-};
-
-#define        CPU_INTEL_PENTIUM       (1 << CPU_INTEL_PENTIUM_BIT)
-#define        CPU_INTEL_P6            (1 << CPU_INTEL_P6_BIT)
-#define        CPU_INTEL_PENTIUM_M     (1 << CPU_INTEL_PENTIUM_M_BIT)
-#define        CPU_INTEL_CORE          (1 << CPU_INTEL_CORE_BIT)
-#define        CPU_INTEL_CORE2         (1 << CPU_INTEL_CORE2_BIT)
-#define        CPU_INTEL_ATOM          (1 << CPU_INTEL_ATOM_BIT)
-#define        CPU_INTEL_XEON_P4       (1 << CPU_INTEL_XEON_P4_BIT)
-#define        CPU_INTEL_XEON_MP       (1 << CPU_INTEL_XEON_MP_BIT)
-
-#define        CPU_INTEL_PX            (CPU_INTEL_P6 | CPU_INTEL_PENTIUM_M)
-#define        CPU_INTEL_COREX         (CPU_INTEL_CORE | CPU_INTEL_CORE2)
-#define        CPU_INTEL_XEON          (CPU_INTEL_XEON_P4 | CPU_INTEL_XEON_MP)
-#define        CPU_CO_AT               (CPU_INTEL_CORE | CPU_INTEL_ATOM)
-#define        CPU_C2_AT               (CPU_INTEL_CORE2 | CPU_INTEL_ATOM)
-#define        CPU_CX_AT               (CPU_INTEL_COREX | CPU_INTEL_ATOM)
-#define        CPU_CX_XE               (CPU_INTEL_COREX | CPU_INTEL_XEON)
-#define        CPU_P6_XE               (CPU_INTEL_P6 | CPU_INTEL_XEON)
-#define        CPU_PM_CO_AT            (CPU_INTEL_PENTIUM_M | CPU_CO_AT)
-#define        CPU_C2_AT_XE            (CPU_C2_AT | CPU_INTEL_XEON)
-#define        CPU_CX_AT_XE            (CPU_CX_AT | CPU_INTEL_XEON)
-#define        CPU_P6_CX_AT            (CPU_INTEL_P6 | CPU_CX_AT)
-#define        CPU_P6_CX_XE            (CPU_P6_XE | CPU_INTEL_COREX)
-#define        CPU_P6_CX_AT_XE         (CPU_INTEL_P6 | CPU_CX_AT_XE)
-#define        CPU_PM_CX_AT_XE         (CPU_INTEL_PENTIUM_M | CPU_CX_AT_XE)
-#define        CPU_PM_CX_AT            (CPU_INTEL_PENTIUM_M | CPU_CX_AT)
-#define        CPU_PM_CX_XE            (CPU_INTEL_PENTIUM_M | CPU_CX_XE)
-#define        CPU_PX_CX_AT            (CPU_INTEL_PX | CPU_CX_AT)
-#define        CPU_PX_CX_AT_XE         (CPU_INTEL_PX | CPU_CX_AT_XE)
-
-/* Select all supported Intel CPUs */
-#define        CPU_INTEL_ALL           (CPU_INTEL_PENTIUM | CPU_PX_CX_AT_XE)
-
-#define        CPU_AMD_K6              (1 << CPU_AMD_K6_BIT)
-#define        CPU_AMD_K7              (1 << CPU_AMD_K7_BIT)
-#define        CPU_AMD_K8              (1 << CPU_AMD_K8_BIT)
-#define        CPU_AMD_0F              (1 << CPU_AMD_0F_BIT)
-#define        CPU_AMD_10              (1 << CPU_AMD_10_BIT)
-#define        CPU_AMD_11              (1 << CPU_AMD_11_BIT)
-
-#define        CPU_K10_PLUS            (CPU_AMD_10 | CPU_AMD_11)
-#define        CPU_K0F_PLUS            (CPU_AMD_0F | CPU_K10_PLUS)
-#define        CPU_K8_PLUS             (CPU_AMD_K8 | CPU_K0F_PLUS)
-#define        CPU_K7_PLUS             (CPU_AMD_K7 | CPU_K8_PLUS)
-
-/* Select all supported AMD CPUs */
-#define        CPU_AMD_ALL             (CPU_AMD_K6 | CPU_K7_PLUS)
-
-/* Select all supported CPUs */
-#define        CPU_ALL                 (CPU_INTEL_ALL | CPU_AMD_ALL)
+#define        CPU_FILE_VALUE          (1 << CPU_VALUE_BIT)
  
  #define MAX_CPU_FILES          512
  
@@ -220,7 +122,6 @@ struct cpu_debug_range {
         unsigned                min;            /* Register range min   */
         unsigned                max;            /* Register range max   */
         unsigned                flag;           /* Supported flags      */
-       unsigned                model;          /* Supported models     */
  };
  
  #endif /* _ASM_X86_CPU_DEBUG_H */
diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h

index bb83b1c397aad8d05251db9dcd71880b1967b1f4..19af42138f78ee8a95509276fc70178475fb95c0 100644 (file)
--- a/arch/x86/include/asm/cpufeature.h
+++ b/arch/x86/include/asm/cpufeature.h
@@ -22,7 +22,7 @@
  #define X86_FEATURE_TSC                (0*32+ 4) /* Time Stamp Counter */
  #define X86_FEATURE_MSR                (0*32+ 5) /* Model-Specific Registers */
  #define X86_FEATURE_PAE                (0*32+ 6) /* Physical Address Extensions */
-#define X86_FEATURE_MCE                (0*32+ 7) /* Machine Check Architecture */
+#define X86_FEATURE_MCE                (0*32+ 7) /* Machine Check Exception */
  #define X86_FEATURE_CX8                (0*32+ 8) /* CMPXCHG8 instruction */
  #define X86_FEATURE_APIC       (0*32+ 9) /* Onboard APIC */
  #define X86_FEATURE_SEP                (0*32+11) /* SYSENTER/SYSEXIT */
@@ -94,6 +94,7 @@
  #define X86_FEATURE_TSC_RELIABLE (3*32+23) /* TSC is known to be reliable */
  #define X86_FEATURE_NONSTOP_TSC        (3*32+24) /* TSC does not stop in C states */
  #define X86_FEATURE_CLFLUSH_MONITOR (3*32+25) /* "" clflush reqd with monitor */
+#define X86_FEATURE_EXTD_APICID        (3*32+26) /* has extended APICID (8 bits) */
  
  /* Intel-defined CPU features, CPUID level 0x00000001 (ecx), word 4 */
  #define X86_FEATURE_XMM3       (4*32+ 0) /* "pni" SSE-3 */
@@ -192,11 +193,11 @@ extern const char * const x86_power_flags[32];
  #define clear_cpu_cap(c, bit)  clear_bit(bit, (unsigned long *)((c)->x86_capability))
  #define setup_clear_cpu_cap(bit) do { \
         clear_cpu_cap(&boot_cpu_data, bit);     \
-       set_bit(bit, (unsigned long *)cleared_cpu_caps); \
+       set_bit(bit, (unsigned long *)cpu_caps_cleared); \
  } while (0)
  #define setup_force_cpu_cap(bit) do { \
         set_cpu_cap(&boot_cpu_data, bit);       \
-       clear_bit(bit, (unsigned long *)cleared_cpu_caps);      \
+       set_bit(bit, (unsigned long *)cpu_caps_set);    \
  } while (0)
  
  #define cpu_has_fpu            boot_cpu_has(X86_FEATURE_FPU)
diff --git a/arch/x86/include/asm/ds.h b/arch/x86/include/asm/ds.h

index a8f672ba100c541fa893c4167dffabfab1c97561..70dac199b093d6727db43ac652aa92c26e41eb20 100644 (file)
--- a/arch/x86/include/asm/ds.h
+++ b/arch/x86/include/asm/ds.h
@@ -15,8 +15,8 @@
   * - buffer allocation (memory accounting)
   *
   *
- * Copyright (C) 2007-2008 Intel Corporation.
- * Markus Metzger <markus.t.metzger@intel.com>, 2007-2008
+ * Copyright (C) 2007-2009 Intel Corporation.
+ * Markus Metzger <markus.t.metzger@intel.com>, 2007-2009
   */
  
  #ifndef _ASM_X86_DS_H
@@ -83,8 +83,10 @@ enum ds_feature {
   * The interrupt threshold is independent from the overflow callback
   * to allow users to use their own overflow interrupt handling mechanism.
   *
- * task: the task to request recording for;
- *       NULL for per-cpu recording on the current cpu
+ * The function might sleep.
+ *
+ * task: the task to request recording for
+ * cpu:  the cpu to request recording for
   * base: the base pointer for the (non-pageable) buffer;
   * size: the size of the provided buffer in bytes
   * ovfl: pointer to a function to be called on buffer overflow;
@@ -93,19 +95,28 @@ enum ds_feature {
   *     -1 if no interrupt threshold is requested.
   * flags: a bit-mask of the above flags
   */
-extern struct bts_tracer *ds_request_bts(struct task_struct *task,
-                                        void *base, size_t size,
-                                        bts_ovfl_callback_t ovfl,
-                                        size_t th, unsigned int flags);
-extern struct pebs_tracer *ds_request_pebs(struct task_struct *task,
-                                          void *base, size_t size,
-                                          pebs_ovfl_callback_t ovfl,
-                                          size_t th, unsigned int flags);
+extern struct bts_tracer *ds_request_bts_task(struct task_struct *task,
+                                             void *base, size_t size,
+                                             bts_ovfl_callback_t ovfl,
+                                             size_t th, unsigned int flags);
+extern struct bts_tracer *ds_request_bts_cpu(int cpu, void *base, size_t size,
+                                            bts_ovfl_callback_t ovfl,
+                                            size_t th, unsigned int flags);
+extern struct pebs_tracer *ds_request_pebs_task(struct task_struct *task,
+                                               void *base, size_t size,
+                                               pebs_ovfl_callback_t ovfl,
+                                               size_t th, unsigned int flags);
+extern struct pebs_tracer *ds_request_pebs_cpu(int cpu,
+                                              void *base, size_t size,
+                                              pebs_ovfl_callback_t ovfl,
+                                              size_t th, unsigned int flags);
  
  /*
   * Release BTS or PEBS resources
   * Suspend and resume BTS or PEBS tracing
   *
+ * Must be called with irq's enabled.
+ *
   * tracer: the tracer handle returned from ds_request_~()
   */
  extern void ds_release_bts(struct bts_tracer *tracer);
@@ -115,6 +126,28 @@ extern void ds_release_pebs(struct pebs_tracer *tracer);
  extern void ds_suspend_pebs(struct pebs_tracer *tracer);
  extern void ds_resume_pebs(struct pebs_tracer *tracer);
  
+/*
+ * Release BTS or PEBS resources
+ * Suspend and resume BTS or PEBS tracing
+ *
+ * Cpu tracers must call this on the traced cpu.
+ * Task tracers must call ds_release_~_noirq() for themselves.
+ *
+ * May be called with irq's disabled.
+ *
+ * Returns 0 if successful;
+ * -EPERM if the cpu tracer does not trace the current cpu.
+ * -EPERM if the task tracer does not trace itself.
+ *
+ * tracer: the tracer handle returned from ds_request_~()
+ */
+extern int ds_release_bts_noirq(struct bts_tracer *tracer);
+extern int ds_suspend_bts_noirq(struct bts_tracer *tracer);
+extern int ds_resume_bts_noirq(struct bts_tracer *tracer);
+extern int ds_release_pebs_noirq(struct pebs_tracer *tracer);
+extern int ds_suspend_pebs_noirq(struct pebs_tracer *tracer);
+extern int ds_resume_pebs_noirq(struct pebs_tracer *tracer);
+
  
  /*
   * The raw DS buffer state as it is used for BTS and PEBS recording.
@@ -170,9 +203,9 @@ struct bts_struct {
                 } lbr;
                 /* BTS_TASK_ARRIVES or BTS_TASK_DEPARTS */
                 struct {
-                       __u64 jiffies;
+                       __u64 clock;
                         pid_t pid;
-               } timestamp;
+               } event;
         } variant;
  };
  
@@ -201,8 +234,12 @@ struct bts_trace {
  struct pebs_trace {
         struct ds_trace ds;
  
-       /* the PEBS reset value */
-       unsigned long long reset_value;
+       /* the number of valid counters in the below array */
+       unsigned int counters;
+
+#define MAX_PEBS_COUNTERS 4
+       /* the counter reset value */
+       unsigned long long counter_reset[MAX_PEBS_COUNTERS];
  };
  
  
@@ -237,9 +274,11 @@ extern int ds_reset_pebs(struct pebs_tracer *tracer);
   * Returns 0 on success; -Eerrno on error
   *
   * tracer: the tracer handle returned from ds_request_pebs()
+ * counter: the index of the counter
   * value: the new counter reset value
   */
-extern int ds_set_pebs_reset(struct pebs_tracer *tracer, u64 value);
+extern int ds_set_pebs_reset(struct pebs_tracer *tracer,
+                            unsigned int counter, u64 value);
  
  /*
   * Initialization
@@ -252,21 +291,12 @@ extern void __cpuinit ds_init_intel(struct cpuinfo_x86 *);
   */
  extern void ds_switch_to(struct task_struct *prev, struct task_struct *next);
  
-/*
- * Task clone/init and cleanup work
- */
-extern void ds_copy_thread(struct task_struct *tsk, struct task_struct *father);
-extern void ds_exit_thread(struct task_struct *tsk);
-
  #else /* CONFIG_X86_DS */
  
  struct cpuinfo_x86;
  static inline void __cpuinit ds_init_intel(struct cpuinfo_x86 *ignored) {}
  static inline void ds_switch_to(struct task_struct *prev,
                                 struct task_struct *next) {}
-static inline void ds_copy_thread(struct task_struct *tsk,
-                                 struct task_struct *father) {}
-static inline void ds_exit_thread(struct task_struct *tsk) {}
  
  #endif /* CONFIG_X86_DS */
  #endif /* _ASM_X86_DS_H */
diff --git a/arch/x86/include/asm/entry_arch.h b/arch/x86/include/asm/entry_arch.h

index c2e6bedaf25873b75df79f6a0f809b4ad9070787..d750a10ccad663fe0e8cd5df1d5e5778e12e0d90 100644 (file)
--- a/arch/x86/include/asm/entry_arch.h
+++ b/arch/x86/include/asm/entry_arch.h
@@ -49,7 +49,7 @@ BUILD_INTERRUPT(error_interrupt,ERROR_APIC_VECTOR)
  BUILD_INTERRUPT(spurious_interrupt,SPURIOUS_APIC_VECTOR)
  
  #ifdef CONFIG_PERF_COUNTERS
-BUILD_INTERRUPT(perf_counter_interrupt, LOCAL_PERF_VECTOR)
+BUILD_INTERRUPT(perf_pending_interrupt, LOCAL_PENDING_VECTOR)
  #endif
  
  #ifdef CONFIG_X86_MCE_P4THERMAL
diff --git a/arch/x86/include/asm/hardirq.h b/arch/x86/include/asm/hardirq.h

index 37555e52f980ac170e779dbe2aaa1e5e0c8ca07a..9ebc5c2550327ed26ab88e5647fbc09833194d07 100644 (file)
--- a/arch/x86/include/asm/hardirq.h
+++ b/arch/x86/include/asm/hardirq.h
@@ -13,6 +13,8 @@ typedef struct {
         unsigned int irq_spurious_count;
  #endif
         unsigned int generic_irqs;      /* arch dependent */
+       unsigned int apic_perf_irqs;
+       unsigned int apic_pending_irqs;
  #ifdef CONFIG_SMP
         unsigned int irq_resched_count;
         unsigned int irq_call_count;
diff --git a/arch/x86/include/asm/hw_irq.h b/arch/x86/include/asm/hw_irq.h

index b762ea49bd703ab3b28958cf83b6908e825e57a4..6df45f639666f4be9ccb3792fc893fc48791871b 100644 (file)
--- a/arch/x86/include/asm/hw_irq.h
+++ b/arch/x86/include/asm/hw_irq.h
@@ -29,6 +29,8 @@
  extern void apic_timer_interrupt(void);
  extern void generic_interrupt(void);
  extern void error_interrupt(void);
+extern void perf_pending_interrupt(void);
+
  extern void spurious_interrupt(void);
  extern void thermal_interrupt(void);
  extern void reschedule_interrupt(void);
@@ -63,7 +65,26 @@ extern unsigned long io_apic_irqs;
  extern void init_VISWS_APIC_irqs(void);
  extern void setup_IO_APIC(void);
  extern void disable_IO_APIC(void);
-extern int IO_APIC_get_PCI_irq_vector(int bus, int slot, int fn);
+
+struct io_apic_irq_attr {
+       int ioapic;
+       int ioapic_pin;
+       int trigger;
+       int polarity;
+};
+
+static inline void set_io_apic_irq_attr(struct io_apic_irq_attr *irq_attr,
+                                       int ioapic, int ioapic_pin,
+                                       int trigger, int polarity)
+{
+       irq_attr->ioapic     = ioapic;
+       irq_attr->ioapic_pin = ioapic_pin;
+       irq_attr->trigger    = trigger;
+       irq_attr->polarity   = polarity;
+}
+
+extern int IO_APIC_get_PCI_irq_vector(int bus, int devfn, int pin,
+                                       struct io_apic_irq_attr *irq_attr);
  extern void setup_ioapic_dest(void);
  
  extern void enable_IO_APIC(void);
@@ -78,7 +99,11 @@ extern void eisa_set_level_irq(unsigned int irq);
  /* SMP */
  extern void smp_apic_timer_interrupt(struct pt_regs *);
  extern void smp_spurious_interrupt(struct pt_regs *);
+extern void smp_generic_interrupt(struct pt_regs *);
  extern void smp_error_interrupt(struct pt_regs *);
+#ifdef CONFIG_X86_IO_APIC
+extern asmlinkage void smp_irq_move_cleanup_interrupt(void);
+#endif
  #ifdef CONFIG_SMP
  extern void smp_reschedule_interrupt(struct pt_regs *);
  extern void smp_call_function_interrupt(struct pt_regs *);
diff --git a/arch/x86/include/asm/i387.h b/arch/x86/include/asm/i387.h

index 71c9e51839827dfc14ec28858467c2ea4815297c..175adf58dd4f8e3cec35d69123d4640af2ff37c8 100644 (file)
--- a/arch/x86/include/asm/i387.h
+++ b/arch/x86/include/asm/i387.h
@@ -67,7 +67,7 @@ static inline int fxrstor_checking(struct i387_fxsave_struct *fx)
                      ".previous\n"
                      _ASM_EXTABLE(1b, 3b)
                      : [err] "=r" (err)
-#if 0 /* See comment in __save_init_fpu() below. */
+#if 0 /* See comment in fxsave() below. */
                      : [fx] "r" (fx), "m" (*fx), "0" (0));
  #else
                      : [fx] "cdaSDb" (fx), "m" (*fx), "0" (0));
@@ -75,14 +75,6 @@ static inline int fxrstor_checking(struct i387_fxsave_struct *fx)
         return err;
  }
  
-static inline int restore_fpu_checking(struct task_struct *tsk)
-{
-       if (task_thread_info(tsk)->status & TS_XSAVE)
-               return xrstor_checking(&tsk->thread.xstate->xsave);
-       else
-               return fxrstor_checking(&tsk->thread.xstate->fxsave);
-}
-
  /* AMD CPUs don't save/restore FDP/FIP/FOP unless an exception
     is pending. Clear the x87 state here by setting it to fixed
     values. The kernel data segment can be sometimes 0 and sometimes
@@ -120,7 +112,7 @@ static inline int fxsave_user(struct i387_fxsave_struct __user *fx)
                      ".previous\n"
                      _ASM_EXTABLE(1b, 3b)
                      : [err] "=r" (err), "=m" (*fx)
-#if 0 /* See comment in __fxsave_clear() below. */
+#if 0 /* See comment in fxsave() below. */
                      : [fx] "r" (fx), "0" (0));
  #else
                      : [fx] "cdaSDb" (fx), "0" (0));
@@ -185,12 +177,9 @@ static inline void tolerant_fwait(void)
         asm volatile("fnclex ; fwait");
  }
  
-static inline void restore_fpu(struct task_struct *tsk)
+/* perform fxrstor iff the processor has extended states, otherwise frstor */
+static inline int fxrstor_checking(struct i387_fxsave_struct *fx)
  {
-       if (task_thread_info(tsk)->status & TS_XSAVE) {
-               xrstor_checking(&tsk->thread.xstate->xsave);
-               return;
-       }
         /*
          * The "nop" is needed to make the instructions the same
          * length.
@@ -199,7 +188,9 @@ static inline void restore_fpu(struct task_struct *tsk)
                 "nop ; frstor %1",
                 "fxrstor %1",
                 X86_FEATURE_FXSR,
-               "m" (tsk->thread.xstate->fxsave));
+               "m" (*fx));
+
+       return 0;
  }
  
  /* We need a safe address that is cheap to find and that is already
@@ -262,6 +253,14 @@ end:
  
  #endif /* CONFIG_X86_64 */
  
+static inline int restore_fpu_checking(struct task_struct *tsk)
+{
+       if (task_thread_info(tsk)->status & TS_XSAVE)
+               return xrstor_checking(&tsk->thread.xstate->xsave);
+       else
+               return fxrstor_checking(&tsk->thread.xstate->fxsave);
+}
+
  /*
   * Signal frame handlers...
   */
@@ -305,18 +304,18 @@ static inline void kernel_fpu_end(void)
  /*
   * Some instructions like VIA's padlock instructions generate a spurious
   * DNA fault but don't modify SSE registers. And these instructions
- * get used from interrupt context aswell. To prevent these kernel instructions
- * in interrupt context interact wrongly with other user/kernel fpu usage, we
+ * get used from interrupt context as well. To prevent these kernel instructions
+ * in interrupt context interacting wrongly with other user/kernel fpu usage, we
   * should use them only in the context of irq_ts_save/restore()
   */
  static inline int irq_ts_save(void)
  {
         /*
-        * If we are in process context, we are ok to take a spurious DNA fault.
-        * Otherwise, doing clts() in process context require pre-emption to
-        * be disabled or some heavy lifting like kernel_fpu_begin()
+        * If in process context and not atomic, we can take a spurious DNA fault.
+        * Otherwise, doing clts() in process context requires disabling preemption
+        * or some heavy lifting like kernel_fpu_begin()
          */
-       if (!in_interrupt())
+       if (!in_atomic())
                 return 0;
  
         if (read_cr0() & X86_CR0_TS) {
diff --git a/arch/x86/include/asm/i8259.h b/arch/x86/include/asm/i8259.h

index 1a99e6c092afcfaaad1a73b95ee4ecc7249f231b..58d7091eeb1fc1e8955c87246121060fbb851d31 100644 (file)
--- a/arch/x86/include/asm/i8259.h
+++ b/arch/x86/include/asm/i8259.h
@@ -60,8 +60,4 @@ extern struct irq_chip i8259A_chip;
  extern void mask_8259A(void);
  extern void unmask_8259A(void);
  
-#ifdef CONFIG_X86_32
-extern void init_ISA_irqs(void);
-#endif
-
  #endif /* _ASM_X86_I8259_H */
diff --git a/arch/x86/include/asm/intel_arch_perfmon.h b/arch/x86/include/asm/intel_arch_perfmon.h

deleted file mode 100644 (file)

index fa0fd06..0000000
--- a/arch/x86/include/asm/intel_arch_perfmon.h
+++ /dev/null
@@ -1,31 +0,0 @@
-#ifndef _ASM_X86_INTEL_ARCH_PERFMON_H
-#define _ASM_X86_INTEL_ARCH_PERFMON_H
-
-#define MSR_ARCH_PERFMON_PERFCTR0              0xc1
-#define MSR_ARCH_PERFMON_PERFCTR1              0xc2
-
-#define MSR_ARCH_PERFMON_EVENTSEL0             0x186
-#define MSR_ARCH_PERFMON_EVENTSEL1             0x187
-
-#define ARCH_PERFMON_EVENTSEL0_ENABLE  (1 << 22)
-#define ARCH_PERFMON_EVENTSEL_INT      (1 << 20)
-#define ARCH_PERFMON_EVENTSEL_OS       (1 << 17)
-#define ARCH_PERFMON_EVENTSEL_USR      (1 << 16)
-
-#define ARCH_PERFMON_UNHALTED_CORE_CYCLES_SEL  (0x3c)
-#define ARCH_PERFMON_UNHALTED_CORE_CYCLES_UMASK        (0x00 << 8)
-#define ARCH_PERFMON_UNHALTED_CORE_CYCLES_INDEX (0)
-#define ARCH_PERFMON_UNHALTED_CORE_CYCLES_PRESENT \
-       (1 << (ARCH_PERFMON_UNHALTED_CORE_CYCLES_INDEX))
-
-union cpuid10_eax {
-       struct {
-               unsigned int version_id:8;
-               unsigned int num_counters:8;
-               unsigned int bit_width:8;
-               unsigned int mask_length:8;
-       } split;
-       unsigned int full;
-};
-
-#endif /* _ASM_X86_INTEL_ARCH_PERFMON_H */
diff --git a/arch/x86/include/asm/io_apic.h b/arch/x86/include/asm/io_apic.h

index 9d826e436010005254bb04e13bac1ee45d802b75..daf866ed0612413f3781cdf6f863e8038daebe85 100644 (file)
--- a/arch/x86/include/asm/io_apic.h
+++ b/arch/x86/include/asm/io_apic.h
@@ -154,22 +154,19 @@ extern int timer_through_8259;
  extern int io_apic_get_unique_id(int ioapic, int apic_id);
  extern int io_apic_get_version(int ioapic);
  extern int io_apic_get_redir_entries(int ioapic);
-extern int io_apic_set_pci_routing(int ioapic, int pin, int irq,
-                                  int edge_level, int active_high_low);
  #endif /* CONFIG_ACPI */
  
+struct io_apic_irq_attr;
+extern int io_apic_set_pci_routing(struct device *dev, int irq,
+                struct io_apic_irq_attr *irq_attr);
  extern int (*ioapic_renumber_irq)(int ioapic, int irq);
  extern void ioapic_init_mappings(void);
  
-#ifdef CONFIG_X86_64
  extern struct IO_APIC_route_entry **alloc_ioapic_entries(void);
  extern void free_ioapic_entries(struct IO_APIC_route_entry **ioapic_entries);
  extern int save_IO_APIC_setup(struct IO_APIC_route_entry **ioapic_entries);
  extern void mask_IO_APIC_setup(struct IO_APIC_route_entry **ioapic_entries);
  extern int restore_IO_APIC_setup(struct IO_APIC_route_entry **ioapic_entries);
-extern void reinit_intr_remapped_IO_APIC(int intr_remapping,
-       struct IO_APIC_route_entry **ioapic_entries);
-#endif
  
  extern void probe_nr_irqs_gsi(void);
  
diff --git a/arch/x86/include/asm/iomap.h b/arch/x86/include/asm/iomap.h

index 86af26091d6c3c2146da4e5f6f3abbfd6cb165ba..0e9fe1d9d9715db6a2fb952eff4b917465c3d64b 100644 (file)
--- a/arch/x86/include/asm/iomap.h
+++ b/arch/x86/include/asm/iomap.h
@@ -1,3 +1,6 @@
+#ifndef _ASM_X86_IOMAP_H
+#define _ASM_X86_IOMAP_H
+
  /*
   * Copyright © 2008 Ingo Molnar
   *
@@ -31,3 +34,5 @@ iomap_atomic_prot_pfn(unsigned long pfn, enum km_type type, pgprot_t prot);
  
  void
  iounmap_atomic(void *kvaddr, enum km_type type);
+
+#endif /* _ASM_X86_IOMAP_H */
diff --git a/arch/x86/include/asm/irq_remapping.h b/arch/x86/include/asm/irq_remapping.h

index 0396760fccb85be05de0d5666c6019316d386d55..f275e2244505b98308ca72e66e26c250d29c61a8 100644 (file)
--- a/arch/x86/include/asm/irq_remapping.h
+++ b/arch/x86/include/asm/irq_remapping.h
@@ -1,6 +1,6 @@
  #ifndef _ASM_X86_IRQ_REMAPPING_H
  #define _ASM_X86_IRQ_REMAPPING_H
  
-#define IRTE_DEST(dest) ((x2apic) ? dest : dest << 8)
+#define IRTE_DEST(dest) ((x2apic_mode) ? dest : dest << 8)
  
  #endif /* _ASM_X86_IRQ_REMAPPING_H */
diff --git a/arch/x86/include/asm/irq_vectors.h b/arch/x86/include/asm/irq_vectors.h

index 3cbd79bbb47c82613341fca01ee6e174319342fe..e997be98c9b97c166b1b484f5a53b8f171f95b47 100644 (file)
--- a/arch/x86/include/asm/irq_vectors.h
+++ b/arch/x86/include/asm/irq_vectors.h
@@ -34,6 +34,7 @@
  
  #ifdef CONFIG_X86_32
  # define SYSCALL_VECTOR                        0x80
+# define IA32_SYSCALL_VECTOR           0x80
  #else
  # define IA32_SYSCALL_VECTOR           0x80
  #endif
@@ -107,14 +108,14 @@
  #define LOCAL_TIMER_VECTOR             0xef
  
  /*
- * Performance monitoring interrupt vector:
+ * Generic system vector for platform specific use
   */
-#define LOCAL_PERF_VECTOR              0xee
+#define GENERIC_INTERRUPT_VECTOR       0xed
  
  /*
- * Generic system vector for platform specific use
+ * Performance monitoring pending work vector:
   */
-#define GENERIC_INTERRUPT_VECTOR       0xed
+#define LOCAL_PENDING_VECTOR           0xec
  
  /*
   * First APIC vector available to drivers: (vectors 0x30-0xee) we
diff --git a/arch/x86/include/asm/k8.h b/arch/x86/include/asm/k8.h

index 54c8cc53b24dd732829da23805c96f64b92fa168..c2d1f3b58e5f1342607280be6a71d434796482dd 100644 (file)
--- a/arch/x86/include/asm/k8.h
+++ b/arch/x86/include/asm/k8.h
@@ -12,4 +12,17 @@ extern int cache_k8_northbridges(void);
  extern void k8_flush_garts(void);
  extern int k8_scan_nodes(unsigned long start, unsigned long end);
  
+#ifdef CONFIG_K8_NB
+static inline struct pci_dev *node_to_k8_nb_misc(int node)
+{
+       return (node < num_k8_northbridges) ? k8_northbridges[node] : NULL;
+}
+#else
+static inline struct pci_dev *node_to_k8_nb_misc(int node)
+{
+       return NULL;
+}
+#endif
+
+
  #endif /* _ASM_X86_K8_H */
diff --git a/arch/x86/include/asm/microcode.h b/arch/x86/include/asm/microcode.h

index c882664716c1d13e5678aa188fb0ccf75d50acca..ef51b501e22a6e53bf4ae7e2d9e2566760f72ee1 100644 (file)
--- a/arch/x86/include/asm/microcode.h
+++ b/arch/x86/include/asm/microcode.h
@@ -9,20 +9,31 @@ struct cpu_signature {
  
  struct device;
  
+enum ucode_state { UCODE_ERROR, UCODE_OK, UCODE_NFOUND };
+
  struct microcode_ops {
-       int  (*request_microcode_user) (int cpu, const void __user *buf, size_t size);
-       int  (*request_microcode_fw) (int cpu, struct device *device);
+       enum ucode_state (*request_microcode_user) (int cpu,
+                               const void __user *buf, size_t size);
  
-       void (*apply_microcode) (int cpu);
+       enum ucode_state (*request_microcode_fw) (int cpu,
+                               struct device *device);
  
-       int  (*collect_cpu_info) (int cpu, struct cpu_signature *csig);
         void (*microcode_fini_cpu) (int cpu);
+
+       /*
+        * The generic 'microcode_core' part guarantees that
+        * the callbacks below run on a target cpu when they
+        * are being called.
+        * See also the "Synchronization" section in microcode_core.c.
+        */
+       int (*apply_microcode) (int cpu);
+       int (*collect_cpu_info) (int cpu, struct cpu_signature *csig);
  };
  
  struct ucode_cpu_info {
-       struct cpu_signature cpu_sig;
-       int valid;
-       void *mc;
+       struct cpu_signature    cpu_sig;
+       int                     valid;
+       void                    *mc;
  };
  extern struct ucode_cpu_info ucode_cpu_info[];
  
diff --git a/arch/x86/include/asm/mpspec.h b/arch/x86/include/asm/mpspec.h

index 642fc7fc8cdc3fe8aedea4ddb60cacabc7925983..e2a1bb6d71ea832f6a6dbb3fc9ad7b58c7c9e6c4 100644 (file)
--- a/arch/x86/include/asm/mpspec.h
+++ b/arch/x86/include/asm/mpspec.h
@@ -61,9 +61,11 @@ extern void get_smp_config(void);
  #ifdef CONFIG_X86_MPPARSE
  extern void find_smp_config(void);
  extern void early_reserve_e820_mpc_new(void);
+extern int enable_update_mptable;
  #else
  static inline void find_smp_config(void) { }
  static inline void early_reserve_e820_mpc_new(void) { }
+#define enable_update_mptable 0
  #endif
  
  void __cpuinit generic_processor_info(int apicid, int version);
@@ -72,20 +74,13 @@ extern void mp_register_ioapic(int id, u32 address, u32 gsi_base);
  extern void mp_override_legacy_irq(u8 bus_irq, u8 polarity, u8 trigger,
                                    u32 gsi);
  extern void mp_config_acpi_legacy_irqs(void);
-extern int mp_register_gsi(u32 gsi, int edge_level, int active_high_low);
+struct device;
+extern int mp_register_gsi(struct device *dev, u32 gsi, int edge_level,
+                                int active_high_low);
  extern int acpi_probe_gsi(void);
  #ifdef CONFIG_X86_IO_APIC
-extern int mp_config_acpi_gsi(unsigned char number, unsigned int devfn, u8 pin,
-                               u32 gsi, int triggering, int polarity);
  extern int mp_find_ioapic(int gsi);
  extern int mp_find_ioapic_pin(int ioapic, int gsi);
-#else
-static inline int
-mp_config_acpi_gsi(unsigned char number, unsigned int devfn, u8 pin,
-                  u32 gsi, int triggering, int polarity)
-{
-       return 0;
-}
  #endif
  #else /* !CONFIG_ACPI: */
  static inline int acpi_probe_gsi(void)
diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h

index ec41fc16c167cb19ef617f9a5e7ec2dda6f88bd1..4d58d04fca830bc56ab8bf62009661ad77a85f3e 100644 (file)
--- a/arch/x86/include/asm/msr-index.h
+++ b/arch/x86/include/asm/msr-index.h
@@ -121,7 +121,6 @@
  #define MSR_K8_TOP_MEM1                        0xc001001a
  #define MSR_K8_TOP_MEM2                        0xc001001d
  #define MSR_K8_SYSCFG                  0xc0010010
-#define MSR_K8_HWCR                    0xc0010015
  #define MSR_K8_INT_PENDING_MSG         0xc0010055
  /* C1E active bits in int pending message */
  #define K8_INTP_C1E_ACTIVE_MASK                0x18000000
diff --git a/arch/x86/include/asm/nmi.h b/arch/x86/include/asm/nmi.h

index c45a0a568dff092227b1436a48e7dab5434dc4fa..c97264409934be915d24d35395c1c0af5956aa7b 100644 (file)
--- a/arch/x86/include/asm/nmi.h
+++ b/arch/x86/include/asm/nmi.h
@@ -64,7 +64,7 @@ static inline int nmi_watchdog_active(void)
          * but since they are power of two we could use a
          * cheaper way --cvg
          */
-       return nmi_watchdog & 0x3;
+       return nmi_watchdog & (NMI_LOCAL_APIC | NMI_IO_APIC);
  }
  #endif
  
diff --git a/arch/x86/include/asm/numa_64.h b/arch/x86/include/asm/numa_64.h

index 064ed6df4cbec292758631a4a5a36aa3b77260c0..c4ae822e415f907093eabd2531d97506a891b691 100644 (file)
--- a/arch/x86/include/asm/numa_64.h
+++ b/arch/x86/include/asm/numa_64.h
@@ -17,9 +17,6 @@ extern int compute_hash_shift(struct bootnode *nodes, int numblks,
  extern void numa_init_array(void);
  extern int numa_off;
  
-extern void srat_reserve_add_area(int nodeid);
-extern int hotadd_percent;
-
  extern s16 apicid_to_node[MAX_LOCAL_APIC];
  
  extern unsigned long numa_free_all_bootmem(void);
@@ -27,6 +24,13 @@ extern void setup_node_bootmem(int nodeid, unsigned long start,
                                unsigned long end);
  
  #ifdef CONFIG_NUMA
+/*
+ * Too small node sizes may confuse the VM badly. Usually they
+ * result from BIOS bugs. So dont recognize nodes as standalone
+ * NUMA entities that have less than this amount of RAM listed:
+ */
+#define NODE_MIN_SIZE (4*1024*1024)
+
  extern void __init init_cpu_to_node(void);
  extern void __cpuinit numa_set_node(int cpu, int node);
  extern void __cpuinit numa_clear_node(int cpu);
diff --git a/arch/x86/include/asm/page_32_types.h b/arch/x86/include/asm/page_32_types.h

index 0f915ae649a717e99ebdfc079652b25fda315e72..6f1b7331313f1af2744c50accfecd26e069a5bed 100644 (file)
--- a/arch/x86/include/asm/page_32_types.h
+++ b/arch/x86/include/asm/page_32_types.h
@@ -54,10 +54,6 @@ extern unsigned int __VMALLOC_RESERVE;
  extern int sysctl_legacy_va_layout;
  
  extern void find_low_pfn_range(void);
-extern unsigned long init_memory_mapping(unsigned long start,
-                                        unsigned long end);
-extern void initmem_init(unsigned long, unsigned long);
-extern void free_initmem(void);
  extern void setup_bootmem_allocator(void);
  
  #endif /* !__ASSEMBLY__ */
diff --git a/arch/x86/include/asm/page_64_types.h b/arch/x86/include/asm/page_64_types.h

index d38c91b7024834a56be59db9c9b1bf411c698c50..8d382d3abf38cf9ec0be9c6164589d431017fc41 100644 (file)
--- a/arch/x86/include/asm/page_64_types.h
+++ b/arch/x86/include/asm/page_64_types.h
@@ -32,22 +32,14 @@
   */
  #define __PAGE_OFFSET           _AC(0xffff880000000000, UL)
  
-#define __PHYSICAL_START       CONFIG_PHYSICAL_START
-#define __KERNEL_ALIGN         0x200000
-
-/*
- * Make sure kernel is aligned to 2MB address. Catching it at compile
- * time is better. Change your config file and compile the kernel
- * for a 2MB aligned address (CONFIG_PHYSICAL_START)
- */
-#if (CONFIG_PHYSICAL_START % __KERNEL_ALIGN) != 0
-#error "CONFIG_PHYSICAL_START must be a multiple of 2MB"
-#endif
+#define __PHYSICAL_START       ((CONFIG_PHYSICAL_START +               \
+                                 (CONFIG_PHYSICAL_ALIGN - 1)) &        \
+                                ~(CONFIG_PHYSICAL_ALIGN - 1))
  
  #define __START_KERNEL         (__START_KERNEL_map + __PHYSICAL_START)
  #define __START_KERNEL_map     _AC(0xffffffff80000000, UL)
  
-/* See Documentation/x86_64/mm.txt for a description of the memory map. */
+/* See Documentation/x86/x86_64/mm.txt for a description of the memory map. */
  #define __PHYSICAL_MASK_SHIFT  46
  #define __VIRTUAL_MASK_SHIFT   48
  
@@ -71,12 +63,6 @@ extern unsigned long __phys_addr(unsigned long);
  
  #define vmemmap ((struct page *)VMEMMAP_START)
  
-extern unsigned long init_memory_mapping(unsigned long start,
-                                        unsigned long end);
-
-extern void initmem_init(unsigned long start_pfn, unsigned long end_pfn);
-extern void free_initmem(void);
-
  extern void init_extra_mapping_uc(unsigned long phys, unsigned long size);
  extern void init_extra_mapping_wb(unsigned long phys, unsigned long size);
  
diff --git a/arch/x86/include/asm/page_types.h b/arch/x86/include/asm/page_types.h

index 826ad37006ab1a075993ddbe69bd3281412b7f09..6473f5ccff859baf2c897ba073f629d14ba6d0ee 100644 (file)
--- a/arch/x86/include/asm/page_types.h
+++ b/arch/x86/include/asm/page_types.h
@@ -46,6 +46,12 @@ extern int devmem_is_allowed(unsigned long pagenr);
  extern unsigned long max_low_pfn_mapped;
  extern unsigned long max_pfn_mapped;
  
+extern unsigned long init_memory_mapping(unsigned long start,
+                                        unsigned long end);
+
+extern void initmem_init(unsigned long start_pfn, unsigned long end_pfn);
+extern void free_initmem(void);
+
  #endif /* !__ASSEMBLY__ */
  
  #endif /* _ASM_X86_PAGE_DEFS_H */
diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h

index a53da004e08ed8903dbf135fead7e3e09664089a..4fb37c8a0832da8cf51f7a2c771c54fa97af2708 100644 (file)
--- a/arch/x86/include/asm/paravirt.h
+++ b/arch/x86/include/asm/paravirt.h
@@ -56,6 +56,7 @@ struct desc_ptr;
  struct tss_struct;
  struct mm_struct;
  struct desc_struct;
+struct task_struct;
  
  /*
   * Wrapper type for pointers to code which uses the non-standard
@@ -203,7 +204,8 @@ struct pv_cpu_ops {
  
         void (*swapgs)(void);
  
-       struct pv_lazy_ops lazy_mode;
+       void (*start_context_switch)(struct task_struct *prev);
+       void (*end_context_switch)(struct task_struct *next);
  };
  
  struct pv_irq_ops {
@@ -1399,25 +1401,23 @@ enum paravirt_lazy_mode {
  };
  
  enum paravirt_lazy_mode paravirt_get_lazy_mode(void);
-void paravirt_enter_lazy_cpu(void);
-void paravirt_leave_lazy_cpu(void);
+void paravirt_start_context_switch(struct task_struct *prev);
+void paravirt_end_context_switch(struct task_struct *next);
+
  void paravirt_enter_lazy_mmu(void);
  void paravirt_leave_lazy_mmu(void);
-void paravirt_leave_lazy(enum paravirt_lazy_mode mode);
  
-#define  __HAVE_ARCH_ENTER_LAZY_CPU_MODE
-static inline void arch_enter_lazy_cpu_mode(void)
+#define  __HAVE_ARCH_START_CONTEXT_SWITCH
+static inline void arch_start_context_switch(struct task_struct *prev)
  {
-       PVOP_VCALL0(pv_cpu_ops.lazy_mode.enter);
+       PVOP_VCALL1(pv_cpu_ops.start_context_switch, prev);
  }
  
-static inline void arch_leave_lazy_cpu_mode(void)
+static inline void arch_end_context_switch(struct task_struct *next)
  {
-       PVOP_VCALL0(pv_cpu_ops.lazy_mode.leave);
+       PVOP_VCALL1(pv_cpu_ops.end_context_switch, next);
  }
  
-void arch_flush_lazy_cpu_mode(void);
-
  #define  __HAVE_ARCH_ENTER_LAZY_MMU_MODE
  static inline void arch_enter_lazy_mmu_mode(void)
  {
diff --git a/arch/x86/include/asm/perf_counter.h b/arch/x86/include/asm/perf_counter.h

new file mode 100644 (file)

index 0000000..876ed97
--- /dev/null
+++ b/arch/x86/include/asm/perf_counter.h
@@ -0,0 +1,100 @@
+#ifndef _ASM_X86_PERF_COUNTER_H
+#define _ASM_X86_PERF_COUNTER_H
+
+/*
+ * Performance counter hw details:
+ */
+
+#define X86_PMC_MAX_GENERIC                                    8
+#define X86_PMC_MAX_FIXED                                      3
+
+#define X86_PMC_IDX_GENERIC                                    0
+#define X86_PMC_IDX_FIXED                                     32
+#define X86_PMC_IDX_MAX                                               64
+
+#define MSR_ARCH_PERFMON_PERFCTR0                            0xc1
+#define MSR_ARCH_PERFMON_PERFCTR1                            0xc2
+
+#define MSR_ARCH_PERFMON_EVENTSEL0                          0x186
+#define MSR_ARCH_PERFMON_EVENTSEL1                          0x187
+
+#define ARCH_PERFMON_EVENTSEL0_ENABLE                    (1 << 22)
+#define ARCH_PERFMON_EVENTSEL_INT                        (1 << 20)
+#define ARCH_PERFMON_EVENTSEL_OS                         (1 << 17)
+#define ARCH_PERFMON_EVENTSEL_USR                        (1 << 16)
+
+/*
+ * Includes eventsel and unit mask as well:
+ */
+#define ARCH_PERFMON_EVENT_MASK                                    0xffff
+
+#define ARCH_PERFMON_UNHALTED_CORE_CYCLES_SEL                0x3c
+#define ARCH_PERFMON_UNHALTED_CORE_CYCLES_UMASK                (0x00 << 8)
+#define ARCH_PERFMON_UNHALTED_CORE_CYCLES_INDEX                 0
+#define ARCH_PERFMON_UNHALTED_CORE_CYCLES_PRESENT \
+               (1 << (ARCH_PERFMON_UNHALTED_CORE_CYCLES_INDEX))
+
+#define ARCH_PERFMON_BRANCH_MISSES_RETIRED                      6
+
+/*
+ * Intel "Architectural Performance Monitoring" CPUID
+ * detection/enumeration details:
+ */
+union cpuid10_eax {
+       struct {
+               unsigned int version_id:8;
+               unsigned int num_counters:8;
+               unsigned int bit_width:8;
+               unsigned int mask_length:8;
+       } split;
+       unsigned int full;
+};
+
+union cpuid10_edx {
+       struct {
+               unsigned int num_counters_fixed:4;
+               unsigned int reserved:28;
+       } split;
+       unsigned int full;
+};
+
+
+/*
+ * Fixed-purpose performance counters:
+ */
+
+/*
+ * All 3 fixed-mode PMCs are configured via this single MSR:
+ */
+#define MSR_ARCH_PERFMON_FIXED_CTR_CTRL                        0x38d
+
+/*
+ * The counts are available in three separate MSRs:
+ */
+
+/* Instr_Retired.Any: */
+#define MSR_ARCH_PERFMON_FIXED_CTR0                    0x309
+#define X86_PMC_IDX_FIXED_INSTRUCTIONS                 (X86_PMC_IDX_FIXED + 0)
+
+/* CPU_CLK_Unhalted.Core: */
+#define MSR_ARCH_PERFMON_FIXED_CTR1                    0x30a
+#define X86_PMC_IDX_FIXED_CPU_CYCLES                   (X86_PMC_IDX_FIXED + 1)
+
+/* CPU_CLK_Unhalted.Ref: */
+#define MSR_ARCH_PERFMON_FIXED_CTR2                    0x30b
+#define X86_PMC_IDX_FIXED_BUS_CYCLES                   (X86_PMC_IDX_FIXED + 2)
+
+extern void set_perf_counter_pending(void);
+
+#define clear_perf_counter_pending()   do { } while (0)
+#define test_perf_counter_pending()    (0)
+
+#ifdef CONFIG_PERF_COUNTERS
+extern void init_hw_perf_counters(void);
+extern void perf_counters_lapic_init(void);
+#else
+static inline void init_hw_perf_counters(void)         { }
+static inline void perf_counters_lapic_init(void)      { }
+#endif
+
+#endif /* _ASM_X86_PERF_COUNTER_H */
diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h

index 29d96d168bc097195e9cbe66ab723c90929e1116..18ef7ebf2631709dad4f26a681fdbf5dff5a8b2b 100644 (file)
--- a/arch/x86/include/asm/pgtable.h
+++ b/arch/x86/include/asm/pgtable.h
@@ -81,6 +81,8 @@ static inline void __init paravirt_pagetable_setup_done(pgd_t *base)
  #define pte_val(x)     native_pte_val(x)
  #define __pte(x)       native_make_pte(x)
  
+#define arch_end_context_switch(prev)  do {} while(0)
+
  #endif /* CONFIG_PARAVIRT */
  
  /*
@@ -503,6 +505,8 @@ static inline int pgd_none(pgd_t pgd)
  
  #ifndef __ASSEMBLY__
  
+extern int direct_gbpages;
+
  /* local pte updates need not use xchg for locking */
  static inline pte_t native_local_ptep_get_and_clear(pte_t *ptep)
  {
diff --git a/arch/x86/include/asm/pgtable_64.h b/arch/x86/include/asm/pgtable_64.h

index 6b87bc6d50189263691c10819618bc97deb8c086..abde308fdb0f54e07587e599998c31c4cc45dc66 100644 (file)
--- a/arch/x86/include/asm/pgtable_64.h
+++ b/arch/x86/include/asm/pgtable_64.h
@@ -25,10 +25,6 @@ extern pgd_t init_level4_pgt[];
  
  extern void paging_init(void);
  
-#endif /* !__ASSEMBLY__ */
-
-#ifndef __ASSEMBLY__
-
  #define pte_ERROR(e)                                   \
         printk("%s:%d: bad pte %p(%016lx).\n",          \
                __FILE__, __LINE__, &(e), pte_val(e))
@@ -135,8 +131,6 @@ static inline int pgd_large(pgd_t pgd) { return 0; }
  
  #define update_mmu_cache(vma, address, pte) do { } while (0)
  
-extern int direct_gbpages;
-
  /* Encode and de-code a swap entry */
  #if _PAGE_BIT_FILE < _PAGE_BIT_PROTNONE
  #define SWP_TYPE_BITS (_PAGE_BIT_FILE - _PAGE_BIT_PRESENT - 1)
diff --git a/arch/x86/include/asm/pgtable_64_types.h b/arch/x86/include/asm/pgtable_64_types.h

index fbf42b8e03833d0537b5e8594a4a3f06a154e180..766ea16fbbbda730a9952a67d85c8cda954f62ef 100644 (file)
--- a/arch/x86/include/asm/pgtable_64_types.h
+++ b/arch/x86/include/asm/pgtable_64_types.h
@@ -51,11 +51,11 @@ typedef struct { pteval_t pte; } pte_t;
  #define PGDIR_SIZE     (_AC(1, UL) << PGDIR_SHIFT)
  #define PGDIR_MASK     (~(PGDIR_SIZE - 1))
  
-
+/* See Documentation/x86/x86_64/mm.txt for a description of the memory map. */
  #define MAXMEM          _AC(__AC(1, UL) << MAX_PHYSMEM_BITS, UL)
-#define VMALLOC_START    _AC(0xffffc20000000000, UL)
-#define VMALLOC_END      _AC(0xffffe1ffffffffff, UL)
-#define VMEMMAP_START   _AC(0xffffe20000000000, UL)
+#define VMALLOC_START    _AC(0xffffc90000000000, UL)
+#define VMALLOC_END      _AC(0xffffe8ffffffffff, UL)
+#define VMEMMAP_START   _AC(0xffffea0000000000, UL)
  #define MODULES_VADDR    _AC(0xffffffffa0000000, UL)
  #define MODULES_END      _AC(0xffffffffff000000, UL)
  #define MODULES_LEN   (MODULES_END - MODULES_VADDR)
diff --git a/arch/x86/include/asm/pgtable_types.h b/arch/x86/include/asm/pgtable_types.h

index b8238dc8786d9f6b4291c2d7f912abbc0ea04e18..4d258ad76a0fc04925ac484c1fa9b0bfc47a1cb9 100644 (file)
--- a/arch/x86/include/asm/pgtable_types.h
+++ b/arch/x86/include/asm/pgtable_types.h
@@ -273,7 +273,6 @@ typedef struct page *pgtable_t;
  
  extern pteval_t __supported_pte_mask;
  extern int nx_enabled;
-extern void set_nx(void);
  
  #define pgprot_writecombine    pgprot_writecombine
  extern pgprot_t pgprot_writecombine(pgprot_t prot);
diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h

index c2cceae709c8655338894c15b84c361e65caf727..c7768269b1cf1364b76c948d886deee10de2cf70 100644 (file)
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -135,7 +135,8 @@ extern struct cpuinfo_x86   boot_cpu_data;
  extern struct cpuinfo_x86      new_cpu_data;
  
  extern struct tss_struct       doublefault_tss;
-extern __u32                   cleared_cpu_caps[NCAPINTS];
+extern __u32                   cpu_caps_cleared[NCAPINTS];
+extern __u32                   cpu_caps_set[NCAPINTS];
  
  #ifdef CONFIG_SMP
  DECLARE_PER_CPU_SHARED_ALIGNED(struct cpuinfo_x86, cpu_info);
@@ -409,9 +410,6 @@ DECLARE_PER_CPU(unsigned long, stack_canary);
  extern unsigned int xstate_size;
  extern void free_thread_xstate(struct task_struct *);
  extern struct kmem_cache *task_xstate_cachep;
-extern void init_scattered_cpuid_features(struct cpuinfo_x86 *c);
-extern unsigned int init_intel_cacheinfo(struct cpuinfo_x86 *c);
-extern unsigned short num_cache_leaves;
  
  struct thread_struct {
         /* Cached TLS descriptors: */
@@ -427,8 +425,12 @@ struct thread_struct {
         unsigned short          fsindex;
         unsigned short          gsindex;
  #endif
+#ifdef CONFIG_X86_32
         unsigned long           ip;
+#endif
+#ifdef CONFIG_X86_64
         unsigned long           fs;
+#endif
         unsigned long           gs;
         /* Hardware debugging registers: */
         unsigned long           debugreg0;
@@ -460,14 +462,8 @@ struct thread_struct {
         unsigned                io_bitmap_max;
  /* MSR_IA32_DEBUGCTLMSR value to switch in if TIF_DEBUGCTLMSR is set.  */
         unsigned long   debugctlmsr;
-#ifdef CONFIG_X86_DS
-/* Debug Store context; see include/asm-x86/ds.h; goes into MSR_IA32_DS_AREA */
+       /* Debug Store context; see asm/ds.h */
         struct ds_context       *ds_ctx;
-#endif /* CONFIG_X86_DS */
-#ifdef CONFIG_X86_PTRACE_BTS
-/* the signal to send on a bts buffer overflow */
-       unsigned int    bts_ovfl_signal;
-#endif /* CONFIG_X86_PTRACE_BTS */
  };
  
  static inline unsigned long native_get_debugreg(int regno)
@@ -795,6 +791,21 @@ static inline unsigned long get_debugctlmsr(void)
      return debugctlmsr;
  }
  
+static inline unsigned long get_debugctlmsr_on_cpu(int cpu)
+{
+       u64 debugctlmsr = 0;
+       u32 val1, val2;
+
+#ifndef CONFIG_X86_DEBUGCTLMSR
+       if (boot_cpu_data.x86 < 6)
+               return 0;
+#endif
+       rdmsr_on_cpu(cpu, MSR_IA32_DEBUGCTLMSR, &val1, &val2);
+       debugctlmsr = val1 | ((u64)val2 << 32);
+
+       return debugctlmsr;
+}
+
  static inline void update_debugctlmsr(unsigned long debugctlmsr)
  {
  #ifndef CONFIG_X86_DEBUGCTLMSR
@@ -804,6 +815,18 @@ static inline void update_debugctlmsr(unsigned long debugctlmsr)
         wrmsrl(MSR_IA32_DEBUGCTLMSR, debugctlmsr);
  }
  
+static inline void update_debugctlmsr_on_cpu(int cpu,
+                                            unsigned long debugctlmsr)
+{
+#ifndef CONFIG_X86_DEBUGCTLMSR
+       if (boot_cpu_data.x86 < 6)
+               return;
+#endif
+       wrmsr_on_cpu(cpu, MSR_IA32_DEBUGCTLMSR,
+                    (u32)((u64)debugctlmsr),
+                    (u32)((u64)debugctlmsr >> 32));
+}
+
  /*
   * from system description table in BIOS. Mostly for MCA use, but
   * others may find it useful:
@@ -814,6 +837,7 @@ extern unsigned int         BIOS_revision;
  
  /* Boot loader type from the setup header: */
  extern int                     bootloader_type;
+extern int                     bootloader_version;
  
  extern char                    ignore_fpu_irq;
  
@@ -874,7 +898,6 @@ static inline void spin_lock_prefetch(const void *x)
         .vm86_info              = NULL,                                   \
         .sysenter_cs            = __KERNEL_CS,                            \
         .io_bitmap_ptr          = NULL,                                   \
-       .fs                     = __KERNEL_PERCPU,                        \
  }
  
  /*
diff --git a/arch/x86/include/asm/ptrace.h b/arch/x86/include/asm/ptrace.h

index 624f133943ed71293aaba1477b9ca20ad0836da8..0f0d908349aa3f375e87f68802cbcb2e7753f725 100644 (file)
--- a/arch/x86/include/asm/ptrace.h
+++ b/arch/x86/include/asm/ptrace.h
@@ -236,12 +236,11 @@ extern int do_get_thread_area(struct task_struct *p, int idx,
  extern int do_set_thread_area(struct task_struct *p, int idx,
                               struct user_desc __user *info, int can_allocate);
  
-extern void x86_ptrace_untrace(struct task_struct *);
-extern void x86_ptrace_fork(struct task_struct *child,
-                           unsigned long clone_flags);
+#ifdef CONFIG_X86_PTRACE_BTS
+extern void ptrace_bts_untrace(struct task_struct *tsk);
  
-#define arch_ptrace_untrace(tsk) x86_ptrace_untrace(tsk)
-#define arch_ptrace_fork(child, flags) x86_ptrace_fork(child, flags)
+#define arch_ptrace_untrace(tsk)       ptrace_bts_untrace(tsk)
+#endif /* CONFIG_X86_PTRACE_BTS */
  
  #endif /* __KERNEL__ */
  
diff --git a/arch/x86/include/asm/required-features.h b/arch/x86/include/asm/required-features.h

index a4737dddfd5878da05d20fb819592fc58a117a47..64cf2d24fad1c2605c16b528cc10e8e37c9bc77c 100644 (file)
--- a/arch/x86/include/asm/required-features.h
+++ b/arch/x86/include/asm/required-features.h
@@ -48,9 +48,15 @@
  #endif
  
  #ifdef CONFIG_X86_64
+#ifdef CONFIG_PARAVIRT
+/* Paravirtualized systems may not have PSE or PGE available */
  #define NEED_PSE       0
-#define NEED_MSR       (1<<(X86_FEATURE_MSR & 31))
  #define NEED_PGE       0
+#else
+#define NEED_PSE       (1<<(X86_FEATURE_PSE) & 31)
+#define NEED_PGE       (1<<(X86_FEATURE_PGE) & 31)
+#endif
+#define NEED_MSR       (1<<(X86_FEATURE_MSR & 31))
  #define NEED_FXSR      (1<<(X86_FEATURE_FXSR & 31))
  #define NEED_XMM       (1<<(X86_FEATURE_XMM & 31))
  #define NEED_XMM2      (1<<(X86_FEATURE_XMM2 & 31))
diff --git a/arch/x86/include/asm/setup.h b/arch/x86/include/asm/setup.h

index bdc2ada05ae06056ad95e79bd8d6434d73510f5a..4093d1ed6db2a0b3eebc2983337a1592b673d5bc 100644 (file)
--- a/arch/x86/include/asm/setup.h
+++ b/arch/x86/include/asm/setup.h
@@ -33,7 +33,6 @@ struct x86_quirks {
         int (*setup_ioapic_ids)(void);
  };
  
-extern void x86_quirk_pre_intr_init(void);
  extern void x86_quirk_intr_init(void);
  
  extern void x86_quirk_trap_init(void);
diff --git a/arch/x86/include/asm/smp.h b/arch/x86/include/asm/smp.h

index 19e0d88b966d7b9154f46d9426450765dcf3d389..6a84ed166aec136334a644cc4fbaa676368ae983 100644 (file)
--- a/arch/x86/include/asm/smp.h
+++ b/arch/x86/include/asm/smp.h
@@ -180,7 +180,7 @@ extern int safe_smp_processor_id(void);
  static inline int logical_smp_processor_id(void)
  {
         /* we don't want to mark this access volatile - bad code generation */
-       return GET_APIC_LOGICAL_ID(*(u32 *)(APIC_BASE + APIC_LDR));
+       return GET_APIC_LOGICAL_ID(apic_read(APIC_LDR));
  }
  
  #endif
diff --git a/arch/x86/include/asm/sparsemem.h b/arch/x86/include/asm/sparsemem.h

index e3cc3c063ec5e77683c088fd80906de90fbf9bd2..4517d6b93188aa55a775be95561b9f4bee045d42 100644 (file)
--- a/arch/x86/include/asm/sparsemem.h
+++ b/arch/x86/include/asm/sparsemem.h
@@ -27,7 +27,7 @@
  #else /* CONFIG_X86_32 */
  # define SECTION_SIZE_BITS     27 /* matt - 128 is convenient right now */
  # define MAX_PHYSADDR_BITS     44
-# define MAX_PHYSMEM_BITS      44 /* Can be max 45 bits */
+# define MAX_PHYSMEM_BITS      46
  #endif
  
  #endif /* CONFIG_SPARSEMEM */
diff --git a/arch/x86/include/asm/syscalls.h b/arch/x86/include/asm/syscalls.h

index 7043408f6904a9f2d94c817b3488f80fbb4976df..372b76edd63f69053c76bcf7501eb71c35bb8f93 100644 (file)
--- a/arch/x86/include/asm/syscalls.h
+++ b/arch/x86/include/asm/syscalls.h
@@ -1,7 +1,7 @@
  /*
   * syscalls.h - Linux syscall interfaces (arch-specific)
   *
- * Copyright (c) 2008 Jaswinder Singh
+ * Copyright (c) 2008 Jaswinder Singh Rajput
   *
   * This file is released under the GPLv2.
   * See the file COPYING for more details.
@@ -12,50 +12,55 @@
  
  #include <linux/compiler.h>
  #include <linux/linkage.h>
-#include <linux/types.h>
  #include <linux/signal.h>
+#include <linux/types.h>
  
  /* Common in X86_32 and X86_64 */
  /* kernel/ioport.c */
  asmlinkage long sys_ioperm(unsigned long, unsigned long, int);
  
+/* kernel/process.c */
+int sys_fork(struct pt_regs *);
+int sys_vfork(struct pt_regs *);
+
  /* kernel/ldt.c */
  asmlinkage int sys_modify_ldt(int, void __user *, unsigned long);
  
+/* kernel/signal.c */
+long sys_rt_sigreturn(struct pt_regs *);
+
  /* kernel/tls.c */
  asmlinkage int sys_set_thread_area(struct user_desc __user *);
  asmlinkage int sys_get_thread_area(struct user_desc __user *);
  
  /* X86_32 only */
  #ifdef CONFIG_X86_32
+/* kernel/ioport.c */
+long sys_iopl(struct pt_regs *);
+
  /* kernel/process_32.c */
-int sys_fork(struct pt_regs *);
  int sys_clone(struct pt_regs *);
-int sys_vfork(struct pt_regs *);
  int sys_execve(struct pt_regs *);
  
-/* kernel/signal_32.c */
+/* kernel/signal.c */
  asmlinkage int sys_sigsuspend(int, int, old_sigset_t);
  asmlinkage int sys_sigaction(int, const struct old_sigaction __user *,
                              struct old_sigaction __user *);
  int sys_sigaltstack(struct pt_regs *);
  unsigned long sys_sigreturn(struct pt_regs *);
-long sys_rt_sigreturn(struct pt_regs *);
-
-/* kernel/ioport.c */
-long sys_iopl(struct pt_regs *);
  
  /* kernel/sys_i386_32.c */
+struct mmap_arg_struct;
+struct sel_arg_struct;
+struct oldold_utsname;
+struct old_utsname;
+
  asmlinkage long sys_mmap2(unsigned long, unsigned long, unsigned long,
                           unsigned long, unsigned long, unsigned long);
-struct mmap_arg_struct;
  asmlinkage int old_mmap(struct mmap_arg_struct __user *);
-struct sel_arg_struct;
  asmlinkage int old_select(struct sel_arg_struct __user *);
  asmlinkage int sys_ipc(uint, int, int, int, void __user *, long);
-struct old_utsname;
  asmlinkage int sys_uname(struct old_utsname __user *);
-struct oldold_utsname;
  asmlinkage int sys_olduname(struct oldold_utsname __user *);
  
  /* kernel/vm86_32.c */
@@ -65,29 +70,27 @@ int sys_vm86(struct pt_regs *);
  #else /* CONFIG_X86_32 */
  
  /* X86_64 only */
+/* kernel/ioport.c */
+asmlinkage long sys_iopl(unsigned int, struct pt_regs *);
+
  /* kernel/process_64.c */
-asmlinkage long sys_fork(struct pt_regs *);
  asmlinkage long sys_clone(unsigned long, unsigned long,
                           void __user *, void __user *,
                           struct pt_regs *);
-asmlinkage long sys_vfork(struct pt_regs *);
  asmlinkage long sys_execve(char __user *, char __user * __user *,
                            char __user * __user *,
                            struct pt_regs *);
  long sys_arch_prctl(int, unsigned long);
  
-/* kernel/ioport.c */
-asmlinkage long sys_iopl(unsigned int, struct pt_regs *);
-
-/* kernel/signal_64.c */
+/* kernel/signal.c */
  asmlinkage long sys_sigaltstack(const stack_t __user *, stack_t __user *,
                                 struct pt_regs *);
-long sys_rt_sigreturn(struct pt_regs *);
  
  /* kernel/sys_x86_64.c */
+struct new_utsname;
+
  asmlinkage long sys_mmap(unsigned long, unsigned long, unsigned long,
                          unsigned long, unsigned long, unsigned long);
-struct new_utsname;
  asmlinkage long sys_uname(struct new_utsname __user *);
  
  #endif /* CONFIG_X86_32 */
diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h

index 8820a73ae090aae29aa3454d6f3241ffcbef5440..602c769fc98ca7340f5388fe21e57ce5431b376e 100644 (file)
--- a/arch/x86/include/asm/thread_info.h
+++ b/arch/x86/include/asm/thread_info.h
@@ -94,7 +94,8 @@ struct thread_info {
  #define TIF_FORCED_TF          24      /* true if TF in eflags artificially */
  #define TIF_DEBUGCTLMSR                25      /* uses thread_struct.debugctlmsr */
  #define TIF_DS_AREA_MSR                26      /* uses thread_struct.ds_area_msr */
-#define TIF_SYSCALL_FTRACE     27      /* for ftrace syscall instrumentation */
+#define TIF_LAZY_MMU_UPDATES   27      /* task is updating the mmu lazily */
+#define TIF_SYSCALL_FTRACE     28      /* for ftrace syscall instrumentation */
  
  #define _TIF_SYSCALL_TRACE     (1 << TIF_SYSCALL_TRACE)
  #define _TIF_NOTIFY_RESUME     (1 << TIF_NOTIFY_RESUME)
@@ -116,6 +117,7 @@ struct thread_info {
  #define _TIF_FORCED_TF         (1 << TIF_FORCED_TF)
  #define _TIF_DEBUGCTLMSR       (1 << TIF_DEBUGCTLMSR)
  #define _TIF_DS_AREA_MSR       (1 << TIF_DS_AREA_MSR)
+#define _TIF_LAZY_MMU_UPDATES  (1 << TIF_LAZY_MMU_UPDATES)
  #define _TIF_SYSCALL_FTRACE    (1 << TIF_SYSCALL_FTRACE)
  
  /* work to do in syscall_trace_enter() */
diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h

index 16a5c84b032997f264f704ac68537dc65ba1dbe6..a5ecc9c33e920eda5e50bcfe0313b333b1b2cd31 100644 (file)
--- a/arch/x86/include/asm/tlbflush.h
+++ b/arch/x86/include/asm/tlbflush.h
@@ -17,7 +17,7 @@
  
  static inline void __native_flush_tlb(void)
  {
-       write_cr3(read_cr3());
+       native_write_cr3(native_read_cr3());
  }
  
  static inline void __native_flush_tlb_global(void)
@@ -32,11 +32,11 @@ static inline void __native_flush_tlb_global(void)
          */
         raw_local_irq_save(flags);
  
-       cr4 = read_cr4();
+       cr4 = native_read_cr4();
         /* clear PGE */
-       write_cr4(cr4 & ~X86_CR4_PGE);
+       native_write_cr4(cr4 & ~X86_CR4_PGE);
         /* write old PGE again and flush TLBs */
-       write_cr4(cr4);
+       native_write_cr4(cr4);
  
         raw_local_irq_restore(flags);
  }
diff --git a/arch/x86/include/asm/topology.h b/arch/x86/include/asm/topology.h

index f44b49abca49b93c1f9386f65e0b2b5df118a354..066ef590d7e054b7ac7c55c99d235a0b4bb73896 100644 (file)
--- a/arch/x86/include/asm/topology.h
+++ b/arch/x86/include/asm/topology.h
@@ -203,7 +203,8 @@ struct pci_bus;
  void x86_pci_root_bus_res_quirks(struct pci_bus *b);
  
  #ifdef CONFIG_SMP
-#define mc_capable()   (cpumask_weight(cpu_core_mask(0)) != nr_cpu_ids)
+#define mc_capable()   ((boot_cpu_data.x86_max_cores > 1) && \
+                       (cpumask_weight(cpu_core_mask(0)) != nr_cpu_ids))
  #define smt_capable()                  (smp_num_siblings > 1)
  #endif
  
diff --git a/arch/x86/include/asm/traps.h b/arch/x86/include/asm/traps.h

index 0d5342515b8696970ec533572055df8b98ef74a9..bfd74c032fcaa33b7c50b066fbc5f9333ca6387d 100644 (file)
--- a/arch/x86/include/asm/traps.h
+++ b/arch/x86/include/asm/traps.h
@@ -2,6 +2,7 @@
  #define _ASM_X86_TRAPS_H
  
  #include <asm/debugreg.h>
+#include <asm/siginfo.h>                       /* TRAP_TRACE, ... */
  
  #ifdef CONFIG_X86_32
  #define dotraplinkage
@@ -13,6 +14,9 @@ asmlinkage void divide_error(void);
  asmlinkage void debug(void);
  asmlinkage void nmi(void);
  asmlinkage void int3(void);
+asmlinkage void xen_debug(void);
+asmlinkage void xen_int3(void);
+asmlinkage void xen_stack_segment(void);
  asmlinkage void overflow(void);
  asmlinkage void bounds(void);
  asmlinkage void invalid_op(void);
@@ -74,7 +78,6 @@ static inline int get_si_code(unsigned long condition)
  }
  
  extern int panic_on_unrecovered_nmi;
-extern int kstack_depth_to_print;
  
  void math_error(void __user *);
  void math_emulate(struct math_emu_info *);
diff --git a/arch/x86/include/asm/unistd_32.h b/arch/x86/include/asm/unistd_32.h

index 6e72d74cf8dc74b7720f5cb79ba355a926e7fa6e..732a307061537241da1ec570e6bf03219623e5c0 100644 (file)
--- a/arch/x86/include/asm/unistd_32.h
+++ b/arch/x86/include/asm/unistd_32.h
@@ -340,6 +340,8 @@
  #define __NR_inotify_init1     332
  #define __NR_preadv            333
  #define __NR_pwritev           334
+#define __NR_rt_tgsigqueueinfo 335
+#define __NR_perf_counter_open 336
  
  #ifdef __KERNEL__
  
diff --git a/arch/x86/include/asm/unistd_64.h b/arch/x86/include/asm/unistd_64.h

index f81829462325f6328a6e6d9c3667da02e9f616d0..900e1617e6722f8ecf2bac1f90af9286c39c54d9 100644 (file)
--- a/arch/x86/include/asm/unistd_64.h
+++ b/arch/x86/include/asm/unistd_64.h
@@ -657,7 +657,10 @@ __SYSCALL(__NR_inotify_init1, sys_inotify_init1)
  __SYSCALL(__NR_preadv, sys_preadv)
  #define __NR_pwritev                           296
  __SYSCALL(__NR_pwritev, sys_pwritev)
-
+#define __NR_rt_tgsigqueueinfo                 297
+__SYSCALL(__NR_rt_tgsigqueueinfo, sys_rt_tgsigqueueinfo)
+#define __NR_perf_counter_open                 298
+__SYSCALL(__NR_perf_counter_open, sys_perf_counter_open)
  
  #ifndef __NO_STUBS
  #define __ARCH_WANT_OLD_READDIR
diff --git a/arch/x86/include/asm/uv/uv_bau.h b/arch/x86/include/asm/uv/uv_bau.h

index 9b0e61bf7a88dc2a68521384ea68950cd900478e..bddd44f2f0ab5b3dc7dc0e8ad19fe6df52a0525a 100644 (file)
--- a/arch/x86/include/asm/uv/uv_bau.h
+++ b/arch/x86/include/asm/uv/uv_bau.h
@@ -37,7 +37,7 @@
  #define UV_CPUS_PER_ACT_STATUS         32
  #define UV_ACT_STATUS_MASK             0x3
  #define UV_ACT_STATUS_SIZE             2
-#define UV_ACTIVATION_DESCRIPTOR_SIZE  32
+#define UV_ADP_SIZE                    32
  #define UV_DISTRIBUTION_SIZE           256
  #define UV_SW_ACK_NPENDING             8
  #define UV_NET_ENDPOINT_INTD           0x38
diff --git a/arch/x86/include/asm/uv/uv_hub.h b/arch/x86/include/asm/uv/uv_hub.h

index d3a98ea1062ef936c7825f1387a4d2f4b83c65b9..341070f7ad5cb62679f22f7b3fff7316c549b5d9 100644 (file)
--- a/arch/x86/include/asm/uv/uv_hub.h
+++ b/arch/x86/include/asm/uv/uv_hub.h
@@ -133,6 +133,7 @@ struct uv_scir_s {
  struct uv_hub_info_s {
         unsigned long           global_mmr_base;
         unsigned long           gpa_mask;
+       unsigned int            gnode_extra;
         unsigned long           gnode_upper;
         unsigned long           lowmem_remap_top;
         unsigned long           lowmem_remap_base;
@@ -159,7 +160,8 @@ DECLARE_PER_CPU(struct uv_hub_info_s, __uv_hub_info);
   *             p -  PNODE (local part of nsids, right shifted 1)
   */
  #define UV_NASID_TO_PNODE(n)           (((n) >> 1) & uv_hub_info->pnode_mask)
-#define UV_PNODE_TO_NASID(p)           (((p) << 1) | uv_hub_info->gnode_upper)
+#define UV_PNODE_TO_GNODE(p)           ((p) |uv_hub_info->gnode_extra)
+#define UV_PNODE_TO_NASID(p)           (UV_PNODE_TO_GNODE(p) << 1)
  
  #define UV_LOCAL_MMR_BASE              0xf4000000UL
  #define UV_GLOBAL_MMR32_BASE           0xf8000000UL
@@ -173,7 +175,7 @@ DECLARE_PER_CPU(struct uv_hub_info_s, __uv_hub_info);
  #define UV_GLOBAL_MMR32_PNODE_BITS(p)  ((p) << (UV_GLOBAL_MMR32_PNODE_SHIFT))
  
  #define UV_GLOBAL_MMR64_PNODE_BITS(p)                                  \
-       ((unsigned long)(p) << UV_GLOBAL_MMR64_PNODE_SHIFT)
+       ((unsigned long)(UV_PNODE_TO_GNODE(p)) << UV_GLOBAL_MMR64_PNODE_SHIFT)
  
  #define UV_APIC_PNODE_SHIFT    6
  
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile

index 88d1bfc847d30fc6b87007648fedc9970856e1af..4f78bd682125067c07fbf9c11a1b1bc1c40a2dd6 100644 (file)
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -28,7 +28,7 @@ CFLAGS_paravirt.o     := $(nostackp)
  obj-y                  := process_$(BITS).o signal.o entry_$(BITS).o
  obj-y                  += traps.o irq.o irq_$(BITS).o dumpstack_$(BITS).o
  obj-y                  += time_$(BITS).o ioport.o ldt.o dumpstack.o
-obj-y                  += setup.o i8259.o irqinit_$(BITS).o
+obj-y                  += setup.o i8259.o irqinit.o
  obj-$(CONFIG_X86_VISWS)        += visws_quirks.o
  obj-$(CONFIG_X86_32)   += probe_roms_32.o
  obj-$(CONFIG_X86_32)   += sys_i386_32.o i386_ksyms_32.o
@@ -44,6 +44,7 @@ obj-y                         += process.o
  obj-y                          += i387.o xsave.o
  obj-y                          += ptrace.o
  obj-$(CONFIG_X86_DS)           += ds.o
+obj-$(CONFIG_X86_DS_SELFTEST)          += ds_selftest.o
  obj-$(CONFIG_X86_32)           += tls.o
  obj-$(CONFIG_IA32_EMULATION)   += tls.o
  obj-y                          += step.o
diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c

index 723989d7f8029711c0a6fda92e8016101ac2ede7..631086159c53b0be5f28df92d3e2ed976cffa7a8 100644 (file)
--- a/arch/x86/kernel/acpi/boot.c
+++ b/arch/x86/kernel/acpi/boot.c
@@ -33,6 +33,7 @@
  #include <linux/irq.h>
  #include <linux/bootmem.h>
  #include <linux/ioport.h>
+#include <linux/pci.h>
  
  #include <asm/pgtable.h>
  #include <asm/io_apic.h>
@@ -522,7 +523,7 @@ int acpi_gsi_to_irq(u32 gsi, unsigned int *irq)
   * success: return IRQ number (>=0)
   * failure: return < 0
   */
-int acpi_register_gsi(u32 gsi, int triggering, int polarity)
+int acpi_register_gsi(struct device *dev, u32 gsi, int trigger, int polarity)
  {
         unsigned int irq;
         unsigned int plat_gsi = gsi;
@@ -532,14 +533,14 @@ int acpi_register_gsi(u32 gsi, int triggering, int polarity)
          * Make sure all (legacy) PCI IRQs are set as level-triggered.
          */
         if (acpi_irq_model == ACPI_IRQ_MODEL_PIC) {
-               if (triggering == ACPI_LEVEL_SENSITIVE)
+               if (trigger == ACPI_LEVEL_SENSITIVE)
                         eisa_set_level_irq(gsi);
         }
  #endif
  
  #ifdef CONFIG_X86_IO_APIC
         if (acpi_irq_model == ACPI_IRQ_MODEL_IOAPIC) {
-               plat_gsi = mp_register_gsi(gsi, triggering, polarity);
+               plat_gsi = mp_register_gsi(dev, gsi, trigger, polarity);
         }
  #endif
         acpi_gsi_to_irq(plat_gsi, &irq);
@@ -903,10 +904,8 @@ extern int es7000_plat;
  #endif
  
  static struct {
-       int apic_id;
         int gsi_base;
         int gsi_end;
-       DECLARE_BITMAP(pin_programmed, MP_MAX_IOAPIC_PIN + 1);
  } mp_ioapic_routing[MAX_IO_APICS];
  
  int mp_find_ioapic(int gsi)
@@ -986,16 +985,12 @@ void __init mp_register_ioapic(int id, u32 address, u32 gsi_base)
  
         set_fixmap_nocache(FIX_IO_APIC_BASE_0 + idx, address);
         mp_ioapics[idx].apicid = uniq_ioapic_id(id);
-#ifdef CONFIG_X86_32
         mp_ioapics[idx].apicver = io_apic_get_version(idx);
-#else
-       mp_ioapics[idx].apicver = 0;
-#endif
+
         /*
          * Build basic GSI lookup table to facilitate gsi->io_apic lookups
          * and to prevent reprogramming of IOAPIC pins (PCI GSIs).
          */
-       mp_ioapic_routing[idx].apic_id = mp_ioapics[idx].apicid;
         mp_ioapic_routing[idx].gsi_base = gsi_base;
         mp_ioapic_routing[idx].gsi_end = gsi_base +
             io_apic_get_redir_entries(idx);
@@ -1158,26 +1153,52 @@ void __init mp_config_acpi_legacy_irqs(void)
         }
  }
  
-int mp_register_gsi(u32 gsi, int triggering, int polarity)
+static int mp_config_acpi_gsi(struct device *dev, u32 gsi, int trigger,
+                       int polarity)
  {
+#ifdef CONFIG_X86_MPPARSE
+       struct mpc_intsrc mp_irq;
+       struct pci_dev *pdev;
+       unsigned char number;
+       unsigned int devfn;
         int ioapic;
-       int ioapic_pin;
-#ifdef CONFIG_X86_32
-#define MAX_GSI_NUM    4096
-#define IRQ_COMPRESSION_START  64
+       u8 pin;
  
-       static int pci_irq = IRQ_COMPRESSION_START;
-       /*
-        * Mapping between Global System Interrupts, which
-        * represent all possible interrupts, and IRQs
-        * assigned to actual devices.
-        */
-       static int gsi_to_irq[MAX_GSI_NUM];
-#else
+       if (!acpi_ioapic)
+               return 0;
+       if (!dev)
+               return 0;
+       if (dev->bus != &pci_bus_type)
+               return 0;
+
+       pdev = to_pci_dev(dev);
+       number = pdev->bus->number;
+       devfn = pdev->devfn;
+       pin = pdev->pin;
+       /* print the entry should happen on mptable identically */
+       mp_irq.type = MP_INTSRC;
+       mp_irq.irqtype = mp_INT;
+       mp_irq.irqflag = (trigger == ACPI_EDGE_SENSITIVE ? 4 : 0x0c) |
+                               (polarity == ACPI_ACTIVE_HIGH ? 1 : 3);
+       mp_irq.srcbus = number;
+       mp_irq.srcbusirq = (((devfn >> 3) & 0x1f) << 2) | ((pin - 1) & 3);
+       ioapic = mp_find_ioapic(gsi);
+       mp_irq.dstapic = mp_ioapics[ioapic].apicid;
+       mp_irq.dstirq = mp_find_ioapic_pin(ioapic, gsi);
+
+       save_mp_irq(&mp_irq);
+#endif
+       return 0;
+}
+
+int mp_register_gsi(struct device *dev, u32 gsi, int trigger, int polarity)
+{
+       int ioapic;
+       int ioapic_pin;
+       struct io_apic_irq_attr irq_attr;
  
         if (acpi_irq_model != ACPI_IRQ_MODEL_IOAPIC)
                 return gsi;
-#endif
  
         /* Don't set up the ACPI SCI because it's already set up */
         if (acpi_gbl_FADT.sci_interrupt == gsi)
@@ -1196,93 +1217,22 @@ int mp_register_gsi(u32 gsi, int triggering, int polarity)
                 gsi = ioapic_renumber_irq(ioapic, gsi);
  #endif
  
-       /*
-        * Avoid pin reprogramming.  PRTs typically include entries
-        * with redundant pin->gsi mappings (but unique PCI devices);
-        * we only program the IOAPIC on the first.
-        */
         if (ioapic_pin > MP_MAX_IOAPIC_PIN) {
                 printk(KERN_ERR "Invalid reference to IOAPIC pin "
-                      "%d-%d\n", mp_ioapic_routing[ioapic].apic_id,
+                      "%d-%d\n", mp_ioapics[ioapic].apicid,
                        ioapic_pin);
                 return gsi;
         }
-       if (test_bit(ioapic_pin, mp_ioapic_routing[ioapic].pin_programmed)) {
-               pr_debug("Pin %d-%d already programmed\n",
-                        mp_ioapic_routing[ioapic].apic_id, ioapic_pin);
-#ifdef CONFIG_X86_32
-               return (gsi < IRQ_COMPRESSION_START ? gsi : gsi_to_irq[gsi]);
-#else
-               return gsi;
-#endif
-       }
-
-       set_bit(ioapic_pin, mp_ioapic_routing[ioapic].pin_programmed);
-#ifdef CONFIG_X86_32
-       /*
-        * For GSI >= 64, use IRQ compression
-        */
-       if ((gsi >= IRQ_COMPRESSION_START)
-           && (triggering == ACPI_LEVEL_SENSITIVE)) {
-               /*
-                * For PCI devices assign IRQs in order, avoiding gaps
-                * due to unused I/O APIC pins.
-                */
-               int irq = gsi;
-               if (gsi < MAX_GSI_NUM) {
-                       /*
-                        * Retain the VIA chipset work-around (gsi > 15), but
-                        * avoid a problem where the 8254 timer (IRQ0) is setup
-                        * via an override (so it's not on pin 0 of the ioapic),
-                        * and at the same time, the pin 0 interrupt is a PCI
-                        * type.  The gsi > 15 test could cause these two pins
-                        * to be shared as IRQ0, and they are not shareable.
-                        * So test for this condition, and if necessary, avoid
-                        * the pin collision.
-                        */
-                       gsi = pci_irq++;
-                       /*
-                        * Don't assign IRQ used by ACPI SCI
-                        */
-                       if (gsi == acpi_gbl_FADT.sci_interrupt)
-                               gsi = pci_irq++;
-                       gsi_to_irq[irq] = gsi;
-               } else {
-                       printk(KERN_ERR "GSI %u is too high\n", gsi);
-                       return gsi;
-               }
-       }
-#endif
-       io_apic_set_pci_routing(ioapic, ioapic_pin, gsi,
-                               triggering == ACPI_EDGE_SENSITIVE ? 0 : 1,
-                               polarity == ACPI_ACTIVE_HIGH ? 0 : 1);
-       return gsi;
-}
  
-int mp_config_acpi_gsi(unsigned char number, unsigned int devfn, u8 pin,
-                       u32 gsi, int triggering, int polarity)
-{
-#ifdef CONFIG_X86_MPPARSE
-       struct mpc_intsrc mp_irq;
-       int ioapic;
+       if (enable_update_mptable)
+               mp_config_acpi_gsi(dev, gsi, trigger, polarity);
  
-       if (!acpi_ioapic)
-               return 0;
+       set_io_apic_irq_attr(&irq_attr, ioapic, ioapic_pin,
+                            trigger == ACPI_EDGE_SENSITIVE ? 0 : 1,
+                            polarity == ACPI_ACTIVE_HIGH ? 0 : 1);
+       io_apic_set_pci_routing(dev, gsi, &irq_attr);
  
-       /* print the entry should happen on mptable identically */
-       mp_irq.type = MP_INTSRC;
-       mp_irq.irqtype = mp_INT;
-       mp_irq.irqflag = (triggering == ACPI_EDGE_SENSITIVE ? 4 : 0x0c) |
-                               (polarity == ACPI_ACTIVE_HIGH ? 1 : 3);
-       mp_irq.srcbus = number;
-       mp_irq.srcbusirq = (((devfn >> 3) & 0x1f) << 2) | ((pin - 1) & 3);
-       ioapic = mp_find_ioapic(gsi);
-       mp_irq.dstapic = mp_ioapic_routing[ioapic].apic_id;
-       mp_irq.dstirq = mp_find_ioapic_pin(ioapic, gsi);
-
-       save_mp_irq(&mp_irq);
-#endif
-       return 0;
+       return gsi;
  }
  
  /*
diff --git a/arch/x86/kernel/acpi/realmode/Makefile b/arch/x86/kernel/acpi/realmode/Makefile

index 1c31cc0e9def070e26288672801396d042fa604a..167bc16ce0e59ca080112db6d232f270df7ceea0 100644 (file)
--- a/arch/x86/kernel/acpi/realmode/Makefile
+++ b/arch/x86/kernel/acpi/realmode/Makefile
@@ -9,7 +9,7 @@
  always         := wakeup.bin
  targets                := wakeup.elf wakeup.lds
  
-wakeup-y       += wakeup.o wakemain.o video-mode.o copy.o
+wakeup-y       += wakeup.o wakemain.o video-mode.o copy.o bioscall.o regs.o
  
  # The link order of the video-*.o modules can matter.  In particular,
  # video-vga.o *must* be listed first, followed by video-vesa.o.
diff --git a/arch/x86/kernel/acpi/realmode/bioscall.S b/arch/x86/kernel/acpi/realmode/bioscall.S

new file mode 100644 (file)

index 0000000..f51eb0b
--- /dev/null
+++ b/arch/x86/kernel/acpi/realmode/bioscall.S
@@ -0,0 +1 @@
+#include "../../../boot/bioscall.S"
diff --git a/arch/x86/kernel/acpi/realmode/regs.c b/arch/x86/kernel/acpi/realmode/regs.c

new file mode 100644 (file)

index 0000000..6206033
--- /dev/null
+++ b/arch/x86/kernel/acpi/realmode/regs.c
@@ -0,0 +1 @@
+#include "../../../boot/regs.c"
diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c

index a97db99dad52fedd4bbcf6fe9d46bed8784f5e1f..1c60554537c358ef37fa9352e5bfd624dadf3e1a 100644 (file)
--- a/arch/x86/kernel/amd_iommu.c
+++ b/arch/x86/kernel/amd_iommu.c
@@ -55,7 +55,16 @@ struct iommu_cmd {
  static int dma_ops_unity_map(struct dma_ops_domain *dma_dom,
                              struct unity_map_entry *e);
  static struct dma_ops_domain *find_protection_domain(u16 devid);
+static u64* alloc_pte(struct protection_domain *dom,
+                     unsigned long address, u64
+                     **pte_page, gfp_t gfp);
+static void dma_ops_reserve_addresses(struct dma_ops_domain *dom,
+                                     unsigned long start_page,
+                                     unsigned int pages);
  
+#ifndef BUS_NOTIFY_UNBOUND_DRIVER
+#define BUS_NOTIFY_UNBOUND_DRIVER 0x0005
+#endif
  
  #ifdef CONFIG_AMD_IOMMU_STATS
  
@@ -213,7 +222,7 @@ irqreturn_t amd_iommu_int_handler(int irq, void *data)
  {
         struct amd_iommu *iommu;
  
-       list_for_each_entry(iommu, &amd_iommu_list, list)
+       for_each_iommu(iommu)
                 iommu_poll_events(iommu);
  
         return IRQ_HANDLED;
@@ -440,7 +449,7 @@ static void iommu_flush_domain(u16 domid)
         __iommu_build_inv_iommu_pages(&cmd, CMD_INV_IOMMU_ALL_PAGES_ADDRESS,
                                       domid, 1, 1);
  
-       list_for_each_entry(iommu, &amd_iommu_list, list) {
+       for_each_iommu(iommu) {
                 spin_lock_irqsave(&iommu->lock, flags);
                 __iommu_queue_command(iommu, &cmd);
                 __iommu_completion_wait(iommu);
@@ -449,6 +458,35 @@ static void iommu_flush_domain(u16 domid)
         }
  }
  
+void amd_iommu_flush_all_domains(void)
+{
+       int i;
+
+       for (i = 1; i < MAX_DOMAIN_ID; ++i) {
+               if (!test_bit(i, amd_iommu_pd_alloc_bitmap))
+                       continue;
+               iommu_flush_domain(i);
+       }
+}
+
+void amd_iommu_flush_all_devices(void)
+{
+       struct amd_iommu *iommu;
+       int i;
+
+       for (i = 0; i <= amd_iommu_last_bdf; ++i) {
+               if (amd_iommu_pd_table[i] == NULL)
+                       continue;
+
+               iommu = amd_iommu_rlookup_table[i];
+               if (!iommu)
+                       continue;
+
+               iommu_queue_inv_dev_entry(iommu, i);
+               iommu_completion_wait(iommu);
+       }
+}
+
  /****************************************************************************
   *
   * The functions below are used the create the page table mappings for
@@ -468,7 +506,7 @@ static int iommu_map_page(struct protection_domain *dom,
                           unsigned long phys_addr,
                           int prot)
  {
-       u64 __pte, *pte, *page;
+       u64 __pte, *pte;
  
         bus_addr  = PAGE_ALIGN(bus_addr);
         phys_addr = PAGE_ALIGN(phys_addr);
@@ -477,27 +515,7 @@ static int iommu_map_page(struct protection_domain *dom,
         if (bus_addr > IOMMU_MAP_SIZE_L3 || !(prot & IOMMU_PROT_MASK))
                 return -EINVAL;
  
-       pte = &dom->pt_root[IOMMU_PTE_L2_INDEX(bus_addr)];
-
-       if (!IOMMU_PTE_PRESENT(*pte)) {
-               page = (u64 *)get_zeroed_page(GFP_KERNEL);
-               if (!page)
-                       return -ENOMEM;
-               *pte = IOMMU_L2_PDE(virt_to_phys(page));
-       }
-
-       pte = IOMMU_PTE_PAGE(*pte);
-       pte = &pte[IOMMU_PTE_L1_INDEX(bus_addr)];
-
-       if (!IOMMU_PTE_PRESENT(*pte)) {
-               page = (u64 *)get_zeroed_page(GFP_KERNEL);
-               if (!page)
-                       return -ENOMEM;
-               *pte = IOMMU_L1_PDE(virt_to_phys(page));
-       }
-
-       pte = IOMMU_PTE_PAGE(*pte);
-       pte = &pte[IOMMU_PTE_L0_INDEX(bus_addr)];
+       pte = alloc_pte(dom, bus_addr, NULL, GFP_KERNEL);
  
         if (IOMMU_PTE_PRESENT(*pte))
                 return -EBUSY;
@@ -595,7 +613,8 @@ static int dma_ops_unity_map(struct dma_ops_domain *dma_dom,
                  * as allocated in the aperture
                  */
                 if (addr < dma_dom->aperture_size)
-                       __set_bit(addr >> PAGE_SHIFT, dma_dom->bitmap);
+                       __set_bit(addr >> PAGE_SHIFT,
+                                 dma_dom->aperture[0]->bitmap);
         }
  
         return 0;
@@ -632,42 +651,191 @@ static int init_unity_mappings_for_device(struct dma_ops_domain *dma_dom,
   ****************************************************************************/
  
  /*
- * The address allocator core function.
+ * The address allocator core functions.
   *
   * called with domain->lock held
   */
+
+/*
+ * This function checks if there is a PTE for a given dma address. If
+ * there is one, it returns the pointer to it.
+ */
+static u64* fetch_pte(struct protection_domain *domain,
+                     unsigned long address)
+{
+       u64 *pte;
+
+       pte = &domain->pt_root[IOMMU_PTE_L2_INDEX(address)];
+
+       if (!IOMMU_PTE_PRESENT(*pte))
+               return NULL;
+
+       pte = IOMMU_PTE_PAGE(*pte);
+       pte = &pte[IOMMU_PTE_L1_INDEX(address)];
+
+       if (!IOMMU_PTE_PRESENT(*pte))
+               return NULL;
+
+       pte = IOMMU_PTE_PAGE(*pte);
+       pte = &pte[IOMMU_PTE_L0_INDEX(address)];
+
+       return pte;
+}
+
+/*
+ * This function is used to add a new aperture range to an existing
+ * aperture in case of dma_ops domain allocation or address allocation
+ * failure.
+ */
+static int alloc_new_range(struct amd_iommu *iommu,
+                          struct dma_ops_domain *dma_dom,
+                          bool populate, gfp_t gfp)
+{
+       int index = dma_dom->aperture_size >> APERTURE_RANGE_SHIFT;
+       int i;
+
+#ifdef CONFIG_IOMMU_STRESS
+       populate = false;
+#endif
+
+       if (index >= APERTURE_MAX_RANGES)
+               return -ENOMEM;
+
+       dma_dom->aperture[index] = kzalloc(sizeof(struct aperture_range), gfp);
+       if (!dma_dom->aperture[index])
+               return -ENOMEM;
+
+       dma_dom->aperture[index]->bitmap = (void *)get_zeroed_page(gfp);
+       if (!dma_dom->aperture[index]->bitmap)
+               goto out_free;
+
+       dma_dom->aperture[index]->offset = dma_dom->aperture_size;
+
+       if (populate) {
+               unsigned long address = dma_dom->aperture_size;
+               int i, num_ptes = APERTURE_RANGE_PAGES / 512;
+               u64 *pte, *pte_page;
+
+               for (i = 0; i < num_ptes; ++i) {
+                       pte = alloc_pte(&dma_dom->domain, address,
+                                       &pte_page, gfp);
+                       if (!pte)
+                               goto out_free;
+
+                       dma_dom->aperture[index]->pte_pages[i] = pte_page;
+
+                       address += APERTURE_RANGE_SIZE / 64;
+               }
+       }
+
+       dma_dom->aperture_size += APERTURE_RANGE_SIZE;
+
+       /* Intialize the exclusion range if necessary */
+       if (iommu->exclusion_start &&
+           iommu->exclusion_start >= dma_dom->aperture[index]->offset &&
+           iommu->exclusion_start < dma_dom->aperture_size) {
+               unsigned long startpage = iommu->exclusion_start >> PAGE_SHIFT;
+               int pages = iommu_num_pages(iommu->exclusion_start,
+                                           iommu->exclusion_length,
+                                           PAGE_SIZE);
+               dma_ops_reserve_addresses(dma_dom, startpage, pages);
+       }
+
+       /*
+        * Check for areas already mapped as present in the new aperture
+        * range and mark those pages as reserved in the allocator. Such
+        * mappings may already exist as a result of requested unity
+        * mappings for devices.
+        */
+       for (i = dma_dom->aperture[index]->offset;
+            i < dma_dom->aperture_size;
+            i += PAGE_SIZE) {
+               u64 *pte = fetch_pte(&dma_dom->domain, i);
+               if (!pte || !IOMMU_PTE_PRESENT(*pte))
+                       continue;
+
+               dma_ops_reserve_addresses(dma_dom, i << PAGE_SHIFT, 1);
+       }
+
+       return 0;
+
+out_free:
+       free_page((unsigned long)dma_dom->aperture[index]->bitmap);
+
+       kfree(dma_dom->aperture[index]);
+       dma_dom->aperture[index] = NULL;
+
+       return -ENOMEM;
+}
+
+static unsigned long dma_ops_area_alloc(struct device *dev,
+                                       struct dma_ops_domain *dom,
+                                       unsigned int pages,
+                                       unsigned long align_mask,
+                                       u64 dma_mask,
+                                       unsigned long start)
+{
+       unsigned long next_bit = dom->next_address % APERTURE_RANGE_SIZE;
+       int max_index = dom->aperture_size >> APERTURE_RANGE_SHIFT;
+       int i = start >> APERTURE_RANGE_SHIFT;
+       unsigned long boundary_size;
+       unsigned long address = -1;
+       unsigned long limit;
+
+       next_bit >>= PAGE_SHIFT;
+
+       boundary_size = ALIGN(dma_get_seg_boundary(dev) + 1,
+                       PAGE_SIZE) >> PAGE_SHIFT;
+
+       for (;i < max_index; ++i) {
+               unsigned long offset = dom->aperture[i]->offset >> PAGE_SHIFT;
+
+               if (dom->aperture[i]->offset >= dma_mask)
+                       break;
+
+               limit = iommu_device_max_index(APERTURE_RANGE_PAGES, offset,
+                                              dma_mask >> PAGE_SHIFT);
+
+               address = iommu_area_alloc(dom->aperture[i]->bitmap,
+                                          limit, next_bit, pages, 0,
+                                           boundary_size, align_mask);
+               if (address != -1) {
+                       address = dom->aperture[i]->offset +
+                                 (address << PAGE_SHIFT);
+                       dom->next_address = address + (pages << PAGE_SHIFT);
+                       break;
+               }
+
+               next_bit = 0;
+       }
+
+       return address;
+}
+
  static unsigned long dma_ops_alloc_addresses(struct device *dev,
                                              struct dma_ops_domain *dom,
                                              unsigned int pages,
                                              unsigned long align_mask,
                                              u64 dma_mask)
  {
-       unsigned long limit;
         unsigned long address;
-       unsigned long boundary_size;
  
-       boundary_size = ALIGN(dma_get_seg_boundary(dev) + 1,
-                       PAGE_SIZE) >> PAGE_SHIFT;
-       limit = iommu_device_max_index(dom->aperture_size >> PAGE_SHIFT, 0,
-                                      dma_mask >> PAGE_SHIFT);
+#ifdef CONFIG_IOMMU_STRESS
+       dom->next_address = 0;
+       dom->need_flush = true;
+#endif
  
-       if (dom->next_bit >= limit) {
-               dom->next_bit = 0;
-               dom->need_flush = true;
-       }
+       address = dma_ops_area_alloc(dev, dom, pages, align_mask,
+                                    dma_mask, dom->next_address);
  
-       address = iommu_area_alloc(dom->bitmap, limit, dom->next_bit, pages,
-                                  0 , boundary_size, align_mask);
         if (address == -1) {
-               address = iommu_area_alloc(dom->bitmap, limit, 0, pages,
-                               0, boundary_size, align_mask);
+               dom->next_address = 0;
+               address = dma_ops_area_alloc(dev, dom, pages, align_mask,
+                                            dma_mask, 0);
                 dom->need_flush = true;
         }
  
-       if (likely(address != -1)) {
-               dom->next_bit = address + pages;
-               address <<= PAGE_SHIFT;
-       } else
+       if (unlikely(address == -1))
                 address = bad_dma_address;
  
         WARN_ON((address + (PAGE_SIZE*pages)) > dom->aperture_size);
@@ -684,11 +852,23 @@ static void dma_ops_free_addresses(struct dma_ops_domain *dom,
                                    unsigned long address,
                                    unsigned int pages)
  {
-       address >>= PAGE_SHIFT;
-       iommu_area_free(dom->bitmap, address, pages);
+       unsigned i = address >> APERTURE_RANGE_SHIFT;
+       struct aperture_range *range = dom->aperture[i];
  
-       if (address >= dom->next_bit)
+       BUG_ON(i >= APERTURE_MAX_RANGES || range == NULL);
+
+#ifdef CONFIG_IOMMU_STRESS
+       if (i < 4)
+               return;
+#endif
+
+       if (address >= dom->next_address)
                 dom->need_flush = true;
+
+       address = (address % APERTURE_RANGE_SIZE) >> PAGE_SHIFT;
+
+       iommu_area_free(range->bitmap, address, pages);
+
  }
  
  /****************************************************************************
@@ -736,12 +916,16 @@ static void dma_ops_reserve_addresses(struct dma_ops_domain *dom,
                                       unsigned long start_page,
                                       unsigned int pages)
  {
-       unsigned int last_page = dom->aperture_size >> PAGE_SHIFT;
+       unsigned int i, last_page = dom->aperture_size >> PAGE_SHIFT;
  
         if (start_page + pages > last_page)
                 pages = last_page - start_page;
  
-       iommu_area_reserve(dom->bitmap, start_page, pages);
+       for (i = start_page; i < start_page + pages; ++i) {
+               int index = i / APERTURE_RANGE_PAGES;
+               int page  = i % APERTURE_RANGE_PAGES;
+               __set_bit(page, dom->aperture[index]->bitmap);
+       }
  }
  
  static void free_pagetable(struct protection_domain *domain)
@@ -780,14 +964,19 @@ static void free_pagetable(struct protection_domain *domain)
   */
  static void dma_ops_domain_free(struct dma_ops_domain *dom)
  {
+       int i;
+
         if (!dom)
                 return;
  
         free_pagetable(&dom->domain);
  
-       kfree(dom->pte_pages);
-
-       kfree(dom->bitmap);
+       for (i = 0; i < APERTURE_MAX_RANGES; ++i) {
+               if (!dom->aperture[i])
+                       continue;
+               free_page((unsigned long)dom->aperture[i]->bitmap);
+               kfree(dom->aperture[i]);
+       }
  
         kfree(dom);
  }
@@ -797,19 +986,9 @@ static void dma_ops_domain_free(struct dma_ops_domain *dom)
   * It also intializes the page table and the address allocator data
   * structures required for the dma_ops interface
   */
-static struct dma_ops_domain *dma_ops_domain_alloc(struct amd_iommu *iommu,
-                                                  unsigned order)
+static struct dma_ops_domain *dma_ops_domain_alloc(struct amd_iommu *iommu)
  {
         struct dma_ops_domain *dma_dom;
-       unsigned i, num_pte_pages;
-       u64 *l2_pde;
-       u64 address;
-
-       /*
-        * Currently the DMA aperture must be between 32 MB and 1GB in size
-        */
-       if ((order < 25) || (order > 30))
-               return NULL;
  
         dma_dom = kzalloc(sizeof(struct dma_ops_domain), GFP_KERNEL);
         if (!dma_dom)
@@ -826,55 +1005,20 @@ static struct dma_ops_domain *dma_ops_domain_alloc(struct amd_iommu *iommu,
         dma_dom->domain.priv = dma_dom;
         if (!dma_dom->domain.pt_root)
                 goto free_dma_dom;
-       dma_dom->aperture_size = (1ULL << order);
-       dma_dom->bitmap = kzalloc(dma_dom->aperture_size / (PAGE_SIZE * 8),
-                                 GFP_KERNEL);
-       if (!dma_dom->bitmap)
-               goto free_dma_dom;
-       /*
-        * mark the first page as allocated so we never return 0 as
-        * a valid dma-address. So we can use 0 as error value
-        */
-       dma_dom->bitmap[0] = 1;
-       dma_dom->next_bit = 0;
  
         dma_dom->need_flush = false;
         dma_dom->target_dev = 0xffff;
  
-       /* Intialize the exclusion range if necessary */
-       if (iommu->exclusion_start &&
-           iommu->exclusion_start < dma_dom->aperture_size) {
-               unsigned long startpage = iommu->exclusion_start >> PAGE_SHIFT;
-               int pages = iommu_num_pages(iommu->exclusion_start,
-                                           iommu->exclusion_length,
-                                           PAGE_SIZE);
-               dma_ops_reserve_addresses(dma_dom, startpage, pages);
-       }
+       if (alloc_new_range(iommu, dma_dom, true, GFP_KERNEL))
+               goto free_dma_dom;
  
         /*
-        * At the last step, build the page tables so we don't need to
-        * allocate page table pages in the dma_ops mapping/unmapping
-        * path.
+        * mark the first page as allocated so we never return 0 as
+        * a valid dma-address. So we can use 0 as error value
          */
-       num_pte_pages = dma_dom->aperture_size / (PAGE_SIZE * 512);
-       dma_dom->pte_pages = kzalloc(num_pte_pages * sizeof(void *),
-                       GFP_KERNEL);
-       if (!dma_dom->pte_pages)
-               goto free_dma_dom;
-
-       l2_pde = (u64 *)get_zeroed_page(GFP_KERNEL);
-       if (l2_pde == NULL)
-               goto free_dma_dom;
+       dma_dom->aperture[0]->bitmap[0] = 1;
+       dma_dom->next_address = 0;
  
-       dma_dom->domain.pt_root[0] = IOMMU_L2_PDE(virt_to_phys(l2_pde));
-
-       for (i = 0; i < num_pte_pages; ++i) {
-               dma_dom->pte_pages[i] = (u64 *)get_zeroed_page(GFP_KERNEL);
-               if (!dma_dom->pte_pages[i])
-                       goto free_dma_dom;
-               address = virt_to_phys(dma_dom->pte_pages[i]);
-               l2_pde[i] = IOMMU_L1_PDE(address);
-       }
  
         return dma_dom;
  
@@ -983,7 +1127,6 @@ static int device_change_notifier(struct notifier_block *nb,
         struct protection_domain *domain;
         struct dma_ops_domain *dma_domain;
         struct amd_iommu *iommu;
-       int order = amd_iommu_aperture_order;
         unsigned long flags;
  
         if (devid > amd_iommu_last_bdf)
@@ -1002,17 +1145,7 @@ static int device_change_notifier(struct notifier_block *nb,
                           "to a non-dma-ops domain\n", dev_name(dev));
  
         switch (action) {
-       case BUS_NOTIFY_BOUND_DRIVER:
-               if (domain)
-                       goto out;
-               dma_domain = find_protection_domain(devid);
-               if (!dma_domain)
-                       dma_domain = iommu->default_dom;
-               attach_device(iommu, &dma_domain->domain, devid);
-               printk(KERN_INFO "AMD IOMMU: Using protection domain %d for "
-                      "device %s\n", dma_domain->domain.id, dev_name(dev));
-               break;
-       case BUS_NOTIFY_UNBIND_DRIVER:
+       case BUS_NOTIFY_UNBOUND_DRIVER:
                 if (!domain)
                         goto out;
                 detach_device(domain, devid);
@@ -1022,7 +1155,7 @@ static int device_change_notifier(struct notifier_block *nb,
                 dma_domain = find_protection_domain(devid);
                 if (dma_domain)
                         goto out;
-               dma_domain = dma_ops_domain_alloc(iommu, order);
+               dma_domain = dma_ops_domain_alloc(iommu);
                 if (!dma_domain)
                         goto out;
                 dma_domain->target_dev = devid;
@@ -1133,8 +1266,8 @@ static int get_device_resources(struct device *dev,
                         dma_dom = (*iommu)->default_dom;
                 *domain = &dma_dom->domain;
                 attach_device(*iommu, *domain, *bdf);
-               printk(KERN_INFO "AMD IOMMU: Using protection domain %d for "
-                               "device %s\n", (*domain)->id, dev_name(dev));
+               DUMP_printk("Using protection domain %d for device %s\n",
+                           (*domain)->id, dev_name(dev));
         }
  
         if (domain_for_device(_bdf) == NULL)
@@ -1143,6 +1276,66 @@ static int get_device_resources(struct device *dev,
         return 1;
  }
  
+/*
+ * If the pte_page is not yet allocated this function is called
+ */
+static u64* alloc_pte(struct protection_domain *dom,
+                     unsigned long address, u64 **pte_page, gfp_t gfp)
+{
+       u64 *pte, *page;
+
+       pte = &dom->pt_root[IOMMU_PTE_L2_INDEX(address)];
+
+       if (!IOMMU_PTE_PRESENT(*pte)) {
+               page = (u64 *)get_zeroed_page(gfp);
+               if (!page)
+                       return NULL;
+               *pte = IOMMU_L2_PDE(virt_to_phys(page));
+       }
+
+       pte = IOMMU_PTE_PAGE(*pte);
+       pte = &pte[IOMMU_PTE_L1_INDEX(address)];
+
+       if (!IOMMU_PTE_PRESENT(*pte)) {
+               page = (u64 *)get_zeroed_page(gfp);
+               if (!page)
+                       return NULL;
+               *pte = IOMMU_L1_PDE(virt_to_phys(page));
+       }
+
+       pte = IOMMU_PTE_PAGE(*pte);
+
+       if (pte_page)
+               *pte_page = pte;
+
+       pte = &pte[IOMMU_PTE_L0_INDEX(address)];
+
+       return pte;
+}
+
+/*
+ * This function fetches the PTE for a given address in the aperture
+ */
+static u64* dma_ops_get_pte(struct dma_ops_domain *dom,
+                           unsigned long address)
+{
+       struct aperture_range *aperture;
+       u64 *pte, *pte_page;
+
+       aperture = dom->aperture[APERTURE_RANGE_INDEX(address)];
+       if (!aperture)
+               return NULL;
+
+       pte = aperture->pte_pages[APERTURE_PAGE_INDEX(address)];
+       if (!pte) {
+               pte = alloc_pte(&dom->domain, address, &pte_page, GFP_ATOMIC);
+               aperture->pte_pages[APERTURE_PAGE_INDEX(address)] = pte_page;
+       } else
+               pte += IOMMU_PTE_L0_INDEX(address);
+
+       return pte;
+}
+
  /*
   * This is the generic map function. It maps one 4kb page at paddr to
   * the given address in the DMA address space for the domain.
@@ -1159,8 +1352,9 @@ static dma_addr_t dma_ops_domain_map(struct amd_iommu *iommu,
  
         paddr &= PAGE_MASK;
  
-       pte  = dom->pte_pages[IOMMU_PTE_L1_INDEX(address)];
-       pte += IOMMU_PTE_L0_INDEX(address);
+       pte  = dma_ops_get_pte(dom, address);
+       if (!pte)
+               return bad_dma_address;
  
         __pte = paddr | IOMMU_PTE_P | IOMMU_PTE_FC;
  
@@ -1185,14 +1379,20 @@ static void dma_ops_domain_unmap(struct amd_iommu *iommu,
                                  struct dma_ops_domain *dom,
                                  unsigned long address)
  {
+       struct aperture_range *aperture;
         u64 *pte;
  
         if (address >= dom->aperture_size)
                 return;
  
-       WARN_ON(address & ~PAGE_MASK || address >= dom->aperture_size);
+       aperture = dom->aperture[APERTURE_RANGE_INDEX(address)];
+       if (!aperture)
+               return;
+
+       pte  = aperture->pte_pages[APERTURE_PAGE_INDEX(address)];
+       if (!pte)
+               return;
  
-       pte  = dom->pte_pages[IOMMU_PTE_L1_INDEX(address)];
         pte += IOMMU_PTE_L0_INDEX(address);
  
         WARN_ON(!*pte);
@@ -1216,7 +1416,7 @@ static dma_addr_t __map_single(struct device *dev,
                                u64 dma_mask)
  {
         dma_addr_t offset = paddr & ~PAGE_MASK;
-       dma_addr_t address, start;
+       dma_addr_t address, start, ret;
         unsigned int pages;
         unsigned long align_mask = 0;
         int i;
@@ -1232,14 +1432,33 @@ static dma_addr_t __map_single(struct device *dev,
         if (align)
                 align_mask = (1UL << get_order(size)) - 1;
  
+retry:
         address = dma_ops_alloc_addresses(dev, dma_dom, pages, align_mask,
                                           dma_mask);
-       if (unlikely(address == bad_dma_address))
-               goto out;
+       if (unlikely(address == bad_dma_address)) {
+               /*
+                * setting next_address here will let the address
+                * allocator only scan the new allocated range in the
+                * first run. This is a small optimization.
+                */
+               dma_dom->next_address = dma_dom->aperture_size;
+
+               if (alloc_new_range(iommu, dma_dom, false, GFP_ATOMIC))
+                       goto out;
+
+               /*
+                * aperture was sucessfully enlarged by 128 MB, try
+                * allocation again
+                */
+               goto retry;
+       }
  
         start = address;
         for (i = 0; i < pages; ++i) {
-               dma_ops_domain_map(iommu, dma_dom, start, paddr, dir);
+               ret = dma_ops_domain_map(iommu, dma_dom, start, paddr, dir);
+               if (ret == bad_dma_address)
+                       goto out_unmap;
+
                 paddr += PAGE_SIZE;
                 start += PAGE_SIZE;
         }
@@ -1255,6 +1474,17 @@ static dma_addr_t __map_single(struct device *dev,
  
  out:
         return address;
+
+out_unmap:
+
+       for (--i; i >= 0; --i) {
+               start -= PAGE_SIZE;
+               dma_ops_domain_unmap(iommu, dma_dom, start);
+       }
+
+       dma_ops_free_addresses(dma_dom, address, pages);
+
+       return bad_dma_address;
  }
  
  /*
@@ -1537,8 +1767,10 @@ static void *alloc_coherent(struct device *dev, size_t size,
         *dma_addr = __map_single(dev, iommu, domain->priv, paddr,
                                  size, DMA_BIDIRECTIONAL, true, dma_mask);
  
-       if (*dma_addr == bad_dma_address)
+       if (*dma_addr == bad_dma_address) {
+               spin_unlock_irqrestore(&domain->lock, flags);
                 goto out_free;
+       }
  
         iommu_completion_wait(iommu);
  
@@ -1625,7 +1857,6 @@ static void prealloc_protection_domains(void)
         struct pci_dev *dev = NULL;
         struct dma_ops_domain *dma_dom;
         struct amd_iommu *iommu;
-       int order = amd_iommu_aperture_order;
         u16 devid;
  
         while ((dev = pci_get_device(PCI_ANY_ID, PCI_ANY_ID, dev)) != NULL) {
@@ -1638,7 +1869,7 @@ static void prealloc_protection_domains(void)
                 iommu = amd_iommu_rlookup_table[devid];
                 if (!iommu)
                         continue;
-               dma_dom = dma_ops_domain_alloc(iommu, order);
+               dma_dom = dma_ops_domain_alloc(iommu);
                 if (!dma_dom)
                         continue;
                 init_unity_mappings_for_device(dma_dom, devid);
@@ -1664,7 +1895,6 @@ static struct dma_map_ops amd_iommu_dma_ops = {
  int __init amd_iommu_init_dma_ops(void)
  {
         struct amd_iommu *iommu;
-       int order = amd_iommu_aperture_order;
         int ret;
  
         /*
@@ -1672,8 +1902,8 @@ int __init amd_iommu_init_dma_ops(void)
          * found in the system. Devices not assigned to any other
          * protection domain will be assigned to the default one.
          */
-       list_for_each_entry(iommu, &amd_iommu_list, list) {
-               iommu->default_dom = dma_ops_domain_alloc(iommu, order);
+       for_each_iommu(iommu) {
+               iommu->default_dom = dma_ops_domain_alloc(iommu);
                 if (iommu->default_dom == NULL)
                         return -ENOMEM;
                 iommu->default_dom->domain.flags |= PD_DEFAULT_MASK;
@@ -1710,7 +1940,7 @@ int __init amd_iommu_init_dma_ops(void)
  
  free_domains:
  
-       list_for_each_entry(iommu, &amd_iommu_list, list) {
+       for_each_iommu(iommu) {
                 if (iommu->default_dom)
                         dma_ops_domain_free(iommu->default_dom);
         }
@@ -1842,7 +2072,7 @@ static int amd_iommu_attach_device(struct iommu_domain *dom,
  
         old_domain = domain_for_device(devid);
         if (old_domain)
-               return -EBUSY;
+               detach_device(old_domain, devid);
  
         attach_device(iommu, domain, devid);
  
diff --git a/arch/x86/kernel/amd_iommu_init.c b/arch/x86/kernel/amd_iommu_init.c

index 8c0be0902dacb2cc4e766cbe851891bd889f08e7..238989ec077df9e0b669c2f2a65d8b820a833510 100644 (file)
--- a/arch/x86/kernel/amd_iommu_init.c
+++ b/arch/x86/kernel/amd_iommu_init.c
@@ -115,15 +115,21 @@ struct ivmd_header {
         u64 range_length;
  } __attribute__((packed));
  
+bool amd_iommu_dump;
+
  static int __initdata amd_iommu_detected;
  
  u16 amd_iommu_last_bdf;                        /* largest PCI device id we have
                                            to handle */
  LIST_HEAD(amd_iommu_unity_map);                /* a list of required unity mappings
                                            we find in ACPI */
-unsigned amd_iommu_aperture_order = 26; /* size of aperture in power of 2 */
+#ifdef CONFIG_IOMMU_STRESS
+bool amd_iommu_isolate = false;
+#else
  bool amd_iommu_isolate = true;         /* if true, device isolation is
                                            enabled */
+#endif
+
  bool amd_iommu_unmap_flush;            /* if true, flush on every unmap */
  
  LIST_HEAD(amd_iommu_list);             /* list of all AMD IOMMUs in the
@@ -175,7 +181,7 @@ static inline void update_last_devid(u16 devid)
  static inline unsigned long tbl_size(int entry_size)
  {
         unsigned shift = PAGE_SHIFT +
-                        get_order(amd_iommu_last_bdf * entry_size);
+                        get_order(((int)amd_iommu_last_bdf + 1) * entry_size);
  
         return 1UL << shift;
  }
@@ -193,7 +199,7 @@ static inline unsigned long tbl_size(int entry_size)
   * This function set the exclusion range in the IOMMU. DMA accesses to the
   * exclusion range are passed through untranslated
   */
-static void __init iommu_set_exclusion_range(struct amd_iommu *iommu)
+static void iommu_set_exclusion_range(struct amd_iommu *iommu)
  {
         u64 start = iommu->exclusion_start & PAGE_MASK;
         u64 limit = (start + iommu->exclusion_length) & PAGE_MASK;
@@ -225,7 +231,7 @@ static void __init iommu_set_device_table(struct amd_iommu *iommu)
  }
  
  /* Generic functions to enable/disable certain features of the IOMMU. */
-static void __init iommu_feature_enable(struct amd_iommu *iommu, u8 bit)
+static void iommu_feature_enable(struct amd_iommu *iommu, u8 bit)
  {
         u32 ctrl;
  
@@ -244,7 +250,7 @@ static void __init iommu_feature_disable(struct amd_iommu *iommu, u8 bit)
  }
  
  /* Function to enable the hardware */
-static void __init iommu_enable(struct amd_iommu *iommu)
+static void iommu_enable(struct amd_iommu *iommu)
  {
         printk(KERN_INFO "AMD IOMMU: Enabling IOMMU at %s cap 0x%hx\n",
                dev_name(&iommu->dev->dev), iommu->cap_ptr);
@@ -252,11 +258,9 @@ static void __init iommu_enable(struct amd_iommu *iommu)
         iommu_feature_enable(iommu, CONTROL_IOMMU_EN);
  }
  
-/* Function to enable IOMMU event logging and event interrupts */
-static void __init iommu_enable_event_logging(struct amd_iommu *iommu)
+static void iommu_disable(struct amd_iommu *iommu)
  {
-       iommu_feature_enable(iommu, CONTROL_EVT_LOG_EN);
-       iommu_feature_enable(iommu, CONTROL_EVT_INT_EN);
+       iommu_feature_disable(iommu, CONTROL_IOMMU_EN);
  }
  
  /*
@@ -413,25 +417,36 @@ static u8 * __init alloc_command_buffer(struct amd_iommu *iommu)
  {
         u8 *cmd_buf = (u8 *)__get_free_pages(GFP_KERNEL | __GFP_ZERO,
                         get_order(CMD_BUFFER_SIZE));
-       u64 entry;
  
         if (cmd_buf == NULL)
                 return NULL;
  
         iommu->cmd_buf_size = CMD_BUFFER_SIZE;
  
-       entry = (u64)virt_to_phys(cmd_buf);
+       return cmd_buf;
+}
+
+/*
+ * This function writes the command buffer address to the hardware and
+ * enables it.
+ */
+static void iommu_enable_command_buffer(struct amd_iommu *iommu)
+{
+       u64 entry;
+
+       BUG_ON(iommu->cmd_buf == NULL);
+
+       entry = (u64)virt_to_phys(iommu->cmd_buf);
         entry |= MMIO_CMD_SIZE_512;
+
         memcpy_toio(iommu->mmio_base + MMIO_CMD_BUF_OFFSET,
-                       &entry, sizeof(entry));
+                   &entry, sizeof(entry));
  
         /* set head and tail to zero manually */
         writel(0x00, iommu->mmio_base + MMIO_CMD_HEAD_OFFSET);
         writel(0x00, iommu->mmio_base + MMIO_CMD_TAIL_OFFSET);
  
         iommu_feature_enable(iommu, CONTROL_CMDBUF_EN);
-
-       return cmd_buf;
  }
  
  static void __init free_command_buffer(struct amd_iommu *iommu)
@@ -443,20 +458,27 @@ static void __init free_command_buffer(struct amd_iommu *iommu)
  /* allocates the memory where the IOMMU will log its events to */
  static u8 * __init alloc_event_buffer(struct amd_iommu *iommu)
  {
-       u64 entry;
         iommu->evt_buf = (u8 *)__get_free_pages(GFP_KERNEL | __GFP_ZERO,
                                                 get_order(EVT_BUFFER_SIZE));
  
         if (iommu->evt_buf == NULL)
                 return NULL;
  
+       return iommu->evt_buf;
+}
+
+static void iommu_enable_event_buffer(struct amd_iommu *iommu)
+{
+       u64 entry;
+
+       BUG_ON(iommu->evt_buf == NULL);
+
         entry = (u64)virt_to_phys(iommu->evt_buf) | EVT_LEN_MASK;
+
         memcpy_toio(iommu->mmio_base + MMIO_EVT_BUF_OFFSET,
                     &entry, sizeof(entry));
  
-       iommu->evt_buf_size = EVT_BUFFER_SIZE;
-
-       return iommu->evt_buf;
+       iommu_feature_enable(iommu, CONTROL_EVT_LOG_EN);
  }
  
  static void __init free_event_buffer(struct amd_iommu *iommu)
@@ -596,32 +618,83 @@ static void __init init_iommu_from_acpi(struct amd_iommu *iommu,
         p += sizeof(struct ivhd_header);
         end += h->length;
  
+
         while (p < end) {
                 e = (struct ivhd_entry *)p;
                 switch (e->type) {
                 case IVHD_DEV_ALL:
+
+                       DUMP_printk("  DEV_ALL\t\t\t first devid: %02x:%02x.%x"
+                                   " last device %02x:%02x.%x flags: %02x\n",
+                                   PCI_BUS(iommu->first_device),
+                                   PCI_SLOT(iommu->first_device),
+                                   PCI_FUNC(iommu->first_device),
+                                   PCI_BUS(iommu->last_device),
+                                   PCI_SLOT(iommu->last_device),
+                                   PCI_FUNC(iommu->last_device),
+                                   e->flags);
+
                         for (dev_i = iommu->first_device;
                                         dev_i <= iommu->last_device; ++dev_i)
                                 set_dev_entry_from_acpi(iommu, dev_i,
                                                         e->flags, 0);
                         break;
                 case IVHD_DEV_SELECT:
+
+                       DUMP_printk("  DEV_SELECT\t\t\t devid: %02x:%02x.%x "
+                                   "flags: %02x\n",
+                                   PCI_BUS(e->devid),
+                                   PCI_SLOT(e->devid),
+                                   PCI_FUNC(e->devid),
+                                   e->flags);
+
                         devid = e->devid;
                         set_dev_entry_from_acpi(iommu, devid, e->flags, 0);
                         break;
                 case IVHD_DEV_SELECT_RANGE_START:
+
+                       DUMP_printk("  DEV_SELECT_RANGE_START\t "
+                                   "devid: %02x:%02x.%x flags: %02x\n",
+                                   PCI_BUS(e->devid),
+                                   PCI_SLOT(e->devid),
+                                   PCI_FUNC(e->devid),
+                                   e->flags);
+
                         devid_start = e->devid;
                         flags = e->flags;
                         ext_flags = 0;
                         alias = false;
                         break;
                 case IVHD_DEV_ALIAS:
+
+                       DUMP_printk("  DEV_ALIAS\t\t\t devid: %02x:%02x.%x "
+                                   "flags: %02x devid_to: %02x:%02x.%x\n",
+                                   PCI_BUS(e->devid),
+                                   PCI_SLOT(e->devid),
+                                   PCI_FUNC(e->devid),
+                                   e->flags,
+                                   PCI_BUS(e->ext >> 8),
+                                   PCI_SLOT(e->ext >> 8),
+                                   PCI_FUNC(e->ext >> 8));
+
                         devid = e->devid;
                         devid_to = e->ext >> 8;
-                       set_dev_entry_from_acpi(iommu, devid, e->flags, 0);
+                       set_dev_entry_from_acpi(iommu, devid_to, e->flags, 0);
                         amd_iommu_alias_table[devid] = devid_to;
                         break;
                 case IVHD_DEV_ALIAS_RANGE:
+
+                       DUMP_printk("  DEV_ALIAS_RANGE\t\t "
+                                   "devid: %02x:%02x.%x flags: %02x "
+                                   "devid_to: %02x:%02x.%x\n",
+                                   PCI_BUS(e->devid),
+                                   PCI_SLOT(e->devid),
+                                   PCI_FUNC(e->devid),
+                                   e->flags,
+                                   PCI_BUS(e->ext >> 8),
+                                   PCI_SLOT(e->ext >> 8),
+                                   PCI_FUNC(e->ext >> 8));
+
                         devid_start = e->devid;
                         flags = e->flags;
                         devid_to = e->ext >> 8;
@@ -629,17 +702,39 @@ static void __init init_iommu_from_acpi(struct amd_iommu *iommu,
                         alias = true;
                         break;
                 case IVHD_DEV_EXT_SELECT:
+
+                       DUMP_printk("  DEV_EXT_SELECT\t\t devid: %02x:%02x.%x "
+                                   "flags: %02x ext: %08x\n",
+                                   PCI_BUS(e->devid),
+                                   PCI_SLOT(e->devid),
+                                   PCI_FUNC(e->devid),
+                                   e->flags, e->ext);
+
                         devid = e->devid;
                         set_dev_entry_from_acpi(iommu, devid, e->flags,
                                                 e->ext);
                         break;
                 case IVHD_DEV_EXT_SELECT_RANGE:
+
+                       DUMP_printk("  DEV_EXT_SELECT_RANGE\t devid: "
+                                   "%02x:%02x.%x flags: %02x ext: %08x\n",
+                                   PCI_BUS(e->devid),
+                                   PCI_SLOT(e->devid),
+                                   PCI_FUNC(e->devid),
+                                   e->flags, e->ext);
+
                         devid_start = e->devid;
                         flags = e->flags;
                         ext_flags = e->ext;
                         alias = false;
                         break;
                 case IVHD_DEV_RANGE_END:
+
+                       DUMP_printk("  DEV_RANGE_END\t\t devid: %02x:%02x.%x\n",
+                                   PCI_BUS(e->devid),
+                                   PCI_SLOT(e->devid),
+                                   PCI_FUNC(e->devid));
+
                         devid = e->devid;
                         for (dev_i = devid_start; dev_i <= devid; ++dev_i) {
                                 if (alias)
@@ -679,7 +774,7 @@ static void __init free_iommu_all(void)
  {
         struct amd_iommu *iommu, *next;
  
-       list_for_each_entry_safe(iommu, next, &amd_iommu_list, list) {
+       for_each_iommu_safe(iommu, next) {
                 list_del(&iommu->list);
                 free_iommu_one(iommu);
                 kfree(iommu);
@@ -710,7 +805,6 @@ static int __init init_iommu_one(struct amd_iommu *iommu, struct ivhd_header *h)
         if (!iommu->mmio_base)
                 return -ENOMEM;
  
-       iommu_set_device_table(iommu);
         iommu->cmd_buf = alloc_command_buffer(iommu);
         if (!iommu->cmd_buf)
                 return -ENOMEM;
@@ -746,6 +840,15 @@ static int __init init_iommu_all(struct acpi_table_header *table)
                 h = (struct ivhd_header *)p;
                 switch (*p) {
                 case ACPI_IVHD_TYPE:
+
+                       DUMP_printk("IOMMU: device: %02x:%02x.%01x cap: %04x "
+                                   "seg: %d flags: %01x info %04x\n",
+                                   PCI_BUS(h->devid), PCI_SLOT(h->devid),
+                                   PCI_FUNC(h->devid), h->cap_ptr,
+                                   h->pci_seg, h->flags, h->info);
+                       DUMP_printk("       mmio-addr: %016llx\n",
+                                   h->mmio_phys);
+
                         iommu = kzalloc(sizeof(struct amd_iommu), GFP_KERNEL);
                         if (iommu == NULL)
                                 return -ENOMEM;
@@ -773,56 +876,9 @@ static int __init init_iommu_all(struct acpi_table_header *table)
   *
   ****************************************************************************/
  
-static int __init iommu_setup_msix(struct amd_iommu *iommu)
-{
-       struct amd_iommu *curr;
-       struct msix_entry entries[32]; /* only 32 supported by AMD IOMMU */
-       int nvec = 0, i;
-
-       list_for_each_entry(curr, &amd_iommu_list, list) {
-               if (curr->dev == iommu->dev) {
-                       entries[nvec].entry = curr->evt_msi_num;
-                       entries[nvec].vector = 0;
-                       curr->int_enabled = true;
-                       nvec++;
-               }
-       }
-
-       if (pci_enable_msix(iommu->dev, entries, nvec)) {
-               pci_disable_msix(iommu->dev);
-               return 1;
-       }
-
-       for (i = 0; i < nvec; ++i) {
-               int r = request_irq(entries->vector, amd_iommu_int_handler,
-                                   IRQF_SAMPLE_RANDOM,
-                                   "AMD IOMMU",
-                                   NULL);
-               if (r)
-                       goto out_free;
-       }
-
-       return 0;
-
-out_free:
-       for (i -= 1; i >= 0; --i)
-               free_irq(entries->vector, NULL);
-
-       pci_disable_msix(iommu->dev);
-
-       return 1;
-}
-
  static int __init iommu_setup_msi(struct amd_iommu *iommu)
  {
         int r;
-       struct amd_iommu *curr;
-
-       list_for_each_entry(curr, &amd_iommu_list, list) {
-               if (curr->dev == iommu->dev)
-                       curr->int_enabled = true;
-       }
-
  
         if (pci_enable_msi(iommu->dev))
                 return 1;
@@ -837,17 +893,18 @@ static int __init iommu_setup_msi(struct amd_iommu *iommu)
                 return 1;
         }
  
+       iommu->int_enabled = true;
+       iommu_feature_enable(iommu, CONTROL_EVT_INT_EN);
+
         return 0;
  }
  
-static int __init iommu_init_msi(struct amd_iommu *iommu)
+static int iommu_init_msi(struct amd_iommu *iommu)
  {
         if (iommu->int_enabled)
                 return 0;
  
-       if (pci_find_capability(iommu->dev, PCI_CAP_ID_MSIX))
-               return iommu_setup_msix(iommu);
-       else if (pci_find_capability(iommu->dev, PCI_CAP_ID_MSI))
+       if (pci_find_capability(iommu->dev, PCI_CAP_ID_MSI))
                 return iommu_setup_msi(iommu);
  
         return 1;
@@ -899,6 +956,7 @@ static int __init init_exclusion_range(struct ivmd_header *m)
  static int __init init_unity_map_range(struct ivmd_header *m)
  {
         struct unity_map_entry *e = 0;
+       char *s;
  
         e = kzalloc(sizeof(*e), GFP_KERNEL);
         if (e == NULL)
@@ -906,14 +964,19 @@ static int __init init_unity_map_range(struct ivmd_header *m)
  
         switch (m->type) {
         default:
+               kfree(e);
+               return 0;
         case ACPI_IVMD_TYPE:
+               s = "IVMD_TYPEi\t\t\t";
                 e->devid_start = e->devid_end = m->devid;
                 break;
         case ACPI_IVMD_TYPE_ALL:
+               s = "IVMD_TYPE_ALL\t\t";
                 e->devid_start = 0;
                 e->devid_end = amd_iommu_last_bdf;
                 break;
         case ACPI_IVMD_TYPE_RANGE:
+               s = "IVMD_TYPE_RANGE\t\t";
                 e->devid_start = m->devid;
                 e->devid_end = m->aux;
                 break;
@@ -922,6 +985,13 @@ static int __init init_unity_map_range(struct ivmd_header *m)
         e->address_end = e->address_start + PAGE_ALIGN(m->range_length);
         e->prot = m->flags >> 1;
  
+       DUMP_printk("%s devid_start: %02x:%02x.%x devid_end: %02x:%02x.%x"
+                   " range_start: %016llx range_end: %016llx flags: %x\n", s,
+                   PCI_BUS(e->devid_start), PCI_SLOT(e->devid_start),
+                   PCI_FUNC(e->devid_start), PCI_BUS(e->devid_end),
+                   PCI_SLOT(e->devid_end), PCI_FUNC(e->devid_end),
+                   e->address_start, e->address_end, m->flags);
+
         list_add_tail(&e->list, &amd_iommu_unity_map);
  
         return 0;
@@ -967,18 +1037,28 @@ static void init_device_table(void)
   * This function finally enables all IOMMUs found in the system after
   * they have been initialized
   */
-static void __init enable_iommus(void)
+static void enable_iommus(void)
  {
         struct amd_iommu *iommu;
  
-       list_for_each_entry(iommu, &amd_iommu_list, list) {
+       for_each_iommu(iommu) {
+               iommu_set_device_table(iommu);
+               iommu_enable_command_buffer(iommu);
+               iommu_enable_event_buffer(iommu);
                 iommu_set_exclusion_range(iommu);
                 iommu_init_msi(iommu);
-               iommu_enable_event_logging(iommu);
                 iommu_enable(iommu);
         }
  }
  
+static void disable_iommus(void)
+{
+       struct amd_iommu *iommu;
+
+       for_each_iommu(iommu)
+               iommu_disable(iommu);
+}
+
  /*
   * Suspend/Resume support
   * disable suspend until real resume implemented
@@ -986,12 +1066,31 @@ static void __init enable_iommus(void)
  
  static int amd_iommu_resume(struct sys_device *dev)
  {
+       /*
+        * Disable IOMMUs before reprogramming the hardware registers.
+        * IOMMU is still enabled from the resume kernel.
+        */
+       disable_iommus();
+
+       /* re-load the hardware */
+       enable_iommus();
+
+       /*
+        * we have to flush after the IOMMUs are enabled because a
+        * disabled IOMMU will never execute the commands we send
+        */
+       amd_iommu_flush_all_domains();
+       amd_iommu_flush_all_devices();
+
         return 0;
  }
  
  static int amd_iommu_suspend(struct sys_device *dev, pm_message_t state)
  {
-       return -EINVAL;
+       /* disable IOMMUs to go out of the way for BIOS */
+       disable_iommus();
+
+       return 0;
  }
  
  static struct sysdev_class amd_iommu_sysdev_class = {
@@ -1137,9 +1236,6 @@ int __init amd_iommu_init(void)
  
         enable_iommus();
  
-       printk(KERN_INFO "AMD IOMMU: aperture size is %d MB\n",
-                       (1 << (amd_iommu_aperture_order-20)));
-
         printk(KERN_INFO "AMD IOMMU: device isolation ");
         if (amd_iommu_isolate)
                 printk("enabled\n");
@@ -1211,6 +1307,13 @@ void __init amd_iommu_detect(void)
   *
   ****************************************************************************/
  
+static int __init parse_amd_iommu_dump(char *str)
+{
+       amd_iommu_dump = true;
+
+       return 1;
+}
+
  static int __init parse_amd_iommu_options(char *str)
  {
         for (; *str; ++str) {
@@ -1225,15 +1328,5 @@ static int __init parse_amd_iommu_options(char *str)
         return 1;
  }
  
-static int __init parse_amd_iommu_size_options(char *str)
-{
-       unsigned order = PAGE_SHIFT + get_order(memparse(str, &str));
-
-       if ((order > 24) && (order < 31))
-               amd_iommu_aperture_order = order;
-
-       return 1;
-}
-
+__setup("amd_iommu_dump", parse_amd_iommu_dump);
  __setup("amd_iommu=", parse_amd_iommu_options);
-__setup("amd_iommu_size=", parse_amd_iommu_size_options);
diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c

index f2870920f246a9f1d7e2075c37b65b05de27dbf1..076d3881f3da11d5e18f1bcabc079e1192f935c2 100644 (file)
--- a/arch/x86/kernel/apic/apic.c
+++ b/arch/x86/kernel/apic/apic.c
@@ -14,6 +14,7 @@
   *     Mikael Pettersson       :       PM converted to driver model.
   */
  
+#include <linux/perf_counter.h>
  #include <linux/kernel_stat.h>
  #include <linux/mc146818rtc.h>
  #include <linux/acpi_pmtmr.h>
@@ -34,6 +35,7 @@
  #include <linux/smp.h>
  #include <linux/mm.h>
  
+#include <asm/perf_counter.h>
  #include <asm/pgalloc.h>
  #include <asm/atomic.h>
  #include <asm/mpspec.h>
@@ -98,6 +100,29 @@ early_param("lapic", parse_lapic);
  /* Local APIC was disabled by the BIOS and enabled by the kernel */
  static int enabled_via_apicbase;
  
+/*
+ * Handle interrupt mode configuration register (IMCR).
+ * This register controls whether the interrupt signals
+ * that reach the BSP come from the master PIC or from the
+ * local APIC. Before entering Symmetric I/O Mode, either
+ * the BIOS or the operating system must switch out of
+ * PIC Mode by changing the IMCR.
+ */
+static inline void imcr_pic_to_apic(void)
+{
+       /* select IMCR register */
+       outb(0x70, 0x22);
+       /* NMI and 8259 INTR go through APIC */
+       outb(0x01, 0x23);
+}
+
+static inline void imcr_apic_to_pic(void)
+{
+       /* select IMCR register */
+       outb(0x70, 0x22);
+       /* NMI and 8259 INTR go directly to BSP */
+       outb(0x00, 0x23);
+}
  #endif
  
  #ifdef CONFIG_X86_64
@@ -111,13 +136,19 @@ static __init int setup_apicpmtimer(char *s)
  __setup("apicpmtimer", setup_apicpmtimer);
  #endif
  
+int x2apic_mode;
  #ifdef CONFIG_X86_X2APIC
-int x2apic;
  /* x2apic enabled before OS handover */
  static int x2apic_preenabled;
  static int disable_x2apic;
  static __init int setup_nox2apic(char *str)
  {
+       if (x2apic_enabled()) {
+               pr_warning("Bios already enabled x2apic, "
+                          "can't enforce nox2apic");
+               return 0;
+       }
+
         disable_x2apic = 1;
         setup_clear_cpu_cap(X86_FEATURE_X2APIC);
         return 0;
@@ -209,6 +240,31 @@ static int modern_apic(void)
         return lapic_get_version() >= 0x14;
  }
  
+/*
+ * bare function to substitute write operation
+ * and it's _that_ fast :)
+ */
+static void native_apic_write_dummy(u32 reg, u32 v)
+{
+       WARN_ON_ONCE((cpu_has_apic || !disable_apic));
+}
+
+static u32 native_apic_read_dummy(u32 reg)
+{
+       WARN_ON_ONCE((cpu_has_apic && !disable_apic));
+       return 0;
+}
+
+/*
+ * right after this call apic->write/read doesn't do anything
+ * note that there is no restore operation it works one way
+ */
+void apic_disable(void)
+{
+       apic->read = native_apic_read_dummy;
+       apic->write = native_apic_write_dummy;
+}
+
  void native_apic_wait_icr_idle(void)
  {
         while (apic_read(APIC_ICR) & APIC_ICR_BUSY)
@@ -348,7 +404,7 @@ static void __setup_APIC_LVTT(unsigned int clocks, int oneshot, int irqen)
  
  static void setup_APIC_eilvt(u8 lvt_off, u8 vector, u8 msg_type, u8 mask)
  {
-       unsigned long reg = (lvt_off << 4) + APIC_EILVT0;
+       unsigned long reg = (lvt_off << 4) + APIC_EILVTn(0);
         unsigned int  v   = (mask << 16) | (msg_type << 8) | vector;
  
         apic_write(reg, v);
@@ -815,7 +871,7 @@ void clear_local_APIC(void)
         u32 v;
  
         /* APIC hasn't been mapped yet */
-       if (!x2apic && !apic_phys)
+       if (!x2apic_mode && !apic_phys)
                 return;
  
         maxlvt = lapic_get_maxlvt();
@@ -1133,6 +1189,7 @@ void __cpuinit setup_local_APIC(void)
                 apic_write(APIC_ESR, 0);
         }
  #endif
+       perf_counters_lapic_init();
  
         preempt_disable();
  
@@ -1287,7 +1344,7 @@ void check_x2apic(void)
  {
         if (x2apic_enabled()) {
                 pr_info("x2apic enabled by BIOS, switching to x2apic ops\n");
-               x2apic_preenabled = x2apic = 1;
+               x2apic_preenabled = x2apic_mode = 1;
         }
  }
  
@@ -1295,7 +1352,7 @@ void enable_x2apic(void)
  {
         int msr, msr2;
  
-       if (!x2apic)
+       if (!x2apic_mode)
                 return;
  
         rdmsr(MSR_IA32_APICBASE, msr, msr2);
@@ -1304,6 +1361,7 @@ void enable_x2apic(void)
                 wrmsr(MSR_IA32_APICBASE, msr | X2APIC_ENABLE, 0);
         }
  }
+#endif /* CONFIG_X86_X2APIC */
  
  void __init enable_IR_x2apic(void)
  {
@@ -1312,32 +1370,21 @@ void __init enable_IR_x2apic(void)
         unsigned long flags;
         struct IO_APIC_route_entry **ioapic_entries = NULL;
  
-       if (!cpu_has_x2apic)
-               return;
-
-       if (!x2apic_preenabled && disable_x2apic) {
-               pr_info("Skipped enabling x2apic and Interrupt-remapping "
-                       "because of nox2apic\n");
-               return;
+       ret = dmar_table_init();
+       if (ret) {
+               pr_debug("dmar_table_init() failed with %d:\n", ret);
+               goto ir_failed;
         }
  
-       if (x2apic_preenabled && disable_x2apic)
-               panic("Bios already enabled x2apic, can't enforce nox2apic");
-
-       if (!x2apic_preenabled && skip_ioapic_setup) {
-               pr_info("Skipped enabling x2apic and Interrupt-remapping "
-                       "because of skipping io-apic setup\n");
-               return;
+       if (!intr_remapping_supported()) {
+               pr_debug("intr-remapping not supported\n");
+               goto ir_failed;
         }
  
-       ret = dmar_table_init();
-       if (ret) {
-               pr_info("dmar_table_init() failed with %d:\n", ret);
  
-               if (x2apic_preenabled)
-                       panic("x2apic enabled by bios. But IR enabling failed");
-               else
-                       pr_info("Not enabling x2apic,Intr-remapping\n");
+       if (!x2apic_preenabled && skip_ioapic_setup) {
+               pr_info("Skipped enabling intr-remap because of skipping "
+                       "io-apic setup\n");
                 return;
         }
  
@@ -1357,19 +1404,16 @@ void __init enable_IR_x2apic(void)
         mask_IO_APIC_setup(ioapic_entries);
         mask_8259A();
  
-       ret = enable_intr_remapping(EIM_32BIT_APIC_ID);
-
-       if (ret && x2apic_preenabled) {
-               local_irq_restore(flags);
-               panic("x2apic enabled by bios. But IR enabling failed");
-       }
-
+       ret = enable_intr_remapping(x2apic_supported());
         if (ret)
                 goto end_restore;
  
-       if (!x2apic) {
-               x2apic = 1;
+       pr_info("Enabled Interrupt-remapping\n");
+
+       if (x2apic_supported() && !x2apic_mode) {
+               x2apic_mode = 1;
                 enable_x2apic();
+               pr_info("Enabled x2apic\n");
         }
  
  end_restore:
@@ -1378,37 +1422,34 @@ end_restore:
                  * IR enabling failed
                  */
                 restore_IO_APIC_setup(ioapic_entries);
-       else
-               reinit_intr_remapped_IO_APIC(x2apic_preenabled, ioapic_entries);
  
         unmask_8259A();
         local_irq_restore(flags);
  
  end:
-       if (!ret) {
-               if (!x2apic_preenabled)
-                       pr_info("Enabled x2apic and interrupt-remapping\n");
-               else
-                       pr_info("Enabled Interrupt-remapping\n");
-       } else
-               pr_err("Failed to enable Interrupt-remapping and x2apic\n");
         if (ioapic_entries)
                 free_ioapic_entries(ioapic_entries);
+
+       if (!ret)
+               return;
+
+ir_failed:
+       if (x2apic_preenabled)
+               panic("x2apic enabled by bios. But IR enabling failed");
+       else if (cpu_has_x2apic)
+               pr_info("Not enabling x2apic,Intr-remapping\n");
  #else
         if (!cpu_has_x2apic)
                 return;
  
         if (x2apic_preenabled)
                 panic("x2apic enabled prior OS handover,"
-                     " enable CONFIG_INTR_REMAP");
-
-       pr_info("Enable CONFIG_INTR_REMAP for enabling intr-remapping "
-               " and x2apic\n");
+                     " enable CONFIG_X86_X2APIC, CONFIG_INTR_REMAP");
  #endif
  
         return;
  }
-#endif /* CONFIG_X86_X2APIC */
+
  
  #ifdef CONFIG_X86_64
  /*
@@ -1425,7 +1466,6 @@ static int __init detect_init_APIC(void)
         }
  
         mp_lapic_addr = APIC_DEFAULT_PHYS_BASE;
-       boot_cpu_physical_apicid = 0;
         return 0;
  }
  #else
@@ -1539,32 +1579,49 @@ void __init early_init_lapic_mapping(void)
   */
  void __init init_apic_mappings(void)
  {
-       if (x2apic) {
+       unsigned int new_apicid;
+
+       if (x2apic_mode) {
                 boot_cpu_physical_apicid = read_apic_id();
                 return;
         }
  
-       /*
-        * If no local APIC can be found then set up a fake all
-        * zeroes page to simulate the local APIC and another
-        * one for the IO-APIC.
-        */
+       /* If no local APIC can be found return early */
         if (!smp_found_config && detect_init_APIC()) {
-               apic_phys = (unsigned long) alloc_bootmem_pages(PAGE_SIZE);
-               apic_phys = __pa(apic_phys);
-       } else
+               /* lets NOP'ify apic operations */
+               pr_info("APIC: disable apic facility\n");
+               apic_disable();
+       } else {
                 apic_phys = mp_lapic_addr;
  
-       set_fixmap_nocache(FIX_APIC_BASE, apic_phys);
-       apic_printk(APIC_VERBOSE, "mapped APIC to %08lx (%08lx)\n",
-                               APIC_BASE, apic_phys);
+               /*
+                * acpi lapic path already maps that address in
+                * acpi_register_lapic_address()
+                */
+               if (!acpi_lapic)
+                       set_fixmap_nocache(FIX_APIC_BASE, apic_phys);
+
+               apic_printk(APIC_VERBOSE, "mapped APIC to %08lx (%08lx)\n",
+                                       APIC_BASE, apic_phys);
+       }
  
         /*
          * Fetch the APIC ID of the BSP in case we have a
          * default configuration (or the MP table is broken).
          */
-       if (boot_cpu_physical_apicid == -1U)
-               boot_cpu_physical_apicid = read_apic_id();
+       new_apicid = read_apic_id();
+       if (boot_cpu_physical_apicid != new_apicid) {
+               boot_cpu_physical_apicid = new_apicid;
+               /*
+                * yeah -- we lie about apic_version
+                * in case if apic was disabled via boot option
+                * but it's not a problem for SMP compiled kernel
+                * since smp_sanity_check is prepared for such a case
+                * and disable smp mode
+                */
+               apic_version[new_apicid] =
+                        GET_APIC_VERSION(apic_read(APIC_LVR));
+       }
  }
  
  /*
@@ -1733,8 +1790,7 @@ void __init connect_bsp_APIC(void)
                  */
                 apic_printk(APIC_VERBOSE, "leaving PIC mode, "
                                 "enabling APIC mode.\n");
-               outb(0x70, 0x22);
-               outb(0x01, 0x23);
+               imcr_pic_to_apic();
         }
  #endif
         if (apic->enable_apic_mode)
@@ -1762,8 +1818,7 @@ void disconnect_bsp_APIC(int virt_wire_setup)
                  */
                 apic_printk(APIC_VERBOSE, "disabling APIC mode, "
                                 "entering PIC mode.\n");
-               outb(0x70, 0x22);
-               outb(0x00, 0x23);
+               imcr_apic_to_pic();
                 return;
         }
  #endif
@@ -1969,10 +2024,10 @@ static int lapic_suspend(struct sys_device *dev, pm_message_t state)
  
         local_irq_save(flags);
         disable_local_APIC();
-#ifdef CONFIG_INTR_REMAP
+
         if (intr_remapping_enabled)
                 disable_intr_remapping();
-#endif
+
         local_irq_restore(flags);
         return 0;
  }
@@ -1982,42 +2037,34 @@ static int lapic_resume(struct sys_device *dev)
         unsigned int l, h;
         unsigned long flags;
         int maxlvt;
-
-#ifdef CONFIG_INTR_REMAP
-       int ret;
+       int ret = 0;
         struct IO_APIC_route_entry **ioapic_entries = NULL;
  
         if (!apic_pm_state.active)
                 return 0;
  
         local_irq_save(flags);
-       if (x2apic) {
+       if (intr_remapping_enabled) {
                 ioapic_entries = alloc_ioapic_entries();
                 if (!ioapic_entries) {
                         WARN(1, "Alloc ioapic_entries in lapic resume failed.");
-                       return -ENOMEM;
+                       ret = -ENOMEM;
+                       goto restore;
                 }
  
                 ret = save_IO_APIC_setup(ioapic_entries);
                 if (ret) {
                         WARN(1, "Saving IO-APIC state failed: %d\n", ret);
                         free_ioapic_entries(ioapic_entries);
-                       return ret;
+                       goto restore;
                 }
  
                 mask_IO_APIC_setup(ioapic_entries);
                 mask_8259A();
-               enable_x2apic();
         }
-#else
-       if (!apic_pm_state.active)
-               return 0;
  
-       local_irq_save(flags);
-       if (x2apic)
+       if (x2apic_mode)
                 enable_x2apic();
-#endif
-
         else {
                 /*
                  * Make sure the APICBASE points to the right address
@@ -2055,21 +2102,16 @@ static int lapic_resume(struct sys_device *dev)
         apic_write(APIC_ESR, 0);
         apic_read(APIC_ESR);
  
-#ifdef CONFIG_INTR_REMAP
-       if (intr_remapping_enabled)
-               reenable_intr_remapping(EIM_32BIT_APIC_ID);
-
-       if (x2apic) {
+       if (intr_remapping_enabled) {
+               reenable_intr_remapping(x2apic_mode);
                 unmask_8259A();
                 restore_IO_APIC_setup(ioapic_entries);
                 free_ioapic_entries(ioapic_entries);
         }
-#endif
-
+restore:
         local_irq_restore(flags);
  
-
-       return 0;
+       return ret;
  }
  
  /*
@@ -2117,31 +2159,14 @@ static void apic_pm_activate(void) { }
  #endif /* CONFIG_PM */
  
  #ifdef CONFIG_X86_64
-/*
- * apic_is_clustered_box() -- Check if we can expect good TSC
- *
- * Thus far, the major user of this is IBM's Summit2 series:
- *
- * Clustered boxes may have unsynced TSC problems if they are
- * multi-chassis. Use available data to take a good guess.
- * If in doubt, go HPET.
- */
-__cpuinit int apic_is_clustered_box(void)
+
+static int __cpuinit apic_cluster_num(void)
  {
         int i, clusters, zeros;
         unsigned id;
         u16 *bios_cpu_apicid;
         DECLARE_BITMAP(clustermap, NUM_APIC_CLUSTERS);
  
-       /*
-        * there is not this kind of box with AMD CPU yet.
-        * Some AMD box with quadcore cpu and 8 sockets apicid
-        * will be [4, 0x23] or [8, 0x27] could be thought to
-        * vsmp box still need checking...
-        */
-       if ((boot_cpu_data.x86_vendor == X86_VENDOR_AMD) && !is_vsmp_box())
-               return 0;
-
         bios_cpu_apicid = early_per_cpu_ptr(x86_bios_cpu_apicid);
         bitmap_zero(clustermap, NUM_APIC_CLUSTERS);
  
@@ -2177,18 +2202,67 @@ __cpuinit int apic_is_clustered_box(void)
                         ++zeros;
         }
  
-       /* ScaleMP vSMPowered boxes have one cluster per board and TSCs are
-        * not guaranteed to be synced between boards
-        */
-       if (is_vsmp_box() && clusters > 1)
+       return clusters;
+}
+
+static int __cpuinitdata multi_checked;
+static int __cpuinitdata multi;
+
+static int __cpuinit set_multi(const struct dmi_system_id *d)
+{
+       if (multi)
+               return 0;
+       pr_info("APIC: %s detected, Multi Chassis\n", d->ident);
+       multi = 1;
+       return 0;
+}
+
+static const __cpuinitconst struct dmi_system_id multi_dmi_table[] = {
+       {
+               .callback = set_multi,
+               .ident = "IBM System Summit2",
+               .matches = {
+                       DMI_MATCH(DMI_SYS_VENDOR, "IBM"),
+                       DMI_MATCH(DMI_PRODUCT_NAME, "Summit2"),
+               },
+       },
+       {}
+};
+
+static void __cpuinit dmi_check_multi(void)
+{
+       if (multi_checked)
+               return;
+
+       dmi_check_system(multi_dmi_table);
+       multi_checked = 1;
+}
+
+/*
+ * apic_is_clustered_box() -- Check if we can expect good TSC
+ *
+ * Thus far, the major user of this is IBM's Summit2 series:
+ * Clustered boxes may have unsynced TSC problems if they are
+ * multi-chassis.
+ * Use DMI to check them
+ */
+__cpuinit int apic_is_clustered_box(void)
+{
+       dmi_check_multi();
+       if (multi)
                 return 1;
  
+       if (!is_vsmp_box())
+               return 0;
+
         /*
-        * If clusters > 2, then should be multi-chassis.
-        * May have to revisit this when multi-core + hyperthreaded CPUs come
-        * out, but AFAIK this will work even for them.
+        * ScaleMP vSMPowered boxes have one cluster per board and TSCs are
+        * not guaranteed to be synced between boards
          */
-       return (clusters > 2);
+       if (apic_cluster_num() > 1)
+               return 1;
+
+       return 0;
  }
  #endif
  
diff --git a/arch/x86/kernel/apic/apic_flat_64.c b/arch/x86/kernel/apic/apic_flat_64.c

index 306e5e88fb6f4abc9a53a3ce8638ac904a67067e..d0c99abc26c32a8f0df48d61485b1f36acb9b4b2 100644 (file)
--- a/arch/x86/kernel/apic/apic_flat_64.c
+++ b/arch/x86/kernel/apic/apic_flat_64.c
@@ -161,7 +161,7 @@ static int flat_apic_id_registered(void)
  
  static int flat_phys_pkg_id(int initial_apic_id, int index_msb)
  {
-       return hard_smp_processor_id() >> index_msb;
+       return initial_apic_id >> index_msb;
  }
  
  struct apic apic_flat =  {
@@ -235,7 +235,7 @@ static int physflat_acpi_madt_oem_check(char *oem_id, char *oem_table_id)
          * regardless of how many processors are present (x86_64 ES7000
          * is an example).
          */
-       if (acpi_gbl_FADT.header.revision > FADT2_REVISION_ID &&
+       if (acpi_gbl_FADT.header.revision >= FADT2_REVISION_ID &&
                 (acpi_gbl_FADT.flags & ACPI_FADT_APIC_PHYSICAL)) {
                 printk(KERN_DEBUG "system APIC only can use physical flat");
                 return 1;
diff --git a/arch/x86/kernel/apic/es7000_32.c b/arch/x86/kernel/apic/es7000_32.c

index 302947775575ebb69a670a4ef10f0184b2b82ac9..69328ac8de9c86e106d30af1337a80ab0d4488b1 100644 (file)
--- a/arch/x86/kernel/apic/es7000_32.c
+++ b/arch/x86/kernel/apic/es7000_32.c
@@ -145,7 +145,7 @@ es7000_rename_gsi(int ioapic, int gsi)
         return gsi;
  }
  
-static int wakeup_secondary_cpu_via_mip(int cpu, unsigned long eip)
+static int __cpuinit wakeup_secondary_cpu_via_mip(int cpu, unsigned long eip)
  {
         unsigned long vect = 0, psaival = 0;
  
diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c

index 30da617d18e4735de234904fdc005cf527930c89..1946fac42ab3cb8a406d46509a983163cd347008 100644 (file)
--- a/arch/x86/kernel/apic/io_apic.c
+++ b/arch/x86/kernel/apic/io_apic.c
@@ -59,6 +59,7 @@
  #include <asm/setup.h>
  #include <asm/irq_remapping.h>
  #include <asm/hpet.h>
+#include <asm/hw_irq.h>
  #include <asm/uv/uv_hub.h>
  #include <asm/uv/uv_irq.h>
  
@@ -129,12 +130,9 @@ struct irq_pin_list {
         struct irq_pin_list *next;
  };
  
-static struct irq_pin_list *get_one_free_irq_2_pin(int cpu)
+static struct irq_pin_list *get_one_free_irq_2_pin(int node)
  {
         struct irq_pin_list *pin;
-       int node;
-
-       node = cpu_to_node(cpu);
  
         pin = kzalloc_node(sizeof(*pin), GFP_ATOMIC, node);
  
@@ -148,9 +146,6 @@ struct irq_cfg {
         unsigned move_cleanup_count;
         u8 vector;
         u8 move_in_progress : 1;
-#ifdef CONFIG_NUMA_MIGRATE_IRQ_DESC
-       u8 move_desc_pending : 1;
-#endif
  };
  
  /* irq_cfg is indexed by the sum of all RTEs in all I/O APICs. */
@@ -212,12 +207,9 @@ static struct irq_cfg *irq_cfg(unsigned int irq)
         return cfg;
  }
  
-static struct irq_cfg *get_one_free_irq_cfg(int cpu)
+static struct irq_cfg *get_one_free_irq_cfg(int node)
  {
         struct irq_cfg *cfg;
-       int node;
-
-       node = cpu_to_node(cpu);
  
         cfg = kzalloc_node(sizeof(*cfg), GFP_ATOMIC, node);
         if (cfg) {
@@ -238,13 +230,13 @@ static struct irq_cfg *get_one_free_irq_cfg(int cpu)
         return cfg;
  }
  
-int arch_init_chip_data(struct irq_desc *desc, int cpu)
+int arch_init_chip_data(struct irq_desc *desc, int node)
  {
         struct irq_cfg *cfg;
  
         cfg = desc->chip_data;
         if (!cfg) {
-               desc->chip_data = get_one_free_irq_cfg(cpu);
+               desc->chip_data = get_one_free_irq_cfg(node);
                 if (!desc->chip_data) {
                         printk(KERN_ERR "can not alloc irq_cfg\n");
                         BUG_ON(1);
@@ -254,10 +246,9 @@ int arch_init_chip_data(struct irq_desc *desc, int cpu)
         return 0;
  }
  
-#ifdef CONFIG_NUMA_MIGRATE_IRQ_DESC
-
+/* for move_irq_desc */
  static void
-init_copy_irq_2_pin(struct irq_cfg *old_cfg, struct irq_cfg *cfg, int cpu)
+init_copy_irq_2_pin(struct irq_cfg *old_cfg, struct irq_cfg *cfg, int node)
  {
         struct irq_pin_list *old_entry, *head, *tail, *entry;
  
@@ -266,7 +257,7 @@ init_copy_irq_2_pin(struct irq_cfg *old_cfg, struct irq_cfg *cfg, int cpu)
         if (!old_entry)
                 return;
  
-       entry = get_one_free_irq_2_pin(cpu);
+       entry = get_one_free_irq_2_pin(node);
         if (!entry)
                 return;
  
@@ -276,7 +267,7 @@ init_copy_irq_2_pin(struct irq_cfg *old_cfg, struct irq_cfg *cfg, int cpu)
         tail            = entry;
         old_entry       = old_entry->next;
         while (old_entry) {
-               entry = get_one_free_irq_2_pin(cpu);
+               entry = get_one_free_irq_2_pin(node);
                 if (!entry) {
                         entry = head;
                         while (entry) {
@@ -316,12 +307,12 @@ static void free_irq_2_pin(struct irq_cfg *old_cfg, struct irq_cfg *cfg)
  }
  
  void arch_init_copy_chip_data(struct irq_desc *old_desc,
-                                struct irq_desc *desc, int cpu)
+                                struct irq_desc *desc, int node)
  {
         struct irq_cfg *cfg;
         struct irq_cfg *old_cfg;
  
-       cfg = get_one_free_irq_cfg(cpu);
+       cfg = get_one_free_irq_cfg(node);
  
         if (!cfg)
                 return;
@@ -332,7 +323,7 @@ void arch_init_copy_chip_data(struct irq_desc *old_desc,
  
         memcpy(cfg, old_cfg, sizeof(struct irq_cfg));
  
-       init_copy_irq_2_pin(old_cfg, cfg, cpu);
+       init_copy_irq_2_pin(old_cfg, cfg, node);
  }
  
  static void free_irq_cfg(struct irq_cfg *old_cfg)
@@ -356,19 +347,7 @@ void arch_free_chip_data(struct irq_desc *old_desc, struct irq_desc *desc)
                 old_desc->chip_data = NULL;
         }
  }
-
-static void
-set_extra_move_desc(struct irq_desc *desc, const struct cpumask *mask)
-{
-       struct irq_cfg *cfg = desc->chip_data;
-
-       if (!cfg->move_in_progress) {
-               /* it means that domain is not changed */
-               if (!cpumask_intersects(desc->affinity, mask))
-                       cfg->move_desc_pending = 1;
-       }
-}
-#endif
+/* end for move_irq_desc */
  
  #else
  static struct irq_cfg *irq_cfg(unsigned int irq)
@@ -378,13 +357,6 @@ static struct irq_cfg *irq_cfg(unsigned int irq)
  
  #endif
  
-#ifndef CONFIG_NUMA_MIGRATE_IRQ_DESC
-static inline void
-set_extra_move_desc(struct irq_desc *desc, const struct cpumask *mask)
-{
-}
-#endif
-
  struct io_apic {
         unsigned int index;
         unsigned int unused[3];
@@ -518,132 +490,18 @@ static void ioapic_mask_entry(int apic, int pin)
         spin_unlock_irqrestore(&ioapic_lock, flags);
  }
  
-#ifdef CONFIG_SMP
-static void send_cleanup_vector(struct irq_cfg *cfg)
-{
-       cpumask_var_t cleanup_mask;
-
-       if (unlikely(!alloc_cpumask_var(&cleanup_mask, GFP_ATOMIC))) {
-               unsigned int i;
-               cfg->move_cleanup_count = 0;
-               for_each_cpu_and(i, cfg->old_domain, cpu_online_mask)
-                       cfg->move_cleanup_count++;
-               for_each_cpu_and(i, cfg->old_domain, cpu_online_mask)
-                       apic->send_IPI_mask(cpumask_of(i), IRQ_MOVE_CLEANUP_VECTOR);
-       } else {
-               cpumask_and(cleanup_mask, cfg->old_domain, cpu_online_mask);
-               cfg->move_cleanup_count = cpumask_weight(cleanup_mask);
-               apic->send_IPI_mask(cleanup_mask, IRQ_MOVE_CLEANUP_VECTOR);
-               free_cpumask_var(cleanup_mask);
-       }
-       cfg->move_in_progress = 0;
-}
-
-static void __target_IO_APIC_irq(unsigned int irq, unsigned int dest, struct irq_cfg *cfg)
-{
-       int apic, pin;
-       struct irq_pin_list *entry;
-       u8 vector = cfg->vector;
-
-       entry = cfg->irq_2_pin;
-       for (;;) {
-               unsigned int reg;
-
-               if (!entry)
-                       break;
-
-               apic = entry->apic;
-               pin = entry->pin;
-               /*
-                * With interrupt-remapping, destination information comes
-                * from interrupt-remapping table entry.
-                */
-               if (!irq_remapped(irq))
-                       io_apic_write(apic, 0x11 + pin*2, dest);
-               reg = io_apic_read(apic, 0x10 + pin*2);
-               reg &= ~IO_APIC_REDIR_VECTOR_MASK;
-               reg |= vector;
-               io_apic_modify(apic, 0x10 + pin*2, reg);
-               if (!entry->next)
-                       break;
-               entry = entry->next;
-       }
-}
-
-static int
-assign_irq_vector(int irq, struct irq_cfg *cfg, const struct cpumask *mask);
-
-/*
- * Either sets desc->affinity to a valid value, and returns
- * ->cpu_mask_to_apicid of that, or returns BAD_APICID and
- * leaves desc->affinity untouched.
- */
-static unsigned int
-set_desc_affinity(struct irq_desc *desc, const struct cpumask *mask)
-{
-       struct irq_cfg *cfg;
-       unsigned int irq;
-
-       if (!cpumask_intersects(mask, cpu_online_mask))
-               return BAD_APICID;
-
-       irq = desc->irq;
-       cfg = desc->chip_data;
-       if (assign_irq_vector(irq, cfg, mask))
-               return BAD_APICID;
-
-       /* check that before desc->addinity get updated */
-       set_extra_move_desc(desc, mask);
-
-       cpumask_copy(desc->affinity, mask);
-
-       return apic->cpu_mask_to_apicid_and(desc->affinity, cfg->domain);
-}
-
-static void
-set_ioapic_affinity_irq_desc(struct irq_desc *desc, const struct cpumask *mask)
-{
-       struct irq_cfg *cfg;
-       unsigned long flags;
-       unsigned int dest;
-       unsigned int irq;
-
-       irq = desc->irq;
-       cfg = desc->chip_data;
-
-       spin_lock_irqsave(&ioapic_lock, flags);
-       dest = set_desc_affinity(desc, mask);
-       if (dest != BAD_APICID) {
-               /* Only the high 8 bits are valid. */
-               dest = SET_APIC_LOGICAL_ID(dest);
-               __target_IO_APIC_irq(irq, dest, cfg);
-       }
-       spin_unlock_irqrestore(&ioapic_lock, flags);
-}
-
-static void
-set_ioapic_affinity_irq(unsigned int irq, const struct cpumask *mask)
-{
-       struct irq_desc *desc;
-
-       desc = irq_to_desc(irq);
-
-       set_ioapic_affinity_irq_desc(desc, mask);
-}
-#endif /* CONFIG_SMP */
-
  /*
   * The common case is 1:1 IRQ<->pin mappings. Sometimes there are
   * shared ISA-space IRQs, so we have to support them. We are super
   * fast in the common case, and fast for shared ISA-space IRQs.
   */
-static void add_pin_to_irq_cpu(struct irq_cfg *cfg, int cpu, int apic, int pin)
+static void add_pin_to_irq_node(struct irq_cfg *cfg, int node, int apic, int pin)
  {
         struct irq_pin_list *entry;
  
         entry = cfg->irq_2_pin;
         if (!entry) {
-               entry = get_one_free_irq_2_pin(cpu);
+               entry = get_one_free_irq_2_pin(node);
                 if (!entry) {
                         printk(KERN_ERR "can not alloc irq_2_pin to add %d - %d\n",
                                         apic, pin);
@@ -663,7 +521,7 @@ static void add_pin_to_irq_cpu(struct irq_cfg *cfg, int cpu, int apic, int pin)
                 entry = entry->next;
         }
  
-       entry->next = get_one_free_irq_2_pin(cpu);
+       entry->next = get_one_free_irq_2_pin(node);
         entry = entry->next;
         entry->apic = apic;
         entry->pin = pin;
@@ -672,7 +530,7 @@ static void add_pin_to_irq_cpu(struct irq_cfg *cfg, int cpu, int apic, int pin)
  /*
   * Reroute an IRQ to a different pin.
   */
-static void __init replace_pin_at_irq_cpu(struct irq_cfg *cfg, int cpu,
+static void __init replace_pin_at_irq_node(struct irq_cfg *cfg, int node,
                                       int oldapic, int oldpin,
                                       int newapic, int newpin)
  {
@@ -692,7 +550,7 @@ static void __init replace_pin_at_irq_cpu(struct irq_cfg *cfg, int cpu,
  
         /* why? call replace before add? */
         if (!replaced)
-               add_pin_to_irq_cpu(cfg, cpu, newapic, newpin);
+               add_pin_to_irq_node(cfg, node, newapic, newpin);
  }
  
  static inline void io_apic_modify_irq(struct irq_cfg *cfg,
@@ -850,7 +708,6 @@ static int __init ioapic_pirq_setup(char *str)
  __setup("pirq=", ioapic_pirq_setup);
  #endif /* CONFIG_X86_32 */
  
-#ifdef CONFIG_INTR_REMAP
  struct IO_APIC_route_entry **alloc_ioapic_entries(void)
  {
         int apic;
@@ -948,20 +805,6 @@ int restore_IO_APIC_setup(struct IO_APIC_route_entry **ioapic_entries)
         return 0;
  }
  
-void reinit_intr_remapped_IO_APIC(int intr_remapping,
-       struct IO_APIC_route_entry **ioapic_entries)
-
-{
-       /*
-        * for now plain restore of previous settings.
-        * TBD: In the case of OS enabling interrupt-remapping,
-        * IO-APIC RTE's need to be setup to point to interrupt-remapping
-        * table entries. for now, do a plain restore, and wait for
-        * the setup_IO_APIC_irqs() to do proper initialization.
-        */
-       restore_IO_APIC_setup(ioapic_entries);
-}
-
  void free_ioapic_entries(struct IO_APIC_route_entry **ioapic_entries)
  {
         int apic;
@@ -971,7 +814,6 @@ void free_ioapic_entries(struct IO_APIC_route_entry **ioapic_entries)
  
         kfree(ioapic_entries);
  }
-#endif
  
  /*
   * Find the IRQ entry number of a certain pin.
@@ -1032,54 +874,6 @@ static int __init find_isa_irq_apic(int irq, int type)
         return -1;
  }
  
-/*
- * Find a specific PCI IRQ entry.
- * Not an __init, possibly needed by modules
- */
-static int pin_2_irq(int idx, int apic, int pin);
-
-int IO_APIC_get_PCI_irq_vector(int bus, int slot, int pin)
-{
-       int apic, i, best_guess = -1;
-
-       apic_printk(APIC_DEBUG, "querying PCI -> IRQ mapping bus:%d, slot:%d, pin:%d.\n",
-               bus, slot, pin);
-       if (test_bit(bus, mp_bus_not_pci)) {
-               apic_printk(APIC_VERBOSE, "PCI BIOS passed nonexistent PCI bus %d!\n", bus);
-               return -1;
-       }
-       for (i = 0; i < mp_irq_entries; i++) {
-               int lbus = mp_irqs[i].srcbus;
-
-               for (apic = 0; apic < nr_ioapics; apic++)
-                       if (mp_ioapics[apic].apicid == mp_irqs[i].dstapic ||
-                           mp_irqs[i].dstapic == MP_APIC_ALL)
-                               break;
-
-               if (!test_bit(lbus, mp_bus_not_pci) &&
-                   !mp_irqs[i].irqtype &&
-                   (bus == lbus) &&
-                   (slot == ((mp_irqs[i].srcbusirq >> 2) & 0x1f))) {
-                       int irq = pin_2_irq(i, apic, mp_irqs[i].dstirq);
-
-                       if (!(apic || IO_APIC_IRQ(irq)))
-                               continue;
-
-                       if (pin == (mp_irqs[i].srcbusirq & 3))
-                               return irq;
-                       /*
-                        * Use the first all-but-pin matching entry as a
-                        * best-guess fuzzy result for broken mptables.
-                        */
-                       if (best_guess < 0)
-                               best_guess = irq;
-               }
-       }
-       return best_guess;
-}
-
-EXPORT_SYMBOL(IO_APIC_get_PCI_irq_vector);
-
  #if defined(CONFIG_EISA) || defined(CONFIG_MCA)
  /*
   * EISA Edge/Level control register, ELCR
@@ -1298,6 +1092,64 @@ static int pin_2_irq(int idx, int apic, int pin)
         return irq;
  }
  
+/*
+ * Find a specific PCI IRQ entry.
+ * Not an __init, possibly needed by modules
+ */
+int IO_APIC_get_PCI_irq_vector(int bus, int slot, int pin,
+                               struct io_apic_irq_attr *irq_attr)
+{
+       int apic, i, best_guess = -1;
+
+       apic_printk(APIC_DEBUG,
+                   "querying PCI -> IRQ mapping bus:%d, slot:%d, pin:%d.\n",
+                   bus, slot, pin);
+       if (test_bit(bus, mp_bus_not_pci)) {
+               apic_printk(APIC_VERBOSE,
+                           "PCI BIOS passed nonexistent PCI bus %d!\n", bus);
+               return -1;
+       }
+       for (i = 0; i < mp_irq_entries; i++) {
+               int lbus = mp_irqs[i].srcbus;
+
+               for (apic = 0; apic < nr_ioapics; apic++)
+                       if (mp_ioapics[apic].apicid == mp_irqs[i].dstapic ||
+                           mp_irqs[i].dstapic == MP_APIC_ALL)
+                               break;
+
+               if (!test_bit(lbus, mp_bus_not_pci) &&
+                   !mp_irqs[i].irqtype &&
+                   (bus == lbus) &&
+                   (slot == ((mp_irqs[i].srcbusirq >> 2) & 0x1f))) {
+                       int irq = pin_2_irq(i, apic, mp_irqs[i].dstirq);
+
+                       if (!(apic || IO_APIC_IRQ(irq)))
+                               continue;
+
+                       if (pin == (mp_irqs[i].srcbusirq & 3)) {
+                               set_io_apic_irq_attr(irq_attr, apic,
+                                                    mp_irqs[i].dstirq,
+                                                    irq_trigger(i),
+                                                    irq_polarity(i));
+                               return irq;
+                       }
+                       /*
+                        * Use the first all-but-pin matching entry as a
+                        * best-guess fuzzy result for broken mptables.
+                        */
+                       if (best_guess < 0) {
+                               set_io_apic_irq_attr(irq_attr, apic,
+                                                    mp_irqs[i].dstirq,
+                                                    irq_trigger(i),
+                                                    irq_polarity(i));
+                               best_guess = irq;
+                       }
+               }
+       }
+       return best_guess;
+}
+EXPORT_SYMBOL(IO_APIC_get_PCI_irq_vector);
+
  void lock_vector_lock(void)
  {
         /* Used to the online set of cpus does not change
@@ -1628,58 +1480,70 @@ static void setup_IO_APIC_irq(int apic_id, int pin, unsigned int irq, struct irq
         ioapic_write_entry(apic_id, pin, entry);
  }
  
+static struct {
+       DECLARE_BITMAP(pin_programmed, MP_MAX_IOAPIC_PIN + 1);
+} mp_ioapic_routing[MAX_IO_APICS];
+
  static void __init setup_IO_APIC_irqs(void)
  {
-       int apic_id, pin, idx, irq;
+       int apic_id = 0, pin, idx, irq;
         int notcon = 0;
         struct irq_desc *desc;
         struct irq_cfg *cfg;
-       int cpu = boot_cpu_id;
+       int node = cpu_to_node(boot_cpu_id);
  
         apic_printk(APIC_VERBOSE, KERN_DEBUG "init IO_APIC IRQs\n");
  
-       for (apic_id = 0; apic_id < nr_ioapics; apic_id++) {
-               for (pin = 0; pin < nr_ioapic_registers[apic_id]; pin++) {
-
-                       idx = find_irq_entry(apic_id, pin, mp_INT);
-                       if (idx == -1) {
-                               if (!notcon) {
-                                       notcon = 1;
-                                       apic_printk(APIC_VERBOSE,
-                                               KERN_DEBUG " %d-%d",
-                                               mp_ioapics[apic_id].apicid, pin);
-                               } else
-                                       apic_printk(APIC_VERBOSE, " %d-%d",
-                                               mp_ioapics[apic_id].apicid, pin);
-                               continue;
-                       }
-                       if (notcon) {
-                               apic_printk(APIC_VERBOSE,
-                                       " (apicid-pin) not connected\n");
-                               notcon = 0;
-                       }
+#ifdef CONFIG_ACPI
+       if (!acpi_disabled && acpi_ioapic) {
+               apic_id = mp_find_ioapic(0);
+               if (apic_id < 0)
+                       apic_id = 0;
+       }
+#endif
  
-                       irq = pin_2_irq(idx, apic_id, pin);
+       for (pin = 0; pin < nr_ioapic_registers[apic_id]; pin++) {
+               idx = find_irq_entry(apic_id, pin, mp_INT);
+               if (idx == -1) {
+                       if (!notcon) {
+                               notcon = 1;
+                               apic_printk(APIC_VERBOSE,
+                                       KERN_DEBUG " %d-%d",
+                                       mp_ioapics[apic_id].apicid, pin);
+                       } else
+                               apic_printk(APIC_VERBOSE, " %d-%d",
+                                       mp_ioapics[apic_id].apicid, pin);
+                       continue;
+               }
+               if (notcon) {
+                       apic_printk(APIC_VERBOSE,
+                               " (apicid-pin) not connected\n");
+                       notcon = 0;
+               }
  
-                       /*
-                        * Skip the timer IRQ if there's a quirk handler
-                        * installed and if it returns 1:
-                        */
-                       if (apic->multi_timer_check &&
-                                       apic->multi_timer_check(apic_id, irq))
-                               continue;
+               irq = pin_2_irq(idx, apic_id, pin);
  
-                       desc = irq_to_desc_alloc_cpu(irq, cpu);
-                       if (!desc) {
-                               printk(KERN_INFO "can not get irq_desc for %d\n", irq);
-                               continue;
-                       }
-                       cfg = desc->chip_data;
-                       add_pin_to_irq_cpu(cfg, cpu, apic_id, pin);
+               /*
+                * Skip the timer IRQ if there's a quirk handler
+                * installed and if it returns 1:
+                */
+               if (apic->multi_timer_check &&
+                               apic->multi_timer_check(apic_id, irq))
+                       continue;
  
-                       setup_IO_APIC_irq(apic_id, pin, irq, desc,
-                                       irq_trigger(idx), irq_polarity(idx));
+               desc = irq_to_desc_alloc_node(irq, node);
+               if (!desc) {
+                       printk(KERN_INFO "can not get irq_desc for %d\n", irq);
+                       continue;
                 }
+               cfg = desc->chip_data;
+               add_pin_to_irq_node(cfg, node, apic_id, pin);
+               /*
+                * don't mark it in pin_programmed, so later acpi could
+                * set it correctly when irq < 16
+                */
+               setup_IO_APIC_irq(apic_id, pin, irq, desc,
+                               irq_trigger(idx), irq_polarity(idx));
         }
  
         if (notcon)
@@ -1869,7 +1733,7 @@ __apicdebuginit(void) print_APIC_bitfield(int base)
  
  __apicdebuginit(void) print_local_APIC(void *dummy)
  {
-       unsigned int v, ver, maxlvt;
+       unsigned int i, v, ver, maxlvt;
         u64 icr;
  
         if (apic_verbosity == APIC_QUIET)
@@ -1957,6 +1821,18 @@ __apicdebuginit(void) print_local_APIC(void *dummy)
         printk(KERN_DEBUG "... APIC TMCCT: %08x\n", v);
         v = apic_read(APIC_TDCR);
         printk(KERN_DEBUG "... APIC TDCR: %08x\n", v);
+
+       if (boot_cpu_has(X86_FEATURE_EXTAPIC)) {
+               v = apic_read(APIC_EFEAT);
+               maxlvt = (v >> 16) & 0xff;
+               printk(KERN_DEBUG "... APIC EFEAT: %08x\n", v);
+               v = apic_read(APIC_ECTRL);
+               printk(KERN_DEBUG "... APIC ECTRL: %08x\n", v);
+               for (i = 0; i < maxlvt; i++) {
+                       v = apic_read(APIC_EILVTn(i));
+                       printk(KERN_DEBUG "... APIC EILVT%d: %08x\n", i, v);
+               }
+       }
         printk("\n");
  }
  
@@ -2005,6 +1881,11 @@ __apicdebuginit(void) print_PIC(void)
  __apicdebuginit(int) print_all_ICs(void)
  {
         print_PIC();
+
+       /* don't print out if apic is not there */
+       if (!cpu_has_apic || disable_apic)
+               return 0;
+
         print_all_local_APICs();
         print_IO_APIC();
  
@@ -2360,9 +2241,121 @@ static int ioapic_retrigger_irq(unsigned int irq)
   */
  
  #ifdef CONFIG_SMP
+static void send_cleanup_vector(struct irq_cfg *cfg)
+{
+       cpumask_var_t cleanup_mask;
+
+       if (unlikely(!alloc_cpumask_var(&cleanup_mask, GFP_ATOMIC))) {
+               unsigned int i;
+               cfg->move_cleanup_count = 0;
+               for_each_cpu_and(i, cfg->old_domain, cpu_online_mask)
+                       cfg->move_cleanup_count++;
+               for_each_cpu_and(i, cfg->old_domain, cpu_online_mask)
+                       apic->send_IPI_mask(cpumask_of(i), IRQ_MOVE_CLEANUP_VECTOR);
+       } else {
+               cpumask_and(cleanup_mask, cfg->old_domain, cpu_online_mask);
+               cfg->move_cleanup_count = cpumask_weight(cleanup_mask);
+               apic->send_IPI_mask(cleanup_mask, IRQ_MOVE_CLEANUP_VECTOR);
+               free_cpumask_var(cleanup_mask);
+       }
+       cfg->move_in_progress = 0;
+}
+
+static void __target_IO_APIC_irq(unsigned int irq, unsigned int dest, struct irq_cfg *cfg)
+{
+       int apic, pin;
+       struct irq_pin_list *entry;
+       u8 vector = cfg->vector;
+
+       entry = cfg->irq_2_pin;
+       for (;;) {
+               unsigned int reg;
+
+               if (!entry)
+                       break;
+
+               apic = entry->apic;
+               pin = entry->pin;
+               /*
+                * With interrupt-remapping, destination information comes
+                * from interrupt-remapping table entry.
+                */
+               if (!irq_remapped(irq))
+                       io_apic_write(apic, 0x11 + pin*2, dest);
+               reg = io_apic_read(apic, 0x10 + pin*2);
+               reg &= ~IO_APIC_REDIR_VECTOR_MASK;
+               reg |= vector;
+               io_apic_modify(apic, 0x10 + pin*2, reg);
+               if (!entry->next)
+                       break;
+               entry = entry->next;
+       }
+}
+
+static int
+assign_irq_vector(int irq, struct irq_cfg *cfg, const struct cpumask *mask);
+
+/*
+ * Either sets desc->affinity to a valid value, and returns
+ * ->cpu_mask_to_apicid of that, or returns BAD_APICID and
+ * leaves desc->affinity untouched.
+ */
+static unsigned int
+set_desc_affinity(struct irq_desc *desc, const struct cpumask *mask)
+{
+       struct irq_cfg *cfg;
+       unsigned int irq;
+
+       if (!cpumask_intersects(mask, cpu_online_mask))
+               return BAD_APICID;
+
+       irq = desc->irq;
+       cfg = desc->chip_data;
+       if (assign_irq_vector(irq, cfg, mask))
+               return BAD_APICID;
+
+       cpumask_copy(desc->affinity, mask);
+
+       return apic->cpu_mask_to_apicid_and(desc->affinity, cfg->domain);
+}
+
+static int
+set_ioapic_affinity_irq_desc(struct irq_desc *desc, const struct cpumask *mask)
+{
+       struct irq_cfg *cfg;
+       unsigned long flags;
+       unsigned int dest;
+       unsigned int irq;
+       int ret = -1;
+
+       irq = desc->irq;
+       cfg = desc->chip_data;
+
+       spin_lock_irqsave(&ioapic_lock, flags);
+       dest = set_desc_affinity(desc, mask);
+       if (dest != BAD_APICID) {
+               /* Only the high 8 bits are valid. */
+               dest = SET_APIC_LOGICAL_ID(dest);
+               __target_IO_APIC_irq(irq, dest, cfg);
+               ret = 0;
+       }
+       spin_unlock_irqrestore(&ioapic_lock, flags);
+
+       return ret;
+}
+
+static int
+set_ioapic_affinity_irq(unsigned int irq, const struct cpumask *mask)
+{
+       struct irq_desc *desc;
+
+       desc = irq_to_desc(irq);
+
+       return set_ioapic_affinity_irq_desc(desc, mask);
+}
+
+#ifdef CONFIG_INTR_REMAP
  
-#ifdef CONFIG_INTR_REMAP
-
  /*
   * Migrate the IO-APIC irq in the presence of intr-remapping.
   *
@@ -2374,26 +2367,25 @@ static int ioapic_retrigger_irq(unsigned int irq)
   * Real vector that is used for interrupting cpu will be coming from
   * the interrupt-remapping table entry.
   */
-static void
+static int
  migrate_ioapic_irq_desc(struct irq_desc *desc, const struct cpumask *mask)
  {
         struct irq_cfg *cfg;
         struct irte irte;
         unsigned int dest;
         unsigned int irq;
+       int ret = -1;
  
         if (!cpumask_intersects(mask, cpu_online_mask))
-               return;
+               return ret;
  
         irq = desc->irq;
         if (get_irte(irq, &irte))
-               return;
+               return ret;
  
         cfg = desc->chip_data;
         if (assign_irq_vector(irq, cfg, mask))
-               return;
-
-       set_extra_move_desc(desc, mask);
+               return ret;
  
         dest = apic->cpu_mask_to_apicid_and(cfg->domain, mask);
  
@@ -2409,27 +2401,30 @@ migrate_ioapic_irq_desc(struct irq_desc *desc, const struct cpumask *mask)
                 send_cleanup_vector(cfg);
  
         cpumask_copy(desc->affinity, mask);
+
+       return 0;
  }
  
  /*
   * Migrates the IRQ destination in the process context.
   */
-static void set_ir_ioapic_affinity_irq_desc(struct irq_desc *desc,
+static int set_ir_ioapic_affinity_irq_desc(struct irq_desc *desc,
                                             const struct cpumask *mask)
  {
-       migrate_ioapic_irq_desc(desc, mask);
+       return migrate_ioapic_irq_desc(desc, mask);
  }
-static void set_ir_ioapic_affinity_irq(unsigned int irq,
+static int set_ir_ioapic_affinity_irq(unsigned int irq,
                                        const struct cpumask *mask)
  {
         struct irq_desc *desc = irq_to_desc(irq);
  
-       set_ir_ioapic_affinity_irq_desc(desc, mask);
+       return set_ir_ioapic_affinity_irq_desc(desc, mask);
  }
  #else
-static inline void set_ir_ioapic_affinity_irq_desc(struct irq_desc *desc,
+static inline int set_ir_ioapic_affinity_irq_desc(struct irq_desc *desc,
                                                    const struct cpumask *mask)
  {
+       return 0;
  }
  #endif
  
@@ -2491,86 +2486,19 @@ static void irq_complete_move(struct irq_desc **descp)
         struct irq_cfg *cfg = desc->chip_data;
         unsigned vector, me;
  
-       if (likely(!cfg->move_in_progress)) {
-#ifdef CONFIG_NUMA_MIGRATE_IRQ_DESC
-               if (likely(!cfg->move_desc_pending))
-                       return;
-
-               /* domain has not changed, but affinity did */
-               me = smp_processor_id();
-               if (cpumask_test_cpu(me, desc->affinity)) {
-                       *descp = desc = move_irq_desc(desc, me);
-                       /* get the new one */
-                       cfg = desc->chip_data;
-                       cfg->move_desc_pending = 0;
-               }
-#endif
+       if (likely(!cfg->move_in_progress))
                 return;
-       }
  
         vector = ~get_irq_regs()->orig_ax;
         me = smp_processor_id();
  
-       if (vector == cfg->vector && cpumask_test_cpu(me, cfg->domain)) {
-#ifdef CONFIG_NUMA_MIGRATE_IRQ_DESC
-               *descp = desc = move_irq_desc(desc, me);
-               /* get the new one */
-               cfg = desc->chip_data;
-#endif
+       if (vector == cfg->vector && cpumask_test_cpu(me, cfg->domain))
                 send_cleanup_vector(cfg);
-       }
  }
  #else
  static inline void irq_complete_move(struct irq_desc **descp) {}
  #endif
  
-static void __eoi_ioapic_irq(unsigned int irq, struct irq_cfg *cfg)
-{
-       int apic, pin;
-       struct irq_pin_list *entry;
-
-       entry = cfg->irq_2_pin;
-       for (;;) {
-
-               if (!entry)
-                       break;
-
-               apic = entry->apic;
-               pin = entry->pin;
-               io_apic_eoi(apic, pin);
-               entry = entry->next;
-       }
-}
-
-static void
-eoi_ioapic_irq(struct irq_desc *desc)
-{
-       struct irq_cfg *cfg;
-       unsigned long flags;
-       unsigned int irq;
-
-       irq = desc->irq;
-       cfg = desc->chip_data;
-
-       spin_lock_irqsave(&ioapic_lock, flags);
-       __eoi_ioapic_irq(irq, cfg);
-       spin_unlock_irqrestore(&ioapic_lock, flags);
-}
-
-#ifdef CONFIG_X86_X2APIC
-static void ack_x2apic_level(unsigned int irq)
-{
-       struct irq_desc *desc = irq_to_desc(irq);
-       ack_x2APIC_irq();
-       eoi_ioapic_irq(desc);
-}
-
-static void ack_x2apic_edge(unsigned int irq)
-{
-       ack_x2APIC_irq();
-}
-#endif
-
  static void ack_apic_edge(unsigned int irq)
  {
         struct irq_desc *desc = irq_to_desc(irq);
@@ -2634,9 +2562,6 @@ static void ack_apic_level(unsigned int irq)
          */
         ack_APIC_irq();
  
-       if (irq_remapped(irq))
-               eoi_ioapic_irq(desc);
-
         /* Now we can move and renable the irq */
         if (unlikely(do_unmask_irq)) {
                 /* Only migrate the irq if the ack has been received.
@@ -2683,22 +2608,50 @@ static void ack_apic_level(unsigned int irq)
  }
  
  #ifdef CONFIG_INTR_REMAP
+static void __eoi_ioapic_irq(unsigned int irq, struct irq_cfg *cfg)
+{
+       int apic, pin;
+       struct irq_pin_list *entry;
+
+       entry = cfg->irq_2_pin;
+       for (;;) {
+
+               if (!entry)
+                       break;
+
+               apic = entry->apic;
+               pin = entry->pin;
+               io_apic_eoi(apic, pin);
+               entry = entry->next;
+       }
+}
+
+static void
+eoi_ioapic_irq(struct irq_desc *desc)
+{
+       struct irq_cfg *cfg;
+       unsigned long flags;
+       unsigned int irq;
+
+       irq = desc->irq;
+       cfg = desc->chip_data;
+
+       spin_lock_irqsave(&ioapic_lock, flags);
+       __eoi_ioapic_irq(irq, cfg);
+       spin_unlock_irqrestore(&ioapic_lock, flags);
+}
+
  static void ir_ack_apic_edge(unsigned int irq)
  {
-#ifdef CONFIG_X86_X2APIC
-       if (x2apic_enabled())
-               return ack_x2apic_edge(irq);
-#endif
-       return ack_apic_edge(irq);
+       ack_APIC_irq();
  }
  
  static void ir_ack_apic_level(unsigned int irq)
  {
-#ifdef CONFIG_X86_X2APIC
-       if (x2apic_enabled())
-               return ack_x2apic_level(irq);
-#endif
-       return ack_apic_level(irq);
+       struct irq_desc *desc = irq_to_desc(irq);
+
+       ack_APIC_irq();
+       eoi_ioapic_irq(desc);
  }
  #endif /* CONFIG_INTR_REMAP */
  
@@ -2903,7 +2856,7 @@ static inline void __init check_timer(void)
  {
         struct irq_desc *desc = irq_to_desc(0);
         struct irq_cfg *cfg = desc->chip_data;
-       int cpu = boot_cpu_id;
+       int node = cpu_to_node(boot_cpu_id);
         int apic1, pin1, apic2, pin2;
         unsigned long flags;
         int no_pin1 = 0;
@@ -2969,7 +2922,7 @@ static inline void __init check_timer(void)
                  * Ok, does IRQ0 through the IOAPIC work?
                  */
                 if (no_pin1) {
-                       add_pin_to_irq_cpu(cfg, cpu, apic1, pin1);
+                       add_pin_to_irq_node(cfg, node, apic1, pin1);
                         setup_timer_IRQ0_pin(apic1, pin1, cfg->vector);
                 } else {
                         /* for edge trigger, setup_IO_APIC_irq already
@@ -3006,7 +2959,7 @@ static inline void __init check_timer(void)
                 /*
                  * legacy devices should be connected to IO APIC #0
                  */
-               replace_pin_at_irq_cpu(cfg, cpu, apic1, pin1, apic2, pin2);
+               replace_pin_at_irq_node(cfg, node, apic1, pin1, apic2, pin2);
                 setup_timer_IRQ0_pin(apic2, pin2, cfg->vector);
                 enable_8259A_irq(0);
                 if (timer_irq_works()) {
@@ -3218,14 +3171,13 @@ static int nr_irqs_gsi = NR_IRQS_LEGACY;
  /*
   * Dynamic irq allocate and deallocation
   */
-unsigned int create_irq_nr(unsigned int irq_want)
+unsigned int create_irq_nr(unsigned int irq_want, int node)
  {
         /* Allocate an unused irq */
         unsigned int irq;
         unsigned int new;
         unsigned long flags;
         struct irq_cfg *cfg_new = NULL;
-       int cpu = boot_cpu_id;
         struct irq_desc *desc_new = NULL;
  
         irq = 0;
@@ -3234,7 +3186,7 @@ unsigned int create_irq_nr(unsigned int irq_want)
  
         spin_lock_irqsave(&vector_lock, flags);
         for (new = irq_want; new < nr_irqs; new++) {
-               desc_new = irq_to_desc_alloc_cpu(new, cpu);
+               desc_new = irq_to_desc_alloc_node(new, node);
                 if (!desc_new) {
                         printk(KERN_INFO "can not get irq_desc for %d\n", new);
                         continue;
@@ -3243,6 +3195,9 @@ unsigned int create_irq_nr(unsigned int irq_want)
  
                 if (cfg_new->vector != 0)
                         continue;
+
+               desc_new = move_irq_desc(desc_new, node);
+
                 if (__assign_irq_vector(new, cfg_new, apic->target_cpus()) == 0)
                         irq = new;
                 break;
@@ -3260,11 +3215,12 @@ unsigned int create_irq_nr(unsigned int irq_want)
  
  int create_irq(void)
  {
+       int node = cpu_to_node(boot_cpu_id);
         unsigned int irq_want;
         int irq;
  
         irq_want = nr_irqs_gsi;
-       irq = create_irq_nr(irq_want);
+       irq = create_irq_nr(irq_want, node);
  
         if (irq == 0)
                 irq = -1;
@@ -3366,7 +3322,7 @@ static int msi_compose_msg(struct pci_dev *pdev, unsigned int irq, struct msi_ms
  }
  
  #ifdef CONFIG_SMP
-static void set_msi_irq_affinity(unsigned int irq, const struct cpumask *mask)
+static int set_msi_irq_affinity(unsigned int irq, const struct cpumask *mask)
  {
         struct irq_desc *desc = irq_to_desc(irq);
         struct irq_cfg *cfg;
@@ -3375,7 +3331,7 @@ static void set_msi_irq_affinity(unsigned int irq, const struct cpumask *mask)
  
         dest = set_desc_affinity(desc, mask);
         if (dest == BAD_APICID)
-               return;
+               return -1;
  
         cfg = desc->chip_data;
  
@@ -3387,13 +3343,15 @@ static void set_msi_irq_affinity(unsigned int irq, const struct cpumask *mask)
         msg.address_lo |= MSI_ADDR_DEST_ID(dest);
  
         write_msi_msg_desc(desc, &msg);
+
+       return 0;
  }
  #ifdef CONFIG_INTR_REMAP
  /*
   * Migrate the MSI irq to another cpumask. This migration is
   * done in the process context using interrupt-remapping hardware.
   */
-static void
+static int
  ir_set_msi_irq_affinity(unsigned int irq, const struct cpumask *mask)
  {
         struct irq_desc *desc = irq_to_desc(irq);
@@ -3402,11 +3360,11 @@ ir_set_msi_irq_affinity(unsigned int irq, const struct cpumask *mask)
         struct irte irte;
  
         if (get_irte(irq, &irte))
-               return;
+               return -1;
  
         dest = set_desc_affinity(desc, mask);
         if (dest == BAD_APICID)
-               return;
+               return -1;
  
         irte.vector = cfg->vector;
         irte.dest_id = IRTE_DEST(dest);
@@ -3423,6 +3381,8 @@ ir_set_msi_irq_affinity(unsigned int irq, const struct cpumask *mask)
          */
         if (cfg->move_in_progress)
                 send_cleanup_vector(cfg);
+
+       return 0;
  }
  
  #endif
@@ -3518,15 +3478,17 @@ int arch_setup_msi_irqs(struct pci_dev *dev, int nvec, int type)
         unsigned int irq_want;
         struct intel_iommu *iommu = NULL;
         int index = 0;
+       int node;
  
         /* x86 doesn't support multiple MSI yet */
         if (type == PCI_CAP_ID_MSI && nvec > 1)
                 return 1;
  
+       node = dev_to_node(&dev->dev);
         irq_want = nr_irqs_gsi;
         sub_handle = 0;
         list_for_each_entry(msidesc, &dev->msi_list, list) {
-               irq = create_irq_nr(irq_want);
+               irq = create_irq_nr(irq_want, node);
                 if (irq == 0)
                         return -1;
                 irq_want = irq + 1;
@@ -3576,7 +3538,7 @@ void arch_teardown_msi_irq(unsigned int irq)
  
  #if defined (CONFIG_DMAR) || defined (CONFIG_INTR_REMAP)
  #ifdef CONFIG_SMP
-static void dmar_msi_set_affinity(unsigned int irq, const struct cpumask *mask)
+static int dmar_msi_set_affinity(unsigned int irq, const struct cpumask *mask)
  {
         struct irq_desc *desc = irq_to_desc(irq);
         struct irq_cfg *cfg;
@@ -3585,7 +3547,7 @@ static void dmar_msi_set_affinity(unsigned int irq, const struct cpumask *mask)
  
         dest = set_desc_affinity(desc, mask);
         if (dest == BAD_APICID)
-               return;
+               return -1;
  
         cfg = desc->chip_data;
  
@@ -3597,6 +3559,8 @@ static void dmar_msi_set_affinity(unsigned int irq, const struct cpumask *mask)
         msg.address_lo |= MSI_ADDR_DEST_ID(dest);
  
         dmar_msi_write(irq, &msg);
+
+       return 0;
  }
  
  #endif /* CONFIG_SMP */
@@ -3630,7 +3594,7 @@ int arch_setup_dmar_msi(unsigned int irq)
  #ifdef CONFIG_HPET_TIMER
  
  #ifdef CONFIG_SMP
-static void hpet_msi_set_affinity(unsigned int irq, const struct cpumask *mask)
+static int hpet_msi_set_affinity(unsigned int irq, const struct cpumask *mask)
  {
         struct irq_desc *desc = irq_to_desc(irq);
         struct irq_cfg *cfg;
@@ -3639,7 +3603,7 @@ static void hpet_msi_set_affinity(unsigned int irq, const struct cpumask *mask)
  
         dest = set_desc_affinity(desc, mask);
         if (dest == BAD_APICID)
-               return;
+               return -1;
  
         cfg = desc->chip_data;
  
@@ -3651,6 +3615,8 @@ static void hpet_msi_set_affinity(unsigned int irq, const struct cpumask *mask)
         msg.address_lo |= MSI_ADDR_DEST_ID(dest);
  
         hpet_msi_write(irq, &msg);
+
+       return 0;
  }
  
  #endif /* CONFIG_SMP */
@@ -3707,7 +3673,7 @@ static void target_ht_irq(unsigned int irq, unsigned int dest, u8 vector)
         write_ht_irq_msg(irq, &msg);
  }
  
-static void set_ht_irq_affinity(unsigned int irq, const struct cpumask *mask)
+static int set_ht_irq_affinity(unsigned int irq, const struct cpumask *mask)
  {
         struct irq_desc *desc = irq_to_desc(irq);
         struct irq_cfg *cfg;
@@ -3715,11 +3681,13 @@ static void set_ht_irq_affinity(unsigned int irq, const struct cpumask *mask)
  
         dest = set_desc_affinity(desc, mask);
         if (dest == BAD_APICID)
-               return;
+               return -1;
  
         cfg = desc->chip_data;
  
         target_ht_irq(irq, dest, cfg->vector);
+
+       return 0;
  }
  
  #endif
@@ -3794,6 +3762,8 @@ int arch_enable_uv_irq(char *irq_name, unsigned int irq, int cpu, int mmr_blade,
         unsigned long flags;
         int err;
  
+       BUILD_BUG_ON(sizeof(struct uv_IO_APIC_route_entry) != sizeof(unsigned long));
+
         cfg = irq_cfg(irq);
  
         err = assign_irq_vector(irq, cfg, eligible_cpu);
@@ -3807,15 +3777,13 @@ int arch_enable_uv_irq(char *irq_name, unsigned int irq, int cpu, int mmr_blade,
  
         mmr_value = 0;
         entry = (struct uv_IO_APIC_route_entry *)&mmr_value;
-       BUG_ON(sizeof(struct uv_IO_APIC_route_entry) != sizeof(unsigned long));
-
-       entry->vector = cfg->vector;
-       entry->delivery_mode = apic->irq_delivery_mode;
-       entry->dest_mode = apic->irq_dest_mode;
-       entry->polarity = 0;
-       entry->trigger = 0;
-       entry->mask = 0;
-       entry->dest = apic->cpu_mask_to_apicid(eligible_cpu);
+       entry->vector           = cfg->vector;
+       entry->delivery_mode    = apic->irq_delivery_mode;
+       entry->dest_mode        = apic->irq_dest_mode;
+       entry->polarity         = 0;
+       entry->trigger          = 0;
+       entry->mask             = 0;
+       entry->dest             = apic->cpu_mask_to_apicid(eligible_cpu);
  
         mmr_pnode = uv_blade_to_pnode(mmr_blade);
         uv_write_global_mmr64(mmr_pnode, mmr_offset, mmr_value);
@@ -3833,10 +3801,10 @@ void arch_disable_uv_irq(int mmr_blade, unsigned long mmr_offset)
         struct uv_IO_APIC_route_entry *entry;
         int mmr_pnode;
  
+       BUILD_BUG_ON(sizeof(struct uv_IO_APIC_route_entry) != sizeof(unsigned long));
+
         mmr_value = 0;
         entry = (struct uv_IO_APIC_route_entry *)&mmr_value;
-       BUG_ON(sizeof(struct uv_IO_APIC_route_entry) != sizeof(unsigned long));
-
         entry->mask = 1;
  
         mmr_pnode = uv_blade_to_pnode(mmr_blade);
@@ -3900,6 +3868,71 @@ int __init arch_probe_nr_irqs(void)
  }
  #endif
  
+static int __io_apic_set_pci_routing(struct device *dev, int irq,
+                               struct io_apic_irq_attr *irq_attr)
+{
+       struct irq_desc *desc;
+       struct irq_cfg *cfg;
+       int node;
+       int ioapic, pin;
+       int trigger, polarity;
+
+       ioapic = irq_attr->ioapic;
+       if (!IO_APIC_IRQ(irq)) {
+               apic_printk(APIC_QUIET,KERN_ERR "IOAPIC[%d]: Invalid reference to IRQ 0\n",
+                       ioapic);
+               return -EINVAL;
+       }
+
+       if (dev)
+               node = dev_to_node(dev);
+       else
+               node = cpu_to_node(boot_cpu_id);
+
+       desc = irq_to_desc_alloc_node(irq, node);
+       if (!desc) {
+               printk(KERN_INFO "can not get irq_desc %d\n", irq);
+               return 0;
+       }
+
+       pin = irq_attr->ioapic_pin;
+       trigger = irq_attr->trigger;
+       polarity = irq_attr->polarity;
+
+       /*
+        * IRQs < 16 are already in the irq_2_pin[] map
+        */
+       if (irq >= NR_IRQS_LEGACY) {
+               cfg = desc->chip_data;
+               add_pin_to_irq_node(cfg, node, ioapic, pin);
+       }
+
+       setup_IO_APIC_irq(ioapic, pin, irq, desc, trigger, polarity);
+
+       return 0;
+}
+
+int io_apic_set_pci_routing(struct device *dev, int irq,
+                               struct io_apic_irq_attr *irq_attr)
+{
+       int ioapic, pin;
+       /*
+        * Avoid pin reprogramming.  PRTs typically include entries
+        * with redundant pin->gsi mappings (but unique PCI devices);
+        * we only program the IOAPIC on the first.
+        */
+       ioapic = irq_attr->ioapic;
+       pin = irq_attr->ioapic_pin;
+       if (test_bit(pin, mp_ioapic_routing[ioapic].pin_programmed)) {
+               pr_debug("Pin %d-%d already programmed\n",
+                        mp_ioapics[ioapic].apicid, pin);
+               return 0;
+       }
+       set_bit(pin, mp_ioapic_routing[ioapic].pin_programmed);
+
+       return __io_apic_set_pci_routing(dev, irq, irq_attr);
+}
+
  /* --------------------------------------------------------------------------
                            ACPI-based IOAPIC Configuration
     -------------------------------------------------------------------------- */
@@ -3980,6 +4013,7 @@ int __init io_apic_get_unique_id(int ioapic, int apic_id)
  
         return apic_id;
  }
+#endif
  
  int __init io_apic_get_version(int ioapic)
  {
@@ -3992,39 +4026,6 @@ int __init io_apic_get_version(int ioapic)
  
         return reg_01.bits.version;
  }
-#endif
-
-int io_apic_set_pci_routing (int ioapic, int pin, int irq, int triggering, int polarity)
-{
-       struct irq_desc *desc;
-       struct irq_cfg *cfg;
-       int cpu = boot_cpu_id;
-
-       if (!IO_APIC_IRQ(irq)) {
-               apic_printk(APIC_QUIET,KERN_ERR "IOAPIC[%d]: Invalid reference to IRQ 0\n",
-                       ioapic);
-               return -EINVAL;
-       }
-
-       desc = irq_to_desc_alloc_cpu(irq, cpu);
-       if (!desc) {
-               printk(KERN_INFO "can not get irq_desc %d\n", irq);
-               return 0;
-       }
-
-       /*
-        * IRQs < 16 are already in the irq_2_pin[] map
-        */
-       if (irq >= NR_IRQS_LEGACY) {
-               cfg = desc->chip_data;
-               add_pin_to_irq_cpu(cfg, cpu, ioapic, pin);
-       }
-
-       setup_IO_APIC_irq(ioapic, pin, irq, desc, triggering, polarity);
-
-       return 0;
-}
-
  
  int acpi_get_override_irq(int bus_irq, int *trigger, int *polarity)
  {
@@ -4055,51 +4056,44 @@ int acpi_get_override_irq(int bus_irq, int *trigger, int *polarity)
  #ifdef CONFIG_SMP
  void __init setup_ioapic_dest(void)
  {
-       int pin, ioapic, irq, irq_entry;
+       int pin, ioapic = 0, irq, irq_entry;
         struct irq_desc *desc;
-       struct irq_cfg *cfg;
         const struct cpumask *mask;
  
         if (skip_ioapic_setup == 1)
                 return;
  
-       for (ioapic = 0; ioapic < nr_ioapics; ioapic++) {
-               for (pin = 0; pin < nr_ioapic_registers[ioapic]; pin++) {
-                       irq_entry = find_irq_entry(ioapic, pin, mp_INT);
-                       if (irq_entry == -1)
-                               continue;
-                       irq = pin_2_irq(irq_entry, ioapic, pin);
-
-                       /* setup_IO_APIC_irqs could fail to get vector for some device
-                        * when you have too many devices, because at that time only boot
-                        * cpu is online.
-                        */
-                       desc = irq_to_desc(irq);
-                       cfg = desc->chip_data;
-                       if (!cfg->vector) {
-                               setup_IO_APIC_irq(ioapic, pin, irq, desc,
-                                                 irq_trigger(irq_entry),
-                                                 irq_polarity(irq_entry));
-                               continue;
+#ifdef CONFIG_ACPI
+       if (!acpi_disabled && acpi_ioapic) {
+               ioapic = mp_find_ioapic(0);
+               if (ioapic < 0)
+                       ioapic = 0;
+       }
+#endif
  
-                       }
+       for (pin = 0; pin < nr_ioapic_registers[ioapic]; pin++) {
+               irq_entry = find_irq_entry(ioapic, pin, mp_INT);
+               if (irq_entry == -1)
+                       continue;
+               irq = pin_2_irq(irq_entry, ioapic, pin);
  
-                       /*
-                        * Honour affinities which have been set in early boot
-                        */
-                       if (desc->status &
-                           (IRQ_NO_BALANCING | IRQ_AFFINITY_SET))
-                               mask = desc->affinity;
-                       else
-                               mask = apic->target_cpus();
+               desc = irq_to_desc(irq);
  
-                       if (intr_remapping_enabled)
-                               set_ir_ioapic_affinity_irq_desc(desc, mask);
-                       else
-                               set_ioapic_affinity_irq_desc(desc, mask);
-               }
+               /*
+                * Honour affinities which have been set in early boot
+                */
+               if (desc->status &
+                   (IRQ_NO_BALANCING | IRQ_AFFINITY_SET))
+                       mask = desc->affinity;
+               else
+                       mask = apic->target_cpus();
  
+               if (intr_remapping_enabled)
+                       set_ir_ioapic_affinity_irq_desc(desc, mask);
+               else
+                       set_ioapic_affinity_irq_desc(desc, mask);
         }
+
  }
  #endif
  
diff --git a/arch/x86/kernel/apic/nmi.c b/arch/x86/kernel/apic/nmi.c

index ce4fbfa315a16ac1f4e45f34281121577f5477a6..a691302dc3ffa4e5e4771df1fedf17919d7133c6 100644 (file)
--- a/arch/x86/kernel/apic/nmi.c
+++ b/arch/x86/kernel/apic/nmi.c
@@ -104,7 +104,7 @@ static __init void nmi_cpu_busy(void *data)
  }
  #endif
  
-static void report_broken_nmi(int cpu, int *prev_nmi_count)
+static void report_broken_nmi(int cpu, unsigned int *prev_nmi_count)
  {
         printk(KERN_CONT "\n");
  
diff --git a/arch/x86/kernel/apic/probe_32.c b/arch/x86/kernel/apic/probe_32.c

index 01eda2ac65e4d9620b19979a0a24ef4636cae889..440a8bccd91ad7ac8ae326f48fb30cab6e007b20 100644 (file)
--- a/arch/x86/kernel/apic/probe_32.c
+++ b/arch/x86/kernel/apic/probe_32.c
@@ -160,7 +160,6 @@ extern struct apic apic_summit;
  extern struct apic apic_bigsmp;
  extern struct apic apic_es7000;
  extern struct apic apic_es7000_cluster;
-extern struct apic apic_default;
  
  struct apic *apic = &apic_default;
  EXPORT_SYMBOL_GPL(apic);
diff --git a/arch/x86/kernel/apic/probe_64.c b/arch/x86/kernel/apic/probe_64.c

index 1783652bb0e56c8140e411a4ba5643a64ab4d0fc..bc3e880f9b82e76902b305d10920b70b05c440ac 100644 (file)
--- a/arch/x86/kernel/apic/probe_64.c
+++ b/arch/x86/kernel/apic/probe_64.c
@@ -50,7 +50,7 @@ static struct apic *apic_probe[] __initdata = {
  void __init default_setup_apic_routing(void)
  {
  #ifdef CONFIG_X86_X2APIC
-       if (x2apic && (apic != &apic_x2apic_phys &&
+       if (x2apic_mode && (apic != &apic_x2apic_phys &&
  #ifdef CONFIG_X86_UV
                        apic != &apic_x2apic_uv_x &&
  #endif
diff --git a/arch/x86/kernel/apic/summit_32.c b/arch/x86/kernel/apic/summit_32.c

index 9cfe1f415d81f3659f1f68b8f88dcb4365be3ff7..344eee4ac0a48242d64db5676ec38fbe92d42c73 100644 (file)
--- a/arch/x86/kernel/apic/summit_32.c
+++ b/arch/x86/kernel/apic/summit_32.c
@@ -173,13 +173,6 @@ static inline int is_WPEG(struct rio_detail *rio){
                 rio->type == LookOutAWPEG || rio->type == LookOutBWPEG);
  }
  
-
-/* In clustered mode, the high nibble of APIC ID is a cluster number.
- * The low nibble is a 4-bit bitmap. */
-#define XAPIC_DEST_CPUS_SHIFT  4
-#define XAPIC_DEST_CPUS_MASK   ((1u << XAPIC_DEST_CPUS_SHIFT) - 1)
-#define XAPIC_DEST_CLUSTER_MASK        (XAPIC_DEST_CPUS_MASK << XAPIC_DEST_CPUS_SHIFT)
-
  #define SUMMIT_APIC_DFR_VALUE  (APIC_DFR_CLUSTER)
  
  static const struct cpumask *summit_target_cpus(void)
diff --git a/arch/x86/kernel/apic/x2apic_cluster.c b/arch/x86/kernel/apic/x2apic_cluster.c

index 4a903e2f0d179d68d6a30f5afe7c1b0543a45870..8e4cbb255c38d289f78083828845da5ae43b06ac 100644 (file)
--- a/arch/x86/kernel/apic/x2apic_cluster.c
+++ b/arch/x86/kernel/apic/x2apic_cluster.c
@@ -10,7 +10,7 @@
  #include <asm/apic.h>
  #include <asm/ipi.h>
  
-DEFINE_PER_CPU(u32, x86_cpu_to_logical_apicid);
+static DEFINE_PER_CPU(u32, x86_cpu_to_logical_apicid);
  
  static int x2apic_acpi_madt_oem_check(char *oem_id, char *oem_table_id)
  {
diff --git a/arch/x86/kernel/apic/x2apic_uv_x.c b/arch/x86/kernel/apic/x2apic_uv_x.c

index 2bda693529762b71f30ae2c26fa08a7692ca5cb5..ef0ae207a7c82cb64f7a0a7758a91dc22b0f8fee 100644 (file)
--- a/arch/x86/kernel/apic/x2apic_uv_x.c
+++ b/arch/x86/kernel/apic/x2apic_uv_x.c
@@ -105,7 +105,7 @@ static void uv_vector_allocation_domain(int cpu, struct cpumask *retmask)
         cpumask_set_cpu(cpu, retmask);
  }
  
-static int uv_wakeup_secondary(int phys_apicid, unsigned long start_rip)
+static int __cpuinit uv_wakeup_secondary(int phys_apicid, unsigned long start_rip)
  {
  #ifdef CONFIG_SMP
         unsigned long val;
@@ -562,7 +562,7 @@ void __init uv_system_init(void)
         union uvh_node_id_u node_id;
         unsigned long gnode_upper, lowmem_redir_base, lowmem_redir_size;
         int bytes, nid, cpu, lcpu, pnode, blade, i, j, m_val, n_val;
-       int max_pnode = 0;
+       int gnode_extra, max_pnode = 0;
         unsigned long mmr_base, present, paddr;
         unsigned short pnode_mask;
  
@@ -574,6 +574,13 @@ void __init uv_system_init(void)
         mmr_base =
             uv_read_local_mmr(UVH_RH_GAM_MMR_OVERLAY_CONFIG_MMR) &
             ~UV_MMR_ENABLE;
+       pnode_mask = (1 << n_val) - 1;
+       node_id.v = uv_read_local_mmr(UVH_NODE_ID);
+       gnode_extra = (node_id.s.node_id & ~((1 << n_val) - 1)) >> 1;
+       gnode_upper = ((unsigned long)gnode_extra  << m_val);
+       printk(KERN_DEBUG "UV: N %d, M %d, gnode_upper 0x%lx, gnode_extra 0x%x\n",
+                       n_val, m_val, gnode_upper, gnode_extra);
+
         printk(KERN_DEBUG "UV: global MMR base 0x%lx\n", mmr_base);
  
         for(i = 0; i < UVH_NODE_PRESENT_TABLE_DEPTH; i++)
@@ -583,15 +590,18 @@ void __init uv_system_init(void)
  
         bytes = sizeof(struct uv_blade_info) * uv_num_possible_blades();
         uv_blade_info = kmalloc(bytes, GFP_KERNEL);
+       BUG_ON(!uv_blade_info);
  
         get_lowmem_redirect(&lowmem_redir_base, &lowmem_redir_size);
  
         bytes = sizeof(uv_node_to_blade[0]) * num_possible_nodes();
         uv_node_to_blade = kmalloc(bytes, GFP_KERNEL);
+       BUG_ON(!uv_node_to_blade);
         memset(uv_node_to_blade, 255, bytes);
  
         bytes = sizeof(uv_cpu_to_blade[0]) * num_possible_cpus();
         uv_cpu_to_blade = kmalloc(bytes, GFP_KERNEL);
+       BUG_ON(!uv_cpu_to_blade);
         memset(uv_cpu_to_blade, 255, bytes);
  
         blade = 0;
@@ -607,11 +617,6 @@ void __init uv_system_init(void)
                 }
         }
  
-       pnode_mask = (1 << n_val) - 1;
-       node_id.v = uv_read_local_mmr(UVH_NODE_ID);
-       gnode_upper = (((unsigned long)node_id.s.node_id) &
-                      ~((1 << n_val) - 1)) << m_val;
-
         uv_bios_init();
         uv_bios_get_sn_info(0, &uv_type, &sn_partition_id,
                             &sn_coherency_id, &sn_region_size);
@@ -634,6 +639,7 @@ void __init uv_system_init(void)
                 uv_cpu_hub_info(cpu)->pnode_mask = pnode_mask;
                 uv_cpu_hub_info(cpu)->gpa_mask = (1 << (m_val + n_val)) - 1;
                 uv_cpu_hub_info(cpu)->gnode_upper = gnode_upper;
+               uv_cpu_hub_info(cpu)->gnode_extra = gnode_extra;
                 uv_cpu_hub_info(cpu)->global_mmr_base = mmr_base;
                 uv_cpu_hub_info(cpu)->coherency_domain_number = sn_coherency_id;
                 uv_cpu_hub_info(cpu)->scir.offset = SCIR_LOCAL_MMR_BASE + lcpu;
diff --git a/arch/x86/kernel/asm-offsets_32.c b/arch/x86/kernel/asm-offsets_32.c

index 5a6aa1c1162f06492a27cee3ac754b16e7fa57e0..1a830cbd70153b8662eddf16a87a5336c6cb0742 100644 (file)
--- a/arch/x86/kernel/asm-offsets_32.c
+++ b/arch/x86/kernel/asm-offsets_32.c
@@ -146,4 +146,5 @@ void foo(void)
         OFFSET(BP_loadflags, boot_params, hdr.loadflags);
         OFFSET(BP_hardware_subarch, boot_params, hdr.hardware_subarch);
         OFFSET(BP_version, boot_params, hdr.version);
+       OFFSET(BP_kernel_alignment, boot_params, hdr.kernel_alignment);
  }
diff --git a/arch/x86/kernel/asm-offsets_64.c b/arch/x86/kernel/asm-offsets_64.c

index e72f062fb4b5fd841617a3e7f7c766e64bf4428f..898ecc47e129e8f8e0848287a518f2af137f022b 100644 (file)
--- a/arch/x86/kernel/asm-offsets_64.c
+++ b/arch/x86/kernel/asm-offsets_64.c
@@ -125,6 +125,7 @@ int main(void)
         OFFSET(BP_loadflags, boot_params, hdr.loadflags);
         OFFSET(BP_hardware_subarch, boot_params, hdr.hardware_subarch);
         OFFSET(BP_version, boot_params, hdr.version);
+       OFFSET(BP_kernel_alignment, boot_params, hdr.kernel_alignment);
  
         BLANK();
         DEFINE(PAGE_SIZE_asm, PAGE_SIZE);
diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile

index 4e242f9a06e43a3bc479e40237997b3c0e1a53c4..3efcb2b96a150e361f9a1f66ccbfaef692b98432 100644 (file)
--- a/arch/x86/kernel/cpu/Makefile
+++ b/arch/x86/kernel/cpu/Makefile
@@ -1,5 +1,5 @@
  #
-# Makefile for x86-compatible CPU details and quirks
+# Makefile for x86-compatible CPU details, features and quirks
  #
  
  # Don't trace early stages of a secondary CPU boot
@@ -23,11 +23,13 @@ obj-$(CONFIG_CPU_SUP_CENTAUR)               += centaur.o
  obj-$(CONFIG_CPU_SUP_TRANSMETA_32)     += transmeta.o
  obj-$(CONFIG_CPU_SUP_UMC_32)           += umc.o
  
-obj-$(CONFIG_X86_MCE)  += mcheck/
-obj-$(CONFIG_MTRR)     += mtrr/
-obj-$(CONFIG_CPU_FREQ) += cpufreq/
+obj-$(CONFIG_PERF_COUNTERS)            += perf_counter.o
  
-obj-$(CONFIG_X86_LOCAL_APIC) += perfctr-watchdog.o
+obj-$(CONFIG_X86_MCE)                  += mcheck/
+obj-$(CONFIG_MTRR)                     += mtrr/
+obj-$(CONFIG_CPU_FREQ)                 += cpufreq/
+
+obj-$(CONFIG_X86_LOCAL_APIC)           += perfctr-watchdog.o
  
  quiet_cmd_mkcapflags = MKCAP   $@
        cmd_mkcapflags = $(PERL) $(srctree)/$(src)/mkcapflags.pl $< $@
diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c

index 7e4a459daa644f30309f50e9ca65ece4be85fad7..e5b27d8f1b47395fda285d867b776e82454ae61a 100644 (file)
--- a/arch/x86/kernel/cpu/amd.c
+++ b/arch/x86/kernel/cpu/amd.c
@@ -6,6 +6,7 @@
  #include <asm/processor.h>
  #include <asm/apic.h>
  #include <asm/cpu.h>
+#include <asm/pci-direct.h>
  
  #ifdef CONFIG_X86_64
  # include <asm/numa_64.h>
@@ -272,7 +273,7 @@ static void __cpuinit srat_detect_node(struct cpuinfo_x86 *c)
  #if defined(CONFIG_NUMA) && defined(CONFIG_X86_64)
         int cpu = smp_processor_id();
         int node;
-       unsigned apicid = hard_smp_processor_id();
+       unsigned apicid = cpu_has_apic ? hard_smp_processor_id() : c->apicid;
  
         node = c->phys_proc_id;
         if (apicid_to_node[apicid] != NUMA_NO_NODE)
@@ -351,6 +352,15 @@ static void __cpuinit early_init_amd(struct cpuinfo_x86 *c)
                     (c->x86_model == 8 && c->x86_mask >= 8))
                         set_cpu_cap(c, X86_FEATURE_K6_MTRR);
  #endif
+#if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_PCI)
+       /* check CPU config space for extended APIC ID */
+       if (c->x86 >= 0xf) {
+               unsigned int val;
+               val = read_pci_config(0, 24, 0, 0x68);
+               if ((val & ((1 << 17) | (1 << 18))) == ((1 << 17) | (1 << 18)))
+                       set_cpu_cap(c, X86_FEATURE_EXTD_APICID);
+       }
+#endif
  }
  
  static void __cpuinit init_amd(struct cpuinfo_x86 *c)
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c

index 77848d9fca6833fa88488adb6e22cae027a7d6c1..3ffdcfa9abdf07accfa466518b8ac1adf16a9c05 100644 (file)
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -13,6 +13,7 @@
  #include <linux/io.h>
  
  #include <asm/stackprotector.h>
+#include <asm/perf_counter.h>
  #include <asm/mmu_context.h>
  #include <asm/hypervisor.h>
  #include <asm/processor.h>
@@ -299,7 +300,8 @@ static const char *__cpuinit table_lookup_model(struct cpuinfo_x86 *c)
         return NULL;            /* Not found */
  }
  
-__u32 cleared_cpu_caps[NCAPINTS] __cpuinitdata;
+__u32 cpu_caps_cleared[NCAPINTS] __cpuinitdata;
+__u32 cpu_caps_set[NCAPINTS] __cpuinitdata;
  
  void load_percpu_segment(int cpu)
  {
@@ -768,6 +770,12 @@ static void __cpuinit identify_cpu(struct cpuinfo_x86 *c)
         if (this_cpu->c_identify)
                 this_cpu->c_identify(c);
  
+       /* Clear/Set all flags overriden by options, after probe */
+       for (i = 0; i < NCAPINTS; i++) {
+               c->x86_capability[i] &= ~cpu_caps_cleared[i];
+               c->x86_capability[i] |= cpu_caps_set[i];
+       }
+
  #ifdef CONFIG_X86_64
         c->apicid = apic->phys_pkg_id(c->initial_apicid, 0);
  #endif
@@ -813,6 +821,16 @@ static void __cpuinit identify_cpu(struct cpuinfo_x86 *c)
  #endif
  
         init_hypervisor(c);
+
+       /*
+        * Clear/Set all flags overriden by options, need do it
+        * before following smp all cpus cap AND.
+        */
+       for (i = 0; i < NCAPINTS; i++) {
+               c->x86_capability[i] &= ~cpu_caps_cleared[i];
+               c->x86_capability[i] |= cpu_caps_set[i];
+       }
+
         /*
          * On SMP, boot_cpu_data holds the common feature set between
          * all CPUs; so make sure that we indicate which features are
@@ -825,10 +843,6 @@ static void __cpuinit identify_cpu(struct cpuinfo_x86 *c)
                         boot_cpu_data.x86_capability[i] &= c->x86_capability[i];
         }
  
-       /* Clear all flags overriden by options */
-       for (i = 0; i < NCAPINTS; i++)
-               c->x86_capability[i] &= ~cleared_cpu_caps[i];
-
  #ifdef CONFIG_X86_MCE
         /* Init Machine Check Exception if available. */
         mcheck_init(c);
@@ -861,6 +875,7 @@ void __init identify_boot_cpu(void)
  #else
         vgetcpu_set_mode();
  #endif
+       init_hw_perf_counters();
  }
  
  void __cpuinit identify_secondary_cpu(struct cpuinfo_x86 *c)
diff --git a/arch/x86/kernel/cpu/cpu_debug.c b/arch/x86/kernel/cpu/cpu_debug.c

index 46e29ab96c6ac9d6765f0f1a89e7c043ed2f5cbd..6b2a52dd040398f36374926956875078c3e48f95 100644 (file)
--- a/arch/x86/kernel/cpu/cpu_debug.c
+++ b/arch/x86/kernel/cpu/cpu_debug.c
@@ -32,9 +32,7 @@
  
  static DEFINE_PER_CPU(struct cpu_cpuX_base, cpu_arr[CPU_REG_ALL_BIT]);
  static DEFINE_PER_CPU(struct cpu_private *, priv_arr[MAX_CPU_FILES]);
-static DEFINE_PER_CPU(unsigned, cpu_modelflag);
  static DEFINE_PER_CPU(int, cpu_priv_count);
-static DEFINE_PER_CPU(unsigned, cpu_model);
  
  static DEFINE_MUTEX(cpu_debug_lock);
  
@@ -80,302 +78,102 @@ static struct cpu_file_base cpu_file[] = {
         { "value",      CPU_REG_ALL,    1       },
  };
  
-/* Intel Registers Range */
-static struct cpu_debug_range cpu_intel_range[] = {
-       { 0x00000000, 0x00000001, CPU_MC,       CPU_INTEL_ALL           },
-       { 0x00000006, 0x00000007, CPU_MONITOR,  CPU_CX_AT_XE            },
-       { 0x00000010, 0x00000010, CPU_TIME,     CPU_INTEL_ALL           },
-       { 0x00000011, 0x00000013, CPU_PMC,      CPU_INTEL_PENTIUM       },
-       { 0x00000017, 0x00000017, CPU_PLATFORM, CPU_PX_CX_AT_XE         },
-       { 0x0000001B, 0x0000001B, CPU_APIC,     CPU_P6_CX_AT_XE         },
-
-       { 0x0000002A, 0x0000002A, CPU_POWERON,  CPU_PX_CX_AT_XE         },
-       { 0x0000002B, 0x0000002B, CPU_POWERON,  CPU_INTEL_XEON          },
-       { 0x0000002C, 0x0000002C, CPU_FREQ,     CPU_INTEL_XEON          },
-       { 0x0000003A, 0x0000003A, CPU_CONTROL,  CPU_CX_AT_XE            },
-
-       { 0x00000040, 0x00000043, CPU_LBRANCH,  CPU_PM_CX_AT_XE         },
-       { 0x00000044, 0x00000047, CPU_LBRANCH,  CPU_PM_CO_AT            },
-       { 0x00000060, 0x00000063, CPU_LBRANCH,  CPU_C2_AT               },
-       { 0x00000064, 0x00000067, CPU_LBRANCH,  CPU_INTEL_ATOM          },
-
-       { 0x00000079, 0x00000079, CPU_BIOS,     CPU_P6_CX_AT_XE         },
-       { 0x00000088, 0x0000008A, CPU_CACHE,    CPU_INTEL_P6            },
-       { 0x0000008B, 0x0000008B, CPU_BIOS,     CPU_P6_CX_AT_XE         },
-       { 0x0000009B, 0x0000009B, CPU_MONITOR,  CPU_INTEL_XEON          },
-
-       { 0x000000C1, 0x000000C2, CPU_PMC,      CPU_P6_CX_AT            },
-       { 0x000000CD, 0x000000CD, CPU_FREQ,     CPU_CX_AT               },
-       { 0x000000E7, 0x000000E8, CPU_PERF,     CPU_CX_AT               },
-       { 0x000000FE, 0x000000FE, CPU_MTRR,     CPU_P6_CX_XE            },
-
-       { 0x00000116, 0x00000116, CPU_CACHE,    CPU_INTEL_P6            },
-       { 0x00000118, 0x00000118, CPU_CACHE,    CPU_INTEL_P6            },
-       { 0x00000119, 0x00000119, CPU_CACHE,    CPU_INTEL_PX            },
-       { 0x0000011A, 0x0000011B, CPU_CACHE,    CPU_INTEL_P6            },
-       { 0x0000011E, 0x0000011E, CPU_CACHE,    CPU_PX_CX_AT            },
-
-       { 0x00000174, 0x00000176, CPU_SYSENTER, CPU_P6_CX_AT_XE         },
-       { 0x00000179, 0x0000017A, CPU_MC,       CPU_PX_CX_AT_XE         },
-       { 0x0000017B, 0x0000017B, CPU_MC,       CPU_P6_XE               },
-       { 0x00000186, 0x00000187, CPU_PMC,      CPU_P6_CX_AT            },
-       { 0x00000198, 0x00000199, CPU_PERF,     CPU_PM_CX_AT_XE         },
-       { 0x0000019A, 0x0000019A, CPU_TIME,     CPU_PM_CX_AT_XE         },
-       { 0x0000019B, 0x0000019D, CPU_THERM,    CPU_PM_CX_AT_XE         },
-       { 0x000001A0, 0x000001A0, CPU_MISC,     CPU_PM_CX_AT_XE         },
-
-       { 0x000001C9, 0x000001C9, CPU_LBRANCH,  CPU_PM_CX_AT            },
-       { 0x000001D7, 0x000001D8, CPU_LBRANCH,  CPU_INTEL_XEON          },
-       { 0x000001D9, 0x000001D9, CPU_DEBUG,    CPU_CX_AT_XE            },
-       { 0x000001DA, 0x000001DA, CPU_LBRANCH,  CPU_INTEL_XEON          },
-       { 0x000001DB, 0x000001DB, CPU_LBRANCH,  CPU_P6_XE               },
-       { 0x000001DC, 0x000001DC, CPU_LBRANCH,  CPU_INTEL_P6            },
-       { 0x000001DD, 0x000001DE, CPU_LBRANCH,  CPU_PX_CX_AT_XE         },
-       { 0x000001E0, 0x000001E0, CPU_LBRANCH,  CPU_INTEL_P6            },
-
-       { 0x00000200, 0x0000020F, CPU_MTRR,     CPU_P6_CX_XE            },
-       { 0x00000250, 0x00000250, CPU_MTRR,     CPU_P6_CX_XE            },
-       { 0x00000258, 0x00000259, CPU_MTRR,     CPU_P6_CX_XE            },
-       { 0x00000268, 0x0000026F, CPU_MTRR,     CPU_P6_CX_XE            },
-       { 0x00000277, 0x00000277, CPU_PAT,      CPU_C2_AT_XE            },
-       { 0x000002FF, 0x000002FF, CPU_MTRR,     CPU_P6_CX_XE            },
-
-       { 0x00000300, 0x00000308, CPU_PMC,      CPU_INTEL_XEON          },
-       { 0x00000309, 0x0000030B, CPU_PMC,      CPU_C2_AT_XE            },
-       { 0x0000030C, 0x00000311, CPU_PMC,      CPU_INTEL_XEON          },
-       { 0x00000345, 0x00000345, CPU_PMC,      CPU_C2_AT               },
-       { 0x00000360, 0x00000371, CPU_PMC,      CPU_INTEL_XEON          },
-       { 0x0000038D, 0x00000390, CPU_PMC,      CPU_C2_AT               },
-       { 0x000003A0, 0x000003BE, CPU_PMC,      CPU_INTEL_XEON          },
-       { 0x000003C0, 0x000003CD, CPU_PMC,      CPU_INTEL_XEON          },
-       { 0x000003E0, 0x000003E1, CPU_PMC,      CPU_INTEL_XEON          },
-       { 0x000003F0, 0x000003F0, CPU_PMC,      CPU_INTEL_XEON          },
-       { 0x000003F1, 0x000003F1, CPU_PMC,      CPU_C2_AT_XE            },
-       { 0x000003F2, 0x000003F2, CPU_PMC,      CPU_INTEL_XEON          },
-
-       { 0x00000400, 0x00000402, CPU_MC,       CPU_PM_CX_AT_XE         },
-       { 0x00000403, 0x00000403, CPU_MC,       CPU_INTEL_XEON          },
-       { 0x00000404, 0x00000406, CPU_MC,       CPU_PM_CX_AT_XE         },
-       { 0x00000407, 0x00000407, CPU_MC,       CPU_INTEL_XEON          },
-       { 0x00000408, 0x0000040A, CPU_MC,       CPU_PM_CX_AT_XE         },
-       { 0x0000040B, 0x0000040B, CPU_MC,       CPU_INTEL_XEON          },
-       { 0x0000040C, 0x0000040E, CPU_MC,       CPU_PM_CX_XE            },
-       { 0x0000040F, 0x0000040F, CPU_MC,       CPU_INTEL_XEON          },
-       { 0x00000410, 0x00000412, CPU_MC,       CPU_PM_CX_AT_XE         },
-       { 0x00000413, 0x00000417, CPU_MC,       CPU_CX_AT_XE            },
-       { 0x00000480, 0x0000048B, CPU_VMX,      CPU_CX_AT_XE            },
-
-       { 0x00000600, 0x00000600, CPU_DEBUG,    CPU_PM_CX_AT_XE         },
-       { 0x00000680, 0x0000068F, CPU_LBRANCH,  CPU_INTEL_XEON          },
-       { 0x000006C0, 0x000006CF, CPU_LBRANCH,  CPU_INTEL_XEON          },
-
-       { 0x000107CC, 0x000107D3, CPU_PMC,      CPU_INTEL_XEON_MP       },
-
-       { 0xC0000080, 0xC0000080, CPU_FEATURES, CPU_INTEL_XEON          },
-       { 0xC0000081, 0xC0000082, CPU_CALL,     CPU_INTEL_XEON          },
-       { 0xC0000084, 0xC0000084, CPU_CALL,     CPU_INTEL_XEON          },
-       { 0xC0000100, 0xC0000102, CPU_BASE,     CPU_INTEL_XEON          },
+/* CPU Registers Range */
+static struct cpu_debug_range cpu_reg_range[] = {
+       { 0x00000000, 0x00000001, CPU_MC,       },
+       { 0x00000006, 0x00000007, CPU_MONITOR,  },
+       { 0x00000010, 0x00000010, CPU_TIME,     },
+       { 0x00000011, 0x00000013, CPU_PMC,      },
+       { 0x00000017, 0x00000017, CPU_PLATFORM, },
+       { 0x0000001B, 0x0000001B, CPU_APIC,     },
+       { 0x0000002A, 0x0000002B, CPU_POWERON,  },
+       { 0x0000002C, 0x0000002C, CPU_FREQ,     },
+       { 0x0000003A, 0x0000003A, CPU_CONTROL,  },
+       { 0x00000040, 0x00000047, CPU_LBRANCH,  },
+       { 0x00000060, 0x00000067, CPU_LBRANCH,  },
+       { 0x00000079, 0x00000079, CPU_BIOS,     },
+       { 0x00000088, 0x0000008A, CPU_CACHE,    },
+       { 0x0000008B, 0x0000008B, CPU_BIOS,     },
+       { 0x0000009B, 0x0000009B, CPU_MONITOR,  },
+       { 0x000000C1, 0x000000C4, CPU_PMC,      },
+       { 0x000000CD, 0x000000CD, CPU_FREQ,     },
+       { 0x000000E7, 0x000000E8, CPU_PERF,     },
+       { 0x000000FE, 0x000000FE, CPU_MTRR,     },
+
+       { 0x00000116, 0x0000011E, CPU_CACHE,    },
+       { 0x00000174, 0x00000176, CPU_SYSENTER, },
+       { 0x00000179, 0x0000017B, CPU_MC,       },
+       { 0x00000186, 0x00000189, CPU_PMC,      },
+       { 0x00000198, 0x00000199, CPU_PERF,     },
+       { 0x0000019A, 0x0000019A, CPU_TIME,     },
+       { 0x0000019B, 0x0000019D, CPU_THERM,    },
+       { 0x000001A0, 0x000001A0, CPU_MISC,     },
+       { 0x000001C9, 0x000001C9, CPU_LBRANCH,  },
+       { 0x000001D7, 0x000001D8, CPU_LBRANCH,  },
+       { 0x000001D9, 0x000001D9, CPU_DEBUG,    },
+       { 0x000001DA, 0x000001E0, CPU_LBRANCH,  },
+
+       { 0x00000200, 0x0000020F, CPU_MTRR,     },
+       { 0x00000250, 0x00000250, CPU_MTRR,     },
+       { 0x00000258, 0x00000259, CPU_MTRR,     },
+       { 0x00000268, 0x0000026F, CPU_MTRR,     },
+       { 0x00000277, 0x00000277, CPU_PAT,      },
+       { 0x000002FF, 0x000002FF, CPU_MTRR,     },
+
+       { 0x00000300, 0x00000311, CPU_PMC,      },
+       { 0x00000345, 0x00000345, CPU_PMC,      },
+       { 0x00000360, 0x00000371, CPU_PMC,      },
+       { 0x0000038D, 0x00000390, CPU_PMC,      },
+       { 0x000003A0, 0x000003BE, CPU_PMC,      },
+       { 0x000003C0, 0x000003CD, CPU_PMC,      },
+       { 0x000003E0, 0x000003E1, CPU_PMC,      },
+       { 0x000003F0, 0x000003F2, CPU_PMC,      },
+
+       { 0x00000400, 0x00000417, CPU_MC,       },
+       { 0x00000480, 0x0000048B, CPU_VMX,      },
+
+       { 0x00000600, 0x00000600, CPU_DEBUG,    },
+       { 0x00000680, 0x0000068F, CPU_LBRANCH,  },
+       { 0x000006C0, 0x000006CF, CPU_LBRANCH,  },
+
+       { 0x000107CC, 0x000107D3, CPU_PMC,      },
+
+       { 0xC0000080, 0xC0000080, CPU_FEATURES, },
+       { 0xC0000081, 0xC0000084, CPU_CALL,     },
+       { 0xC0000100, 0xC0000102, CPU_BASE,     },
+       { 0xC0000103, 0xC0000103, CPU_TIME,     },
+
+       { 0xC0010000, 0xC0010007, CPU_PMC,      },
+       { 0xC0010010, 0xC0010010, CPU_CONF,     },
+       { 0xC0010015, 0xC0010015, CPU_CONF,     },
+       { 0xC0010016, 0xC001001A, CPU_MTRR,     },
+       { 0xC001001D, 0xC001001D, CPU_MTRR,     },
+       { 0xC001001F, 0xC001001F, CPU_CONF,     },
+       { 0xC0010030, 0xC0010035, CPU_BIOS,     },
+       { 0xC0010044, 0xC0010048, CPU_MC,       },
+       { 0xC0010050, 0xC0010056, CPU_SMM,      },
+       { 0xC0010058, 0xC0010058, CPU_CONF,     },
+       { 0xC0010060, 0xC0010060, CPU_CACHE,    },
+       { 0xC0010061, 0xC0010068, CPU_SMM,      },
+       { 0xC0010069, 0xC001006B, CPU_SMM,      },
+       { 0xC0010070, 0xC0010071, CPU_SMM,      },
+       { 0xC0010111, 0xC0010113, CPU_SMM,      },
+       { 0xC0010114, 0xC0010118, CPU_SVM,      },
+       { 0xC0010140, 0xC0010141, CPU_OSVM,     },
+       { 0xC0011022, 0xC0011023, CPU_CONF,     },
  };
  
-/* AMD Registers Range */
-static struct cpu_debug_range cpu_amd_range[] = {
-       { 0x00000000, 0x00000001, CPU_MC,       CPU_K10_PLUS,           },
-       { 0x00000010, 0x00000010, CPU_TIME,     CPU_K8_PLUS,            },
-       { 0x0000001B, 0x0000001B, CPU_APIC,     CPU_K8_PLUS,            },
-       { 0x0000002A, 0x0000002A, CPU_POWERON,  CPU_K7_PLUS             },
-       { 0x0000008B, 0x0000008B, CPU_VER,      CPU_K8_PLUS             },
-       { 0x000000FE, 0x000000FE, CPU_MTRR,     CPU_K8_PLUS,            },
-
-       { 0x00000174, 0x00000176, CPU_SYSENTER, CPU_K8_PLUS,            },
-       { 0x00000179, 0x0000017B, CPU_MC,       CPU_K8_PLUS,            },
-       { 0x000001D9, 0x000001D9, CPU_DEBUG,    CPU_K8_PLUS,            },
-       { 0x000001DB, 0x000001DE, CPU_LBRANCH,  CPU_K8_PLUS,            },
-
-       { 0x00000200, 0x0000020F, CPU_MTRR,     CPU_K8_PLUS,            },
-       { 0x00000250, 0x00000250, CPU_MTRR,     CPU_K8_PLUS,            },
-       { 0x00000258, 0x00000259, CPU_MTRR,     CPU_K8_PLUS,            },
-       { 0x00000268, 0x0000026F, CPU_MTRR,     CPU_K8_PLUS,            },
-       { 0x00000277, 0x00000277, CPU_PAT,      CPU_K8_PLUS,            },
-       { 0x000002FF, 0x000002FF, CPU_MTRR,     CPU_K8_PLUS,            },
-
-       { 0x00000400, 0x00000413, CPU_MC,       CPU_K8_PLUS,            },
-
-       { 0xC0000080, 0xC0000080, CPU_FEATURES, CPU_AMD_ALL,            },
-       { 0xC0000081, 0xC0000084, CPU_CALL,     CPU_K8_PLUS,            },
-       { 0xC0000100, 0xC0000102, CPU_BASE,     CPU_K8_PLUS,            },
-       { 0xC0000103, 0xC0000103, CPU_TIME,     CPU_K10_PLUS,           },
-
-       { 0xC0010000, 0xC0010007, CPU_PMC,      CPU_K8_PLUS,            },
-       { 0xC0010010, 0xC0010010, CPU_CONF,     CPU_K7_PLUS,            },
-       { 0xC0010015, 0xC0010015, CPU_CONF,     CPU_K7_PLUS,            },
-       { 0xC0010016, 0xC001001A, CPU_MTRR,     CPU_K8_PLUS,            },
-       { 0xC001001D, 0xC001001D, CPU_MTRR,     CPU_K8_PLUS,            },
-       { 0xC001001F, 0xC001001F, CPU_CONF,     CPU_K8_PLUS,            },
-       { 0xC0010030, 0xC0010035, CPU_BIOS,     CPU_K8_PLUS,            },
-       { 0xC0010044, 0xC0010048, CPU_MC,       CPU_K8_PLUS,            },
-       { 0xC0010050, 0xC0010056, CPU_SMM,      CPU_K0F_PLUS,           },
-       { 0xC0010058, 0xC0010058, CPU_CONF,     CPU_K10_PLUS,           },
-       { 0xC0010060, 0xC0010060, CPU_CACHE,    CPU_AMD_11,             },
-       { 0xC0010061, 0xC0010068, CPU_SMM,      CPU_K10_PLUS,           },
-       { 0xC0010069, 0xC001006B, CPU_SMM,      CPU_AMD_11,             },
-       { 0xC0010070, 0xC0010071, CPU_SMM,      CPU_K10_PLUS,           },
-       { 0xC0010111, 0xC0010113, CPU_SMM,      CPU_K8_PLUS,            },
-       { 0xC0010114, 0xC0010118, CPU_SVM,      CPU_K10_PLUS,           },
-       { 0xC0010140, 0xC0010141, CPU_OSVM,     CPU_K10_PLUS,           },
-       { 0xC0011022, 0xC0011023, CPU_CONF,     CPU_K10_PLUS,           },
-};
-
-
-/* Intel */
-static int get_intel_modelflag(unsigned model)
-{
-       int flag;
-
-       switch (model) {
-       case 0x0501:
-       case 0x0502:
-       case 0x0504:
-               flag = CPU_INTEL_PENTIUM;
-               break;
-       case 0x0601:
-       case 0x0603:
-       case 0x0605:
-       case 0x0607:
-       case 0x0608:
-       case 0x060A:
-       case 0x060B:
-               flag = CPU_INTEL_P6;
-               break;
-       case 0x0609:
-       case 0x060D:
-               flag = CPU_INTEL_PENTIUM_M;
-               break;
-       case 0x060E:
-               flag = CPU_INTEL_CORE;
-               break;
-       case 0x060F:
-       case 0x0617:
-               flag = CPU_INTEL_CORE2;
-               break;
-       case 0x061C:
-               flag = CPU_INTEL_ATOM;
-               break;
-       case 0x0F00:
-       case 0x0F01:
-       case 0x0F02:
-       case 0x0F03:
-       case 0x0F04:
-               flag = CPU_INTEL_XEON_P4;
-               break;
-       case 0x0F06:
-               flag = CPU_INTEL_XEON_MP;
-               break;
-       default:
-               flag = CPU_NONE;
-               break;
-       }
-
-       return flag;
-}
-
-/* AMD */
-static int get_amd_modelflag(unsigned model)
-{
-       int flag;
-
-       switch (model >> 8) {
-       case 0x6:
-               flag = CPU_AMD_K6;
-               break;
-       case 0x7:
-               flag = CPU_AMD_K7;
-               break;
-       case 0x8:
-               flag = CPU_AMD_K8;
-               break;
-       case 0xf:
-               flag = CPU_AMD_0F;
-               break;
-       case 0x10:
-               flag = CPU_AMD_10;
-               break;
-       case 0x11:
-               flag = CPU_AMD_11;
-               break;
-       default:
-               flag = CPU_NONE;
-               break;
-       }
-
-       return flag;
-}
-
-static int get_cpu_modelflag(unsigned cpu)
-{
-       int flag;
-
-       flag = per_cpu(cpu_model, cpu);
-
-       switch (flag >> 16) {
-       case X86_VENDOR_INTEL:
-               flag = get_intel_modelflag(flag);
-               break;
-       case X86_VENDOR_AMD:
-               flag = get_amd_modelflag(flag & 0xffff);
-               break;
-       default:
-               flag = CPU_NONE;
-               break;
-       }
-
-       return flag;
-}
-
-static int get_cpu_range_count(unsigned cpu)
-{
-       int index;
-
-       switch (per_cpu(cpu_model, cpu) >> 16) {
-       case X86_VENDOR_INTEL:
-               index = ARRAY_SIZE(cpu_intel_range);
-               break;
-       case X86_VENDOR_AMD:
-               index = ARRAY_SIZE(cpu_amd_range);
-               break;
-       default:
-               index = 0;
-               break;
-       }
-
-       return index;
-}
-
  static int is_typeflag_valid(unsigned cpu, unsigned flag)
  {
-       unsigned vendor, modelflag;
-       int i, index;
+       int i;
  
         /* Standard Registers should be always valid */
         if (flag >= CPU_TSS)
                 return 1;
  
-       modelflag = per_cpu(cpu_modelflag, cpu);
-       vendor = per_cpu(cpu_model, cpu) >> 16;
-       index = get_cpu_range_count(cpu);
-
-       for (i = 0; i < index; i++) {
-               switch (vendor) {
-               case X86_VENDOR_INTEL:
-                       if ((cpu_intel_range[i].model & modelflag) &&
-                           (cpu_intel_range[i].flag & flag))
-                               return 1;
-                       break;
-               case X86_VENDOR_AMD:
-                       if ((cpu_amd_range[i].model & modelflag) &&
-                           (cpu_amd_range[i].flag & flag))
-                               return 1;
-                       break;
-               }
+       for (i = 0; i < ARRAY_SIZE(cpu_reg_range); i++) {
+               if (cpu_reg_range[i].flag == flag)
+                       return 1;
         }
  
         /* Invalid */
@@ -385,26 +183,11 @@ static int is_typeflag_valid(unsigned cpu, unsigned flag)
  static unsigned get_cpu_range(unsigned cpu, unsigned *min, unsigned *max,
                               int index, unsigned flag)
  {
-       unsigned modelflag;
-
-       modelflag = per_cpu(cpu_modelflag, cpu);
-       *max = 0;
-       switch (per_cpu(cpu_model, cpu) >> 16) {
-       case X86_VENDOR_INTEL:
-               if ((cpu_intel_range[index].model & modelflag) &&
-                   (cpu_intel_range[index].flag & flag)) {
-                       *min = cpu_intel_range[index].min;
-                       *max = cpu_intel_range[index].max;
-               }
-               break;
-       case X86_VENDOR_AMD:
-               if ((cpu_amd_range[index].model & modelflag) &&
-                   (cpu_amd_range[index].flag & flag)) {
-                       *min = cpu_amd_range[index].min;
-                       *max = cpu_amd_range[index].max;
-               }
-               break;
-       }
+       if (cpu_reg_range[index].flag == flag) {
+               *min = cpu_reg_range[index].min;
+               *max = cpu_reg_range[index].max;
+       } else
+               *max = 0;
  
         return *max;
  }
@@ -434,7 +217,7 @@ static void print_msr(struct seq_file *seq, unsigned cpu, unsigned flag)
         unsigned msr, msr_min, msr_max;
         struct cpu_private *priv;
         u32 low, high;
-       int i, range;
+       int i;
  
         if (seq) {
                 priv = seq->private;
@@ -446,9 +229,7 @@ static void print_msr(struct seq_file *seq, unsigned cpu, unsigned flag)
                 }
         }
  
-       range = get_cpu_range_count(cpu);
-
-       for (i = 0; i < range; i++) {
+       for (i = 0; i < ARRAY_SIZE(cpu_reg_range); i++) {
                 if (!get_cpu_range(cpu, &msr_min, &msr_max, i, flag))
                         continue;
  
@@ -588,8 +369,20 @@ static void print_apic(void *arg)
         seq_printf(seq, " TMICT\t\t: %08x\n",  apic_read(APIC_TMICT));
         seq_printf(seq, " TMCCT\t\t: %08x\n",  apic_read(APIC_TMCCT));
         seq_printf(seq, " TDCR\t\t: %08x\n",  apic_read(APIC_TDCR));
-#endif /* CONFIG_X86_LOCAL_APIC */
+       if (boot_cpu_has(X86_FEATURE_EXTAPIC)) {
+               unsigned int i, v, maxeilvt;
+
+               v = apic_read(APIC_EFEAT);
+               maxeilvt = (v >> 16) & 0xff;
+               seq_printf(seq, " EFEAT\t\t: %08x\n", v);
+               seq_printf(seq, " ECTRL\t\t: %08x\n", apic_read(APIC_ECTRL));
  
+               for (i = 0; i < maxeilvt; i++) {
+                       v = apic_read(APIC_EILVTn(i));
+                       seq_printf(seq, " EILVT%d\t\t: %08x\n", i, v);
+               }
+       }
+#endif /* CONFIG_X86_LOCAL_APIC */
         seq_printf(seq, "\n MSR\t:\n");
  }
  
@@ -788,13 +581,11 @@ static int cpu_init_msr(unsigned cpu, unsigned type, struct dentry *dentry)
  {
         struct dentry *cpu_dentry = NULL;
         unsigned reg, reg_min, reg_max;
-       int i, range, err = 0;
+       int i, err = 0;
         char reg_dir[12];
         u32 low, high;
  
-       range = get_cpu_range_count(cpu);
-
-       for (i = 0; i < range; i++) {
+       for (i = 0; i < ARRAY_SIZE(cpu_reg_range); i++) {
                 if (!get_cpu_range(cpu, &reg_min, &reg_max, i,
                                    cpu_base[type].flag))
                         continue;
@@ -850,10 +641,6 @@ static int cpu_init_cpu(void)
                 cpui = &cpu_data(cpu);
                 if (!cpu_has(cpui, X86_FEATURE_MSR))
                         continue;
-               per_cpu(cpu_model, cpu) = ((cpui->x86_vendor << 16) |
-                                          (cpui->x86 << 8) |
-                                          (cpui->x86_model));
-               per_cpu(cpu_modelflag, cpu) = get_cpu_modelflag(cpu);
  
                 sprintf(cpu_dir, "cpu%d", cpu);
                 cpu_dentry = debugfs_create_dir(cpu_dir, cpu_debugfs_dir);
diff --git a/arch/x86/kernel/cpu/cpufreq/Kconfig b/arch/x86/kernel/cpu/cpufreq/Kconfig

index 52c839875478571711df3d861ffc1a3d4c5eb413..f138c6c389b921628575575f6a2b0c115441f6c0 100644 (file)
--- a/arch/x86/kernel/cpu/cpufreq/Kconfig
+++ b/arch/x86/kernel/cpu/cpufreq/Kconfig
@@ -220,11 +220,14 @@ config X86_LONGHAUL
           If in doubt, say N.
  
  config X86_E_POWERSAVER
-       tristate "VIA C7 Enhanced PowerSaver"
+       tristate "VIA C7 Enhanced PowerSaver (DANGEROUS)"
         select CPU_FREQ_TABLE
-       depends on X86_32
+       depends on X86_32 && EXPERIMENTAL
         help
-         This adds the CPUFreq driver for VIA C7 processors.
+         This adds the CPUFreq driver for VIA C7 processors.  However, this driver
+         does not have any safeguards to prevent operating the CPU out of spec
+         and is thus considered dangerous.  Please use the regular ACPI cpufreq
+         driver, enabled by CONFIG_X86_ACPI_CPUFREQ.
  
           If in doubt, say N.
  
diff --git a/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c b/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c

index 752e8c6b2c7e29e903895a4abe541b6e186b7add..ae9b503220cafa62e818afd4bfcf3b849a86c4b3 100644 (file)
--- a/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c
+++ b/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c
@@ -90,11 +90,7 @@ static int check_est_cpu(unsigned int cpuid)
  {
         struct cpuinfo_x86 *cpu = &cpu_data(cpuid);
  
-       if (cpu->x86_vendor != X86_VENDOR_INTEL ||
-           !cpu_has(cpu, X86_FEATURE_EST))
-               return 0;
-
-       return 1;
+       return cpu_has(cpu, X86_FEATURE_EST);
  }
  
  static unsigned extract_io(u32 value, struct acpi_cpufreq_data *data)
diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c

index 7437fa133c02dfdde137ca7201fb9734888a85bc..daed39ba2614dbcad31e4725b343d56d0912f05d 100644 (file)
--- a/arch/x86/kernel/cpu/intel.c
+++ b/arch/x86/kernel/cpu/intel.c
@@ -229,12 +229,12 @@ static void __cpuinit intel_workarounds(struct cpuinfo_x86 *c)
  }
  #endif
  
-static void __cpuinit srat_detect_node(void)
+static void __cpuinit srat_detect_node(struct cpuinfo_x86 *c)
  {
  #if defined(CONFIG_NUMA) && defined(CONFIG_X86_64)
         unsigned node;
         int cpu = smp_processor_id();
-       int apicid = hard_smp_processor_id();
+       int apicid = cpu_has_apic ? hard_smp_processor_id() : c->apicid;
  
         /* Don't do the funky fallback heuristics the AMD version employs
            for now. */
@@ -400,7 +400,7 @@ static void __cpuinit init_intel(struct cpuinfo_x86 *c)
         }
  
         /* Work around errata */
-       srat_detect_node();
+       srat_detect_node(c);
  
         if (cpu_has(c, X86_FEATURE_VMX))
                 detect_vmx_virtcap(c);
diff --git a/arch/x86/kernel/cpu/intel_cacheinfo.c b/arch/x86/kernel/cpu/intel_cacheinfo.c

index 483eda96e102062b23f3e29820d911a9c7d6ab59..789efe217e1ab89a8862df2387a980d2cca9a60a 100644 (file)
--- a/arch/x86/kernel/cpu/intel_cacheinfo.c
+++ b/arch/x86/kernel/cpu/intel_cacheinfo.c
@@ -17,6 +17,7 @@
  
  #include <asm/processor.h>
  #include <asm/smp.h>
+#include <asm/k8.h>
  
  #define LVL_1_INST     1
  #define LVL_1_DATA     2
@@ -159,14 +160,6 @@ struct _cpuid4_info_regs {
         unsigned long can_disable;
  };
  
-#if defined(CONFIG_PCI) && defined(CONFIG_SYSFS)
-static struct pci_device_id k8_nb_id[] = {
-       { PCI_DEVICE(PCI_VENDOR_ID_AMD, 0x1103) },
-       { PCI_DEVICE(PCI_VENDOR_ID_AMD, 0x1203) },
-       {}
-};
-#endif
-
  unsigned short                 num_cache_leaves;
  
  /* AMD doesn't have CPUID4. Emulate it here to report the same
@@ -207,10 +200,17 @@ union l3_cache {
  };
  
  static const unsigned short __cpuinitconst assocs[] = {
-       [1] = 1, [2] = 2, [4] = 4, [6] = 8,
-       [8] = 16, [0xa] = 32, [0xb] = 48,
+       [1] = 1,
+       [2] = 2,
+       [4] = 4,
+       [6] = 8,
+       [8] = 16,
+       [0xa] = 32,
+       [0xb] = 48,
         [0xc] = 64,
-       [0xf] = 0xffff // ??
+       [0xd] = 96,
+       [0xe] = 128,
+       [0xf] = 0xffff /* fully associative - no way to show this currently */
  };
  
  static const unsigned char __cpuinitconst levels[] = { 1, 1, 2, 3 };
@@ -271,7 +271,8 @@ amd_cpuid4(int leaf, union _cpuid4_leaf_eax *eax,
         eax->split.type = types[leaf];
         eax->split.level = levels[leaf];
         if (leaf == 3)
-               eax->split.num_threads_sharing = current_cpu_data.x86_max_cores - 1;
+               eax->split.num_threads_sharing =
+                       current_cpu_data.x86_max_cores - 1;
         else
                 eax->split.num_threads_sharing = 0;
         eax->split.num_cores_on_die = current_cpu_data.x86_max_cores - 1;
@@ -291,6 +292,14 @@ amd_check_l3_disable(int index, struct _cpuid4_info_regs *this_leaf)
  {
         if (index < 3)
                 return;
+
+       if (boot_cpu_data.x86 == 0x11)
+               return;
+
+       /* see erratum #382 */
+       if ((boot_cpu_data.x86 == 0x10) && (boot_cpu_data.x86_model < 0x8))
+               return;
+
         this_leaf->can_disable = 1;
  }
  
@@ -696,97 +705,75 @@ static ssize_t show_type(struct _cpuid4_info *this_leaf, char *buf)
  #define to_object(k)   container_of(k, struct _index_kobject, kobj)
  #define to_attr(a)     container_of(a, struct _cache_attr, attr)
  
-#ifdef CONFIG_PCI
-static struct pci_dev *get_k8_northbridge(int node)
-{
-       struct pci_dev *dev = NULL;
-       int i;
-
-       for (i = 0; i <= node; i++) {
-               do {
-                       dev = pci_get_device(PCI_ANY_ID, PCI_ANY_ID, dev);
-                       if (!dev)
-                               break;
-               } while (!pci_match_id(&k8_nb_id[0], dev));
-               if (!dev)
-                       break;
-       }
-       return dev;
-}
-#else
-static struct pci_dev *get_k8_northbridge(int node)
-{
-       return NULL;
-}
-#endif
-
-static ssize_t show_cache_disable(struct _cpuid4_info *this_leaf, char *buf)
+static ssize_t show_cache_disable(struct _cpuid4_info *this_leaf, char *buf,
+                                 unsigned int index)
  {
-       const struct cpumask *mask = to_cpumask(this_leaf->shared_cpu_map);
-       int node = cpu_to_node(cpumask_first(mask));
-       struct pci_dev *dev = NULL;
-       ssize_t ret = 0;
-       int i;
+       int cpu = cpumask_first(to_cpumask(this_leaf->shared_cpu_map));
+       int node = cpu_to_node(cpu);
+       struct pci_dev *dev = node_to_k8_nb_misc(node);
+       unsigned int reg = 0;
  
         if (!this_leaf->can_disable)
-               return sprintf(buf, "Feature not enabled\n");
-
-       dev = get_k8_northbridge(node);
-       if (!dev) {
-               printk(KERN_ERR "Attempting AMD northbridge operation on a system with no northbridge\n");
                 return -EINVAL;
-       }
  
-       for (i = 0; i < 2; i++) {
-               unsigned int reg;
+       if (!dev)
+               return -EINVAL;
  
-               pci_read_config_dword(dev, 0x1BC + i * 4, &reg);
+       pci_read_config_dword(dev, 0x1BC + index * 4, &reg);
+       return sprintf(buf, "%x\n", reg);
+}
  
-               ret += sprintf(buf, "%sEntry: %d\n", buf, i);
-               ret += sprintf(buf, "%sReads:  %s\tNew Entries: %s\n",  
-                       buf,
-                       reg & 0x80000000 ? "Disabled" : "Allowed",
-                       reg & 0x40000000 ? "Disabled" : "Allowed");
-               ret += sprintf(buf, "%sSubCache: %x\tIndex: %x\n",
-                       buf, (reg & 0x30000) >> 16, reg & 0xfff);
-       }
-       return ret;
+#define SHOW_CACHE_DISABLE(index)                                      \
+static ssize_t                                                         \
+show_cache_disable_##index(struct _cpuid4_info *this_leaf, char *buf)          \
+{                                                                      \
+       return show_cache_disable(this_leaf, buf, index);               \
  }
+SHOW_CACHE_DISABLE(0)
+SHOW_CACHE_DISABLE(1)
  
-static ssize_t
-store_cache_disable(struct _cpuid4_info *this_leaf, const char *buf,
-                   size_t count)
+static ssize_t store_cache_disable(struct _cpuid4_info *this_leaf,
+       const char *buf, size_t count, unsigned int index)
  {
-       const struct cpumask *mask = to_cpumask(this_leaf->shared_cpu_map);
-       int node = cpu_to_node(cpumask_first(mask));
-       struct pci_dev *dev = NULL;
-       unsigned int ret, index, val;
+       int cpu = cpumask_first(to_cpumask(this_leaf->shared_cpu_map));
+       int node = cpu_to_node(cpu);
+       struct pci_dev *dev = node_to_k8_nb_misc(node);
+       unsigned long val = 0;
+       unsigned int scrubber = 0;
  
         if (!this_leaf->can_disable)
-               return 0;
-
-       if (strlen(buf) > 15)
                 return -EINVAL;
  
-       ret = sscanf(buf, "%x %x", &index, &val);
-       if (ret != 2)
+       if (!capable(CAP_SYS_ADMIN))
+               return -EPERM;
+
+       if (!dev)
                 return -EINVAL;
-       if (index > 1)
+
+       if (strict_strtoul(buf, 10, &val) < 0)
                 return -EINVAL;
  
         val |= 0xc0000000;
-       dev = get_k8_northbridge(node);
-       if (!dev) {
-               printk(KERN_ERR "Attempting AMD northbridge operation on a system with no northbridge\n");
-               return -EINVAL;
-       }
+
+       pci_read_config_dword(dev, 0x58, &scrubber);
+       scrubber &= ~0x1f000000;
+       pci_write_config_dword(dev, 0x58, scrubber);
  
         pci_write_config_dword(dev, 0x1BC + index * 4, val & ~0x40000000);
         wbinvd();
         pci_write_config_dword(dev, 0x1BC + index * 4, val);
+       return count;
+}
  
-       return 1;
+#define STORE_CACHE_DISABLE(index)                                     \
+static ssize_t                                                         \
+store_cache_disable_##index(struct _cpuid4_info *this_leaf,            \
+                           const char *buf, size_t count)              \
+{                                                                      \
+       return store_cache_disable(this_leaf, buf, count, index);       \
  }
+STORE_CACHE_DISABLE(0)
+STORE_CACHE_DISABLE(1)
  
  struct _cache_attr {
         struct attribute attr;
@@ -808,7 +795,10 @@ define_one_ro(size);
  define_one_ro(shared_cpu_map);
  define_one_ro(shared_cpu_list);
  
-static struct _cache_attr cache_disable = __ATTR(cache_disable, 0644, show_cache_disable, store_cache_disable);
+static struct _cache_attr cache_disable_0 = __ATTR(cache_disable_0, 0644,
+               show_cache_disable_0, store_cache_disable_0);
+static struct _cache_attr cache_disable_1 = __ATTR(cache_disable_1, 0644,
+               show_cache_disable_1, store_cache_disable_1);
  
  static struct attribute * default_attrs[] = {
         &type.attr,
@@ -820,7 +810,8 @@ static struct attribute * default_attrs[] = {
         &size.attr,
         &shared_cpu_map.attr,
         &shared_cpu_list.attr,
-       &cache_disable.attr,
+       &cache_disable_0.attr,
+       &cache_disable_1.attr,
         NULL
  };
  
diff --git a/arch/x86/kernel/cpu/mcheck/mce_intel_64.c b/arch/x86/kernel/cpu/mcheck/mce_intel_64.c

index cef3ee30744b9964c9503f1ffd137c7d5c9aaf69..65a0fceedcd77ce7ed0088a5b24b0432caebd990 100644 (file)
--- a/arch/x86/kernel/cpu/mcheck/mce_intel_64.c
+++ b/arch/x86/kernel/cpu/mcheck/mce_intel_64.c
@@ -15,7 +15,6 @@
  #include <asm/hw_irq.h>
  #include <asm/idle.h>
  #include <asm/therm_throt.h>
-#include <asm/apic.h>
  
  asmlinkage void smp_thermal_interrupt(void)
  {
diff --git a/arch/x86/kernel/cpu/mtrr/cleanup.c b/arch/x86/kernel/cpu/mtrr/cleanup.c

index ce0fe4b5c04f6cfef4c14109124d55bd13f90cbe..1d584a18a50dab20fbe35e21e10859db05f2fe39 100644 (file)
--- a/arch/x86/kernel/cpu/mtrr/cleanup.c
+++ b/arch/x86/kernel/cpu/mtrr/cleanup.c
@@ -808,7 +808,7 @@ int __init mtrr_cleanup(unsigned address_bits)
  
         if (!is_cpu(INTEL) || enable_mtrr_cleanup < 1)
                 return 0;
-       rdmsr(MTRRdefType_MSR, def, dummy);
+       rdmsr(MSR_MTRRdefType, def, dummy);
         def &= 0xff;
         if (def != MTRR_TYPE_UNCACHABLE)
                 return 0;
@@ -1003,7 +1003,7 @@ int __init mtrr_trim_uncached_memory(unsigned long end_pfn)
          */
         if (!is_cpu(INTEL) || disable_mtrr_trim)
                 return 0;
-       rdmsr(MTRRdefType_MSR, def, dummy);
+       rdmsr(MSR_MTRRdefType, def, dummy);
         def &= 0xff;
         if (def != MTRR_TYPE_UNCACHABLE)
                 return 0;
diff --git a/arch/x86/kernel/cpu/mtrr/generic.c b/arch/x86/kernel/cpu/mtrr/generic.c

index d21d4fb161f70f43e91d229e94706567fbf3954e..0543f69f0b270c4ec6a4edf9f603df9f8228b0ea 100644 (file)
--- a/arch/x86/kernel/cpu/mtrr/generic.c
+++ b/arch/x86/kernel/cpu/mtrr/generic.c
@@ -20,9 +20,9 @@ struct fixed_range_block {
  };
  
  static struct fixed_range_block fixed_range_blocks[] = {
-       { MTRRfix64K_00000_MSR, 1 }, /* one  64k MTRR  */
-       { MTRRfix16K_80000_MSR, 2 }, /* two  16k MTRRs */
-       { MTRRfix4K_C0000_MSR,  8 }, /* eight 4k MTRRs */
+       { MSR_MTRRfix64K_00000, 1 }, /* one  64k MTRR  */
+       { MSR_MTRRfix16K_80000, 2 }, /* two  16k MTRRs */
+       { MSR_MTRRfix4K_C0000,  8 }, /* eight 4k MTRRs */
         {}
  };
  
@@ -194,12 +194,12 @@ get_fixed_ranges(mtrr_type * frs)
  
         k8_check_syscfg_dram_mod_en();
  
-       rdmsr(MTRRfix64K_00000_MSR, p[0], p[1]);
+       rdmsr(MSR_MTRRfix64K_00000, p[0], p[1]);
  
         for (i = 0; i < 2; i++)
-               rdmsr(MTRRfix16K_80000_MSR + i, p[2 + i * 2], p[3 + i * 2]);
+               rdmsr(MSR_MTRRfix16K_80000 + i, p[2 + i * 2], p[3 + i * 2]);
         for (i = 0; i < 8; i++)
-               rdmsr(MTRRfix4K_C0000_MSR + i, p[6 + i * 2], p[7 + i * 2]);
+               rdmsr(MSR_MTRRfix4K_C0000 + i, p[6 + i * 2], p[7 + i * 2]);
  }
  
  void mtrr_save_fixed_ranges(void *info)
@@ -310,7 +310,7 @@ void __init get_mtrr_state(void)
  
         vrs = mtrr_state.var_ranges;
  
-       rdmsr(MTRRcap_MSR, lo, dummy);
+       rdmsr(MSR_MTRRcap, lo, dummy);
         mtrr_state.have_fixed = (lo >> 8) & 1;
  
         for (i = 0; i < num_var_ranges; i++)
@@ -318,7 +318,7 @@ void __init get_mtrr_state(void)
         if (mtrr_state.have_fixed)
                 get_fixed_ranges(mtrr_state.fixed_ranges);
  
-       rdmsr(MTRRdefType_MSR, lo, dummy);
+       rdmsr(MSR_MTRRdefType, lo, dummy);
         mtrr_state.def_type = (lo & 0xff);
         mtrr_state.enabled = (lo & 0xc00) >> 10;
  
@@ -583,10 +583,10 @@ static void prepare_set(void) __acquires(set_atomicity_lock)
         __flush_tlb();
  
         /*  Save MTRR state */
-       rdmsr(MTRRdefType_MSR, deftype_lo, deftype_hi);
+       rdmsr(MSR_MTRRdefType, deftype_lo, deftype_hi);
  
         /*  Disable MTRRs, and set the default type to uncached  */
-       mtrr_wrmsr(MTRRdefType_MSR, deftype_lo & ~0xcff, deftype_hi);
+       mtrr_wrmsr(MSR_MTRRdefType, deftype_lo & ~0xcff, deftype_hi);
  }
  
  static void post_set(void) __releases(set_atomicity_lock)
@@ -595,7 +595,7 @@ static void post_set(void) __releases(set_atomicity_lock)
         __flush_tlb();
  
         /* Intel (P6) standard MTRRs */
-       mtrr_wrmsr(MTRRdefType_MSR, deftype_lo, deftype_hi);
+       mtrr_wrmsr(MSR_MTRRdefType, deftype_lo, deftype_hi);
                 
         /*  Enable caches  */
         write_cr0(read_cr0() & 0xbfffffff);
@@ -707,7 +707,7 @@ int generic_validate_add_page(unsigned long base, unsigned long size, unsigned i
  static int generic_have_wrcomb(void)
  {
         unsigned long config, dummy;
-       rdmsr(MTRRcap_MSR, config, dummy);
+       rdmsr(MSR_MTRRcap, config, dummy);
         return (config & (1 << 10));
  }
  
diff --git a/arch/x86/kernel/cpu/mtrr/main.c b/arch/x86/kernel/cpu/mtrr/main.c

index 03cda01f57c7125173a8b348d92105d5bcd5dd28..8fc248b5aeafe26a0f822350bf9488685b9eab4e 100644 (file)
--- a/arch/x86/kernel/cpu/mtrr/main.c
+++ b/arch/x86/kernel/cpu/mtrr/main.c
@@ -104,7 +104,7 @@ static void __init set_num_var_ranges(void)
         unsigned long config = 0, dummy;
  
         if (use_intel()) {
-               rdmsr(MTRRcap_MSR, config, dummy);
+               rdmsr(MSR_MTRRcap, config, dummy);
         } else if (is_cpu(AMD))
                 config = 2;
         else if (is_cpu(CYRIX) || is_cpu(CENTAUR))
diff --git a/arch/x86/kernel/cpu/mtrr/mtrr.h b/arch/x86/kernel/cpu/mtrr/mtrr.h

index 77f67f7b347a3a97a093b1094c5da86a2ca9ad35..7538b767f2060ed4727bb992f665b87e629b1bc5 100644 (file)
--- a/arch/x86/kernel/cpu/mtrr/mtrr.h
+++ b/arch/x86/kernel/cpu/mtrr/mtrr.h
@@ -5,21 +5,6 @@
  #include <linux/types.h>
  #include <linux/stddef.h>
  
-#define MTRRcap_MSR     0x0fe
-#define MTRRdefType_MSR 0x2ff
-
-#define MTRRfix64K_00000_MSR 0x250
-#define MTRRfix16K_80000_MSR 0x258
-#define MTRRfix16K_A0000_MSR 0x259
-#define MTRRfix4K_C0000_MSR 0x268
-#define MTRRfix4K_C8000_MSR 0x269
-#define MTRRfix4K_D0000_MSR 0x26a
-#define MTRRfix4K_D8000_MSR 0x26b
-#define MTRRfix4K_E0000_MSR 0x26c
-#define MTRRfix4K_E8000_MSR 0x26d
-#define MTRRfix4K_F0000_MSR 0x26e
-#define MTRRfix4K_F8000_MSR 0x26f
-
  #define MTRR_CHANGE_MASK_FIXED     0x01
  #define MTRR_CHANGE_MASK_VARIABLE  0x02
  #define MTRR_CHANGE_MASK_DEFTYPE   0x04
diff --git a/arch/x86/kernel/cpu/mtrr/state.c b/arch/x86/kernel/cpu/mtrr/state.c

index 7f7e2753685bce875ccd6dca8903474e366456cc..1f5fb1588d1fb4df0bbaefc4196b23a6056140aa 100644 (file)
--- a/arch/x86/kernel/cpu/mtrr/state.c
+++ b/arch/x86/kernel/cpu/mtrr/state.c
@@ -35,7 +35,7 @@ void set_mtrr_prepare_save(struct set_mtrr_context *ctxt)
  
                 if (use_intel())
                         /*  Save MTRR state */
-                       rdmsr(MTRRdefType_MSR, ctxt->deftype_lo, ctxt->deftype_hi);
+                       rdmsr(MSR_MTRRdefType, ctxt->deftype_lo, ctxt->deftype_hi);
                 else
                         /* Cyrix ARRs - everything else were excluded at the top */
                         ctxt->ccr3 = getCx86(CX86_CCR3);
@@ -46,7 +46,7 @@ void set_mtrr_cache_disable(struct set_mtrr_context *ctxt)
  {
         if (use_intel())
                 /*  Disable MTRRs, and set the default type to uncached  */
-               mtrr_wrmsr(MTRRdefType_MSR, ctxt->deftype_lo & 0xf300UL,
+               mtrr_wrmsr(MSR_MTRRdefType, ctxt->deftype_lo & 0xf300UL,
                       ctxt->deftype_hi);
         else if (is_cpu(CYRIX))
                 /* Cyrix ARRs - everything else were excluded at the top */
@@ -64,7 +64,7 @@ void set_mtrr_done(struct set_mtrr_context *ctxt)
                 /*  Restore MTRRdefType  */
                 if (use_intel())
                         /* Intel (P6) standard MTRRs */
-                       mtrr_wrmsr(MTRRdefType_MSR, ctxt->deftype_lo, ctxt->deftype_hi);
+                       mtrr_wrmsr(MSR_MTRRdefType, ctxt->deftype_lo, ctxt->deftype_hi);
                 else
                         /* Cyrix ARRs - everything else was excluded at the top */
                         setCx86(CX86_CCR3, ctxt->ccr3);
diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c

new file mode 100644 (file)

index 0000000..895c82e
--- /dev/null
+++ b/arch/x86/kernel/cpu/perf_counter.c
@@ -0,0 +1,1704 @@
+/*
+ * Performance counter x86 architecture code
+ *
+ *  Copyright (C) 2008 Thomas Gleixner <tglx@linutronix.de>
+ *  Copyright (C) 2008-2009 Red Hat, Inc., Ingo Molnar
+ *  Copyright (C) 2009 Jaswinder Singh Rajput
+ *  Copyright (C) 2009 Advanced Micro Devices, Inc., Robert Richter
+ *  Copyright (C) 2008-2009 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com>
+ *
+ *  For licencing details see kernel-base/COPYING
+ */
+
+#include <linux/perf_counter.h>
+#include <linux/capability.h>
+#include <linux/notifier.h>
+#include <linux/hardirq.h>
+#include <linux/kprobes.h>
+#include <linux/module.h>
+#include <linux/kdebug.h>
+#include <linux/sched.h>
+#include <linux/uaccess.h>
+
+#include <asm/apic.h>
+#include <asm/stacktrace.h>
+#include <asm/nmi.h>
+
+static u64 perf_counter_mask __read_mostly;
+
+struct cpu_hw_counters {
+       struct perf_counter     *counters[X86_PMC_IDX_MAX];
+       unsigned long           used_mask[BITS_TO_LONGS(X86_PMC_IDX_MAX)];
+       unsigned long           active_mask[BITS_TO_LONGS(X86_PMC_IDX_MAX)];
+       unsigned long           interrupts;
+       int                     enabled;
+};
+
+/*
+ * struct x86_pmu - generic x86 pmu
+ */
+struct x86_pmu {
+       const char      *name;
+       int             version;
+       int             (*handle_irq)(struct pt_regs *);
+       void            (*disable_all)(void);
+       void            (*enable_all)(void);
+       void            (*enable)(struct hw_perf_counter *, int);
+       void            (*disable)(struct hw_perf_counter *, int);
+       unsigned        eventsel;
+       unsigned        perfctr;
+       u64             (*event_map)(int);
+       u64             (*raw_event)(u64);
+       int             max_events;
+       int             num_counters;
+       int             num_counters_fixed;
+       int             counter_bits;
+       u64             counter_mask;
+       u64             max_period;
+       u64             intel_ctrl;
+};
+
+static struct x86_pmu x86_pmu __read_mostly;
+
+static DEFINE_PER_CPU(struct cpu_hw_counters, cpu_hw_counters) = {
+       .enabled = 1,
+};
+
+/*
+ * Intel PerfMon v3. Used on Core2 and later.
+ */
+static const u64 intel_perfmon_event_map[] =
+{
+  [PERF_COUNT_HW_CPU_CYCLES]           = 0x003c,
+  [PERF_COUNT_HW_INSTRUCTIONS]         = 0x00c0,
+  [PERF_COUNT_HW_CACHE_REFERENCES]     = 0x4f2e,
+  [PERF_COUNT_HW_CACHE_MISSES]         = 0x412e,
+  [PERF_COUNT_HW_BRANCH_INSTRUCTIONS]  = 0x00c4,
+  [PERF_COUNT_HW_BRANCH_MISSES]                = 0x00c5,
+  [PERF_COUNT_HW_BUS_CYCLES]           = 0x013c,
+};
+
+static u64 intel_pmu_event_map(int event)
+{
+       return intel_perfmon_event_map[event];
+}
+
+/*
+ * Generalized hw caching related event table, filled
+ * in on a per model basis. A value of 0 means
+ * 'not supported', -1 means 'event makes no sense on
+ * this CPU', any other value means the raw event
+ * ID.
+ */
+
+#define C(x) PERF_COUNT_HW_CACHE_##x
+
+static u64 __read_mostly hw_cache_event_ids
+                               [PERF_COUNT_HW_CACHE_MAX]
+                               [PERF_COUNT_HW_CACHE_OP_MAX]
+                               [PERF_COUNT_HW_CACHE_RESULT_MAX];
+
+static const u64 nehalem_hw_cache_event_ids
+                               [PERF_COUNT_HW_CACHE_MAX]
+                               [PERF_COUNT_HW_CACHE_OP_MAX]
+                               [PERF_COUNT_HW_CACHE_RESULT_MAX] =
+{
+ [ C(L1D) ] = {
+       [ C(OP_READ) ] = {
+               [ C(RESULT_ACCESS) ] = 0x0f40, /* L1D_CACHE_LD.MESI            */
+               [ C(RESULT_MISS)   ] = 0x0140, /* L1D_CACHE_LD.I_STATE         */
+       },
+       [ C(OP_WRITE) ] = {
+               [ C(RESULT_ACCESS) ] = 0x0f41, /* L1D_CACHE_ST.MESI            */
+               [ C(RESULT_MISS)   ] = 0x0141, /* L1D_CACHE_ST.I_STATE         */
+       },
+       [ C(OP_PREFETCH) ] = {
+               [ C(RESULT_ACCESS) ] = 0x014e, /* L1D_PREFETCH.REQUESTS        */
+               [ C(RESULT_MISS)   ] = 0x024e, /* L1D_PREFETCH.MISS            */
+       },
+ },
+ [ C(L1I ) ] = {
+       [ C(OP_READ) ] = {
+               [ C(RESULT_ACCESS) ] = 0x0380, /* L1I.READS                    */
+               [ C(RESULT_MISS)   ] = 0x0280, /* L1I.MISSES                   */
+       },
+       [ C(OP_WRITE) ] = {
+               [ C(RESULT_ACCESS) ] = -1,
+               [ C(RESULT_MISS)   ] = -1,
+       },
+       [ C(OP_PREFETCH) ] = {
+               [ C(RESULT_ACCESS) ] = 0x0,
+               [ C(RESULT_MISS)   ] = 0x0,
+       },
+ },
+ [ C(LL  ) ] = {
+       [ C(OP_READ) ] = {
+               [ C(RESULT_ACCESS) ] = 0x0324, /* L2_RQSTS.LOADS               */
+               [ C(RESULT_MISS)   ] = 0x0224, /* L2_RQSTS.LD_MISS             */
+       },
+       [ C(OP_WRITE) ] = {
+               [ C(RESULT_ACCESS) ] = 0x0c24, /* L2_RQSTS.RFOS                */
+               [ C(RESULT_MISS)   ] = 0x0824, /* L2_RQSTS.RFO_MISS            */
+       },
+       [ C(OP_PREFETCH) ] = {
+               [ C(RESULT_ACCESS) ] = 0x4f2e, /* LLC Reference                */
+               [ C(RESULT_MISS)   ] = 0x412e, /* LLC Misses                   */
+       },
+ },
+ [ C(DTLB) ] = {
+       [ C(OP_READ) ] = {
+               [ C(RESULT_ACCESS) ] = 0x0f40, /* L1D_CACHE_LD.MESI   (alias)  */
+               [ C(RESULT_MISS)   ] = 0x0108, /* DTLB_LOAD_MISSES.ANY         */
+       },
+       [ C(OP_WRITE) ] = {
+               [ C(RESULT_ACCESS) ] = 0x0f41, /* L1D_CACHE_ST.MESI   (alias)  */
+               [ C(RESULT_MISS)   ] = 0x010c, /* MEM_STORE_RETIRED.DTLB_MISS  */
+       },
+       [ C(OP_PREFETCH) ] = {
+               [ C(RESULT_ACCESS) ] = 0x0,
+               [ C(RESULT_MISS)   ] = 0x0,
+       },
+ },
+ [ C(ITLB) ] = {
+       [ C(OP_READ) ] = {
+               [ C(RESULT_ACCESS) ] = 0x01c0, /* INST_RETIRED.ANY_P           */
+               [ C(RESULT_MISS)   ] = 0x20c8, /* ITLB_MISS_RETIRED            */
+       },
+       [ C(OP_WRITE) ] = {
+               [ C(RESULT_ACCESS) ] = -1,
+               [ C(RESULT_MISS)   ] = -1,
+       },
+       [ C(OP_PREFETCH) ] = {
+               [ C(RESULT_ACCESS) ] = -1,
+               [ C(RESULT_MISS)   ] = -1,
+       },
+ },
+ [ C(BPU ) ] = {
+       [ C(OP_READ) ] = {
+               [ C(RESULT_ACCESS) ] = 0x00c4, /* BR_INST_RETIRED.ALL_BRANCHES */
+               [ C(RESULT_MISS)   ] = 0x03e8, /* BPU_CLEARS.ANY               */
+       },
+       [ C(OP_WRITE) ] = {
+               [ C(RESULT_ACCESS) ] = -1,
+               [ C(RESULT_MISS)   ] = -1,
+       },
+       [ C(OP_PREFETCH) ] = {
+               [ C(RESULT_ACCESS) ] = -1,
+               [ C(RESULT_MISS)   ] = -1,
+       },
+ },
+};
+
+static const u64 core2_hw_cache_event_ids
+                               [PERF_COUNT_HW_CACHE_MAX]
+                               [PERF_COUNT_HW_CACHE_OP_MAX]
+                               [PERF_COUNT_HW_CACHE_RESULT_MAX] =
+{
+ [ C(L1D) ] = {
+       [ C(OP_READ) ] = {
+               [ C(RESULT_ACCESS) ] = 0x0f40, /* L1D_CACHE_LD.MESI          */
+               [ C(RESULT_MISS)   ] = 0x0140, /* L1D_CACHE_LD.I_STATE       */
+       },
+       [ C(OP_WRITE) ] = {
+               [ C(RESULT_ACCESS) ] = 0x0f41, /* L1D_CACHE_ST.MESI          */
+               [ C(RESULT_MISS)   ] = 0x0141, /* L1D_CACHE_ST.I_STATE       */
+       },
+       [ C(OP_PREFETCH) ] = {
+               [ C(RESULT_ACCESS) ] = 0x104e, /* L1D_PREFETCH.REQUESTS      */
+               [ C(RESULT_MISS)   ] = 0,
+       },
+ },
+ [ C(L1I ) ] = {
+       [ C(OP_READ) ] = {
+               [ C(RESULT_ACCESS) ] = 0x0080, /* L1I.READS                  */
+               [ C(RESULT_MISS)   ] = 0x0081, /* L1I.MISSES                 */
+       },
+       [ C(OP_WRITE) ] = {
+               [ C(RESULT_ACCESS) ] = -1,
+               [ C(RESULT_MISS)   ] = -1,
+       },
+       [ C(OP_PREFETCH) ] = {
+               [ C(RESULT_ACCESS) ] = 0,
+               [ C(RESULT_MISS)   ] = 0,
+       },
+ },
+ [ C(LL  ) ] = {
+       [ C(OP_READ) ] = {
+               [ C(RESULT_ACCESS) ] = 0x4f29, /* L2_LD.MESI                 */
+               [ C(RESULT_MISS)   ] = 0x4129, /* L2_LD.ISTATE               */
+       },
+       [ C(OP_WRITE) ] = {
+               [ C(RESULT_ACCESS) ] = 0x4f2A, /* L2_ST.MESI                 */
+               [ C(RESULT_MISS)   ] = 0x412A, /* L2_ST.ISTATE               */
+       },
+       [ C(OP_PREFETCH) ] = {
+               [ C(RESULT_ACCESS) ] = 0,
+               [ C(RESULT_MISS)   ] = 0,
+       },
+ },
+ [ C(DTLB) ] = {
+       [ C(OP_READ) ] = {
+               [ C(RESULT_ACCESS) ] = 0x0f40, /* L1D_CACHE_LD.MESI  (alias) */
+               [ C(RESULT_MISS)   ] = 0x0208, /* DTLB_MISSES.MISS_LD        */
+       },
+       [ C(OP_WRITE) ] = {
+               [ C(RESULT_ACCESS) ] = 0x0f41, /* L1D_CACHE_ST.MESI  (alias) */
+               [ C(RESULT_MISS)   ] = 0x0808, /* DTLB_MISSES.MISS_ST        */
+       },
+       [ C(OP_PREFETCH) ] = {
+               [ C(RESULT_ACCESS) ] = 0,
+               [ C(RESULT_MISS)   ] = 0,
+       },
+ },
+ [ C(ITLB) ] = {
+       [ C(OP_READ) ] = {
+               [ C(RESULT_ACCESS) ] = 0x00c0, /* INST_RETIRED.ANY_P         */
+               [ C(RESULT_MISS)   ] = 0x1282, /* ITLBMISSES                 */
+       },
+       [ C(OP_WRITE) ] = {
+               [ C(RESULT_ACCESS) ] = -1,
+               [ C(RESULT_MISS)   ] = -1,
+       },
+       [ C(OP_PREFETCH) ] = {
+               [ C(RESULT_ACCESS) ] = -1,
+               [ C(RESULT_MISS)   ] = -1,
+       },
+ },
+ [ C(BPU ) ] = {
+       [ C(OP_READ) ] = {
+               [ C(RESULT_ACCESS) ] = 0x00c4, /* BR_INST_RETIRED.ANY        */
+               [ C(RESULT_MISS)   ] = 0x00c5, /* BP_INST_RETIRED.MISPRED    */
+       },
+       [ C(OP_WRITE) ] = {
+               [ C(RESULT_ACCESS) ] = -1,
+               [ C(RESULT_MISS)   ] = -1,
+       },
+       [ C(OP_PREFETCH) ] = {
+               [ C(RESULT_ACCESS) ] = -1,
+               [ C(RESULT_MISS)   ] = -1,
+       },
+ },
+};
+
+static const u64 atom_hw_cache_event_ids
+                               [PERF_COUNT_HW_CACHE_MAX]
+                               [PERF_COUNT_HW_CACHE_OP_MAX]
+                               [PERF_COUNT_HW_CACHE_RESULT_MAX] =
+{
+ [ C(L1D) ] = {
+       [ C(OP_READ) ] = {
+               [ C(RESULT_ACCESS) ] = 0x2140, /* L1D_CACHE.LD               */
+               [ C(RESULT_MISS)   ] = 0,
+       },
+       [ C(OP_WRITE) ] = {
+               [ C(RESULT_ACCESS) ] = 0x2240, /* L1D_CACHE.ST               */
+               [ C(RESULT_MISS)   ] = 0,
+       },
+       [ C(OP_PREFETCH) ] = {
+               [ C(RESULT_ACCESS) ] = 0x0,
+               [ C(RESULT_MISS)   ] = 0,
+       },
+ },
+ [ C(L1I ) ] = {
+       [ C(OP_READ) ] = {
+               [ C(RESULT_ACCESS) ] = 0x0380, /* L1I.READS                  */
+               [ C(RESULT_MISS)   ] = 0x0280, /* L1I.MISSES                 */
+       },
+       [ C(OP_WRITE) ] = {
+               [ C(RESULT_ACCESS) ] = -1,
+               [ C(RESULT_MISS)   ] = -1,
+       },
+       [ C(OP_PREFETCH) ] = {
+               [ C(RESULT_ACCESS) ] = 0,
+               [ C(RESULT_MISS)   ] = 0,
+       },
+ },
+ [ C(LL  ) ] = {
+       [ C(OP_READ) ] = {
+               [ C(RESULT_ACCESS) ] = 0x4f29, /* L2_LD.MESI                 */
+               [ C(RESULT_MISS)   ] = 0x4129, /* L2_LD.ISTATE               */
+       },
+       [ C(OP_WRITE) ] = {
+               [ C(RESULT_ACCESS) ] = 0x4f2A, /* L2_ST.MESI                 */
+               [ C(RESULT_MISS)   ] = 0x412A, /* L2_ST.ISTATE               */
+       },
+       [ C(OP_PREFETCH) ] = {
+               [ C(RESULT_ACCESS) ] = 0,
+               [ C(RESULT_MISS)   ] = 0,
+       },
+ },
+ [ C(DTLB) ] = {
+       [ C(OP_READ) ] = {
+               [ C(RESULT_ACCESS) ] = 0x2140, /* L1D_CACHE_LD.MESI  (alias) */
+               [ C(RESULT_MISS)   ] = 0x0508, /* DTLB_MISSES.MISS_LD        */
+       },
+       [ C(OP_WRITE) ] = {
+               [ C(RESULT_ACCESS) ] = 0x2240, /* L1D_CACHE_ST.MESI  (alias) */
+               [ C(RESULT_MISS)   ] = 0x0608, /* DTLB_MISSES.MISS_ST        */
+       },
+       [ C(OP_PREFETCH) ] = {
+               [ C(RESULT_ACCESS) ] = 0,
+               [ C(RESULT_MISS)   ] = 0,
+       },
+ },
+ [ C(ITLB) ] = {
+       [ C(OP_READ) ] = {
+               [ C(RESULT_ACCESS) ] = 0x00c0, /* INST_RETIRED.ANY_P         */
+               [ C(RESULT_MISS)   ] = 0x0282, /* ITLB.MISSES                */
+       },
+       [ C(OP_WRITE) ] = {
+               [ C(RESULT_ACCESS) ] = -1,
+               [ C(RESULT_MISS)   ] = -1,
+       },
+       [ C(OP_PREFETCH) ] = {
+               [ C(RESULT_ACCESS) ] = -1,
+               [ C(RESULT_MISS)   ] = -1,
+       },
+ },
+ [ C(BPU ) ] = {
+       [ C(OP_READ) ] = {
+               [ C(RESULT_ACCESS) ] = 0x00c4, /* BR_INST_RETIRED.ANY        */
+               [ C(RESULT_MISS)   ] = 0x00c5, /* BP_INST_RETIRED.MISPRED    */
+       },
+       [ C(OP_WRITE) ] = {
+               [ C(RESULT_ACCESS) ] = -1,
+               [ C(RESULT_MISS)   ] = -1,
+       },
+       [ C(OP_PREFETCH) ] = {
+               [ C(RESULT_ACCESS) ] = -1,
+               [ C(RESULT_MISS)   ] = -1,
+       },
+ },
+};
+
+static u64 intel_pmu_raw_event(u64 event)
+{
+#define CORE_EVNTSEL_EVENT_MASK                0x000000FFULL
+#define CORE_EVNTSEL_UNIT_MASK         0x0000FF00ULL
+#define CORE_EVNTSEL_EDGE_MASK         0x00040000ULL
+#define CORE_EVNTSEL_INV_MASK          0x00800000ULL
+#define CORE_EVNTSEL_COUNTER_MASK      0xFF000000ULL
+
+#define CORE_EVNTSEL_MASK              \
+       (CORE_EVNTSEL_EVENT_MASK |      \
+        CORE_EVNTSEL_UNIT_MASK  |      \
+        CORE_EVNTSEL_EDGE_MASK  |      \
+        CORE_EVNTSEL_INV_MASK  |       \
+        CORE_EVNTSEL_COUNTER_MASK)
+
+       return event & CORE_EVNTSEL_MASK;
+}
+
+static const u64 amd_0f_hw_cache_event_ids
+                               [PERF_COUNT_HW_CACHE_MAX]
+                               [PERF_COUNT_HW_CACHE_OP_MAX]
+                               [PERF_COUNT_HW_CACHE_RESULT_MAX] =
+{
+ [ C(L1D) ] = {
+       [ C(OP_READ) ] = {
+               [ C(RESULT_ACCESS) ] = 0,
+               [ C(RESULT_MISS)   ] = 0,
+       },
+       [ C(OP_WRITE) ] = {
+               [ C(RESULT_ACCESS) ] = 0,
+               [ C(RESULT_MISS)   ] = 0,
+       },
+       [ C(OP_PREFETCH) ] = {
+               [ C(RESULT_ACCESS) ] = 0,
+               [ C(RESULT_MISS)   ] = 0,
+       },
+ },
+ [ C(L1I ) ] = {
+       [ C(OP_READ) ] = {
+               [ C(RESULT_ACCESS) ] = 0x0080, /* Instruction cache fetches  */
+               [ C(RESULT_MISS)   ] = 0x0081, /* Instruction cache misses   */
+       },
+       [ C(OP_WRITE) ] = {
+               [ C(RESULT_ACCESS) ] = -1,
+               [ C(RESULT_MISS)   ] = -1,
+       },
+       [ C(OP_PREFETCH) ] = {
+               [ C(RESULT_ACCESS) ] = 0,
+               [ C(RESULT_MISS)   ] = 0,
+       },
+ },
+ [ C(LL  ) ] = {
+       [ C(OP_READ) ] = {
+               [ C(RESULT_ACCESS) ] = 0,
+               [ C(RESULT_MISS)   ] = 0,
+       },
+       [ C(OP_WRITE) ] = {
+               [ C(RESULT_ACCESS) ] = 0,
+               [ C(RESULT_MISS)   ] = 0,
+       },
+       [ C(OP_PREFETCH) ] = {
+               [ C(RESULT_ACCESS) ] = 0,
+               [ C(RESULT_MISS)   ] = 0,
+       },
+ },
+ [ C(DTLB) ] = {
+       [ C(OP_READ) ] = {
+               [ C(RESULT_ACCESS) ] = 0,
+               [ C(RESULT_MISS)   ] = 0,
+       },
+       [ C(OP_WRITE) ] = {
+               [ C(RESULT_ACCESS) ] = 0,
+               [ C(RESULT_MISS)   ] = 0,
+       },
+       [ C(OP_PREFETCH) ] = {
+               [ C(RESULT_ACCESS) ] = 0,
+               [ C(RESULT_MISS)   ] = 0,
+       },
+ },
+ [ C(ITLB) ] = {
+       [ C(OP_READ) ] = {
+               [ C(RESULT_ACCESS) ] = 0x0080, /* Instruction fecthes        */
+               [ C(RESULT_MISS)   ] = 0x0085, /* Instr. fetch ITLB misses   */
+       },
+       [ C(OP_WRITE) ] = {
+               [ C(RESULT_ACCESS) ] = -1,
+               [ C(RESULT_MISS)   ] = -1,
+       },
+       [ C(OP_PREFETCH) ] = {
+               [ C(RESULT_ACCESS) ] = -1,
+               [ C(RESULT_MISS)   ] = -1,
+       },
+ },
+ [ C(BPU ) ] = {
+       [ C(OP_READ) ] = {
+               [ C(RESULT_ACCESS) ] = 0x00c2, /* Retired Branch Instr.      */
+               [ C(RESULT_MISS)   ] = 0x00c3, /* Retired Mispredicted BI    */
+       },
+       [ C(OP_WRITE) ] = {
+               [ C(RESULT_ACCESS) ] = -1,
+               [ C(RESULT_MISS)   ] = -1,
+       },
+       [ C(OP_PREFETCH) ] = {
+               [ C(RESULT_ACCESS) ] = -1,
+               [ C(RESULT_MISS)   ] = -1,
+       },
+ },
+};
+
+/*
+ * AMD Performance Monitor K7 and later.
+ */
+static const u64 amd_perfmon_event_map[] =
+{
+  [PERF_COUNT_HW_CPU_CYCLES]           = 0x0076,
+  [PERF_COUNT_HW_INSTRUCTIONS]         = 0x00c0,
+  [PERF_COUNT_HW_CACHE_REFERENCES]     = 0x0080,
+  [PERF_COUNT_HW_CACHE_MISSES]         = 0x0081,
+  [PERF_COUNT_HW_BRANCH_INSTRUCTIONS]  = 0x00c4,
+  [PERF_COUNT_HW_BRANCH_MISSES]                = 0x00c5,
+};
+
+static u64 amd_pmu_event_map(int event)
+{
+       return amd_perfmon_event_map[event];
+}
+
+static u64 amd_pmu_raw_event(u64 event)
+{
+#define K7_EVNTSEL_EVENT_MASK  0x7000000FFULL
+#define K7_EVNTSEL_UNIT_MASK   0x00000FF00ULL
+#define K7_EVNTSEL_EDGE_MASK   0x000040000ULL
+#define K7_EVNTSEL_INV_MASK    0x000800000ULL
+#define K7_EVNTSEL_COUNTER_MASK        0x0FF000000ULL
+
+#define K7_EVNTSEL_MASK                        \
+       (K7_EVNTSEL_EVENT_MASK |        \
+        K7_EVNTSEL_UNIT_MASK  |        \
+        K7_EVNTSEL_EDGE_MASK  |        \
+        K7_EVNTSEL_INV_MASK   |        \
+        K7_EVNTSEL_COUNTER_MASK)
+
+       return event & K7_EVNTSEL_MASK;
+}
+
+/*
+ * Propagate counter elapsed time into the generic counter.
+ * Can only be executed on the CPU where the counter is active.
+ * Returns the delta events processed.
+ */
+static u64
+x86_perf_counter_update(struct perf_counter *counter,
+                       struct hw_perf_counter *hwc, int idx)
+{
+       int shift = 64 - x86_pmu.counter_bits;
+       u64 prev_raw_count, new_raw_count;
+       s64 delta;
+
+       /*
+        * Careful: an NMI might modify the previous counter value.
+        *
+        * Our tactic to handle this is to first atomically read and
+        * exchange a new raw count - then add that new-prev delta
+        * count to the generic counter atomically:
+        */
+again:
+       prev_raw_count = atomic64_read(&hwc->prev_count);
+       rdmsrl(hwc->counter_base + idx, new_raw_count);
+
+       if (atomic64_cmpxchg(&hwc->prev_count, prev_raw_count,
+                                       new_raw_count) != prev_raw_count)
+               goto again;
+
+       /*
+        * Now we have the new raw value and have updated the prev
+        * timestamp already. We can now calculate the elapsed delta
+        * (counter-)time and add that to the generic counter.
+        *
+        * Careful, not all hw sign-extends above the physical width
+        * of the count.
+        */
+       delta = (new_raw_count << shift) - (prev_raw_count << shift);
+       delta >>= shift;
+
+       atomic64_add(delta, &counter->count);
+       atomic64_sub(delta, &hwc->period_left);
+
+       return new_raw_count;
+}
+
+static atomic_t active_counters;
+static DEFINE_MUTEX(pmc_reserve_mutex);
+
+static bool reserve_pmc_hardware(void)
+{
+       int i;
+
+       if (nmi_watchdog == NMI_LOCAL_APIC)
+               disable_lapic_nmi_watchdog();
+
+       for (i = 0; i < x86_pmu.num_counters; i++) {
+               if (!reserve_perfctr_nmi(x86_pmu.perfctr + i))
+                       goto perfctr_fail;
+       }
+
+       for (i = 0; i < x86_pmu.num_counters; i++) {
+               if (!reserve_evntsel_nmi(x86_pmu.eventsel + i))
+                       goto eventsel_fail;
+       }
+
+       return true;
+
+eventsel_fail:
+       for (i--; i >= 0; i--)
+               release_evntsel_nmi(x86_pmu.eventsel + i);
+
+       i = x86_pmu.num_counters;
+
+perfctr_fail:
+       for (i--; i >= 0; i--)
+               release_perfctr_nmi(x86_pmu.perfctr + i);
+
+       if (nmi_watchdog == NMI_LOCAL_APIC)
+               enable_lapic_nmi_watchdog();
+
+       return false;
+}
+
+static void release_pmc_hardware(void)
+{
+       int i;
+
+       for (i = 0; i < x86_pmu.num_counters; i++) {
+               release_perfctr_nmi(x86_pmu.perfctr + i);
+               release_evntsel_nmi(x86_pmu.eventsel + i);
+       }
+
+       if (nmi_watchdog == NMI_LOCAL_APIC)
+               enable_lapic_nmi_watchdog();
+}
+
+static void hw_perf_counter_destroy(struct perf_counter *counter)
+{
+       if (atomic_dec_and_mutex_lock(&active_counters, &pmc_reserve_mutex)) {
+               release_pmc_hardware();
+               mutex_unlock(&pmc_reserve_mutex);
+       }
+}
+
+static inline int x86_pmu_initialized(void)
+{
+       return x86_pmu.handle_irq != NULL;
+}
+
+static inline int
+set_ext_hw_attr(struct hw_perf_counter *hwc, struct perf_counter_attr *attr)
+{
+       unsigned int cache_type, cache_op, cache_result;
+       u64 config, val;
+
+       config = attr->config;
+
+       cache_type = (config >>  0) & 0xff;
+       if (cache_type >= PERF_COUNT_HW_CACHE_MAX)
+               return -EINVAL;
+
+       cache_op = (config >>  8) & 0xff;
+       if (cache_op >= PERF_COUNT_HW_CACHE_OP_MAX)
+               return -EINVAL;
+
+       cache_result = (config >> 16) & 0xff;
+       if (cache_result >= PERF_COUNT_HW_CACHE_RESULT_MAX)
+               return -EINVAL;
+
+       val = hw_cache_event_ids[cache_type][cache_op][cache_result];
+
+       if (val == 0)
+               return -ENOENT;
+
+       if (val == -1)
+               return -EINVAL;
+
+       hwc->config |= val;
+
+       return 0;
+}
+
+/*
+ * Setup the hardware configuration for a given attr_type
+ */
+static int __hw_perf_counter_init(struct perf_counter *counter)
+{
+       struct perf_counter_attr *attr = &counter->attr;
+       struct hw_perf_counter *hwc = &counter->hw;
+       int err;
+
+       if (!x86_pmu_initialized())
+               return -ENODEV;
+
+       err = 0;
+       if (!atomic_inc_not_zero(&active_counters)) {
+               mutex_lock(&pmc_reserve_mutex);
+               if (atomic_read(&active_counters) == 0 && !reserve_pmc_hardware())
+                       err = -EBUSY;
+               else
+                       atomic_inc(&active_counters);
+               mutex_unlock(&pmc_reserve_mutex);
+       }
+       if (err)
+               return err;
+
+       /*
+        * Generate PMC IRQs:
+        * (keep 'enabled' bit clear for now)
+        */
+       hwc->config = ARCH_PERFMON_EVENTSEL_INT;
+
+       /*
+        * Count user and OS events unless requested not to.
+        */
+       if (!attr->exclude_user)
+               hwc->config |= ARCH_PERFMON_EVENTSEL_USR;
+       if (!attr->exclude_kernel)
+               hwc->config |= ARCH_PERFMON_EVENTSEL_OS;
+
+       if (!hwc->sample_period) {
+               hwc->sample_period = x86_pmu.max_period;
+               hwc->last_period = hwc->sample_period;
+               atomic64_set(&hwc->period_left, hwc->sample_period);
+       }
+
+       counter->destroy = hw_perf_counter_destroy;
+
+       /*
+        * Raw event type provide the config in the event structure
+        */
+       if (attr->type == PERF_TYPE_RAW) {
+               hwc->config |= x86_pmu.raw_event(attr->config);
+               return 0;
+       }
+
+       if (attr->type == PERF_TYPE_HW_CACHE)
+               return set_ext_hw_attr(hwc, attr);
+
+       if (attr->config >= x86_pmu.max_events)
+               return -EINVAL;
+       /*
+        * The generic map:
+        */
+       hwc->config |= x86_pmu.event_map(attr->config);
+
+       return 0;
+}
+
+static void intel_pmu_disable_all(void)
+{
+       wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, 0);
+}
+
+static void amd_pmu_disable_all(void)
+{
+       struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters);
+       int idx;
+
+       if (!cpuc->enabled)
+               return;
+
+       cpuc->enabled = 0;
+       /*
+        * ensure we write the disable before we start disabling the
+        * counters proper, so that amd_pmu_enable_counter() does the
+        * right thing.
+        */
+       barrier();
+
+       for (idx = 0; idx < x86_pmu.num_counters; idx++) {
+               u64 val;
+
+               if (!test_bit(idx, cpuc->active_mask))
+                       continue;
+               rdmsrl(MSR_K7_EVNTSEL0 + idx, val);
+               if (!(val & ARCH_PERFMON_EVENTSEL0_ENABLE))
+                       continue;
+               val &= ~ARCH_PERFMON_EVENTSEL0_ENABLE;
+               wrmsrl(MSR_K7_EVNTSEL0 + idx, val);
+       }
+}
+
+void hw_perf_disable(void)
+{
+       if (!x86_pmu_initialized())
+               return;
+       return x86_pmu.disable_all();
+}
+
+static void intel_pmu_enable_all(void)
+{
+       wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, x86_pmu.intel_ctrl);
+}
+
+static void amd_pmu_enable_all(void)
+{
+       struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters);
+       int idx;
+
+       if (cpuc->enabled)
+               return;
+
+       cpuc->enabled = 1;
+       barrier();
+
+       for (idx = 0; idx < x86_pmu.num_counters; idx++) {
+               u64 val;
+
+               if (!test_bit(idx, cpuc->active_mask))
+                       continue;
+               rdmsrl(MSR_K7_EVNTSEL0 + idx, val);
+               if (val & ARCH_PERFMON_EVENTSEL0_ENABLE)
+                       continue;
+               val |= ARCH_PERFMON_EVENTSEL0_ENABLE;
+               wrmsrl(MSR_K7_EVNTSEL0 + idx, val);
+       }
+}
+
+void hw_perf_enable(void)
+{
+       if (!x86_pmu_initialized())
+               return;
+       x86_pmu.enable_all();
+}
+
+static inline u64 intel_pmu_get_status(void)
+{
+       u64 status;
+
+       rdmsrl(MSR_CORE_PERF_GLOBAL_STATUS, status);
+
+       return status;
+}
+
+static inline void intel_pmu_ack_status(u64 ack)
+{
+       wrmsrl(MSR_CORE_PERF_GLOBAL_OVF_CTRL, ack);
+}
+
+static inline void x86_pmu_enable_counter(struct hw_perf_counter *hwc, int idx)
+{
+       int err;
+       err = checking_wrmsrl(hwc->config_base + idx,
+                             hwc->config | ARCH_PERFMON_EVENTSEL0_ENABLE);
+}
+
+static inline void x86_pmu_disable_counter(struct hw_perf_counter *hwc, int idx)
+{
+       int err;
+       err = checking_wrmsrl(hwc->config_base + idx,
+                             hwc->config);
+}
+
+static inline void
+intel_pmu_disable_fixed(struct hw_perf_counter *hwc, int __idx)
+{
+       int idx = __idx - X86_PMC_IDX_FIXED;
+       u64 ctrl_val, mask;
+       int err;
+
+       mask = 0xfULL << (idx * 4);
+
+       rdmsrl(hwc->config_base, ctrl_val);
+       ctrl_val &= ~mask;
+       err = checking_wrmsrl(hwc->config_base, ctrl_val);
+}
+
+static inline void
+intel_pmu_disable_counter(struct hw_perf_counter *hwc, int idx)
+{
+       if (unlikely(hwc->config_base == MSR_ARCH_PERFMON_FIXED_CTR_CTRL)) {
+               intel_pmu_disable_fixed(hwc, idx);
+               return;
+       }
+
+       x86_pmu_disable_counter(hwc, idx);
+}
+
+static inline void
+amd_pmu_disable_counter(struct hw_perf_counter *hwc, int idx)
+{
+       x86_pmu_disable_counter(hwc, idx);
+}
+
+static DEFINE_PER_CPU(u64, prev_left[X86_PMC_IDX_MAX]);
+
+/*
+ * Set the next IRQ period, based on the hwc->period_left value.
+ * To be called with the counter disabled in hw:
+ */
+static int
+x86_perf_counter_set_period(struct perf_counter *counter,
+                            struct hw_perf_counter *hwc, int idx)
+{
+       s64 left = atomic64_read(&hwc->period_left);
+       s64 period = hwc->sample_period;
+       int err, ret = 0;
+
+       /*
+        * If we are way outside a reasoable range then just skip forward:
+        */
+       if (unlikely(left <= -period)) {
+               left = period;
+               atomic64_set(&hwc->period_left, left);
+               hwc->last_period = period;
+               ret = 1;
+       }
+
+       if (unlikely(left <= 0)) {
+               left += period;
+               atomic64_set(&hwc->period_left, left);
+               hwc->last_period = period;
+               ret = 1;
+       }
+       /*
+        * Quirk: certain CPUs dont like it if just 1 event is left:
+        */
+       if (unlikely(left < 2))
+               left = 2;
+
+       if (left > x86_pmu.max_period)
+               left = x86_pmu.max_period;
+
+       per_cpu(prev_left[idx], smp_processor_id()) = left;
+
+       /*
+        * The hw counter starts counting from this counter offset,
+        * mark it to be able to extra future deltas:
+        */
+       atomic64_set(&hwc->prev_count, (u64)-left);
+
+       err = checking_wrmsrl(hwc->counter_base + idx,
+                            (u64)(-left) & x86_pmu.counter_mask);
+
+       return ret;
+}
+
+static inline void
+intel_pmu_enable_fixed(struct hw_perf_counter *hwc, int __idx)
+{
+       int idx = __idx - X86_PMC_IDX_FIXED;
+       u64 ctrl_val, bits, mask;
+       int err;
+
+       /*
+        * Enable IRQ generation (0x8),
+        * and enable ring-3 counting (0x2) and ring-0 counting (0x1)
+        * if requested:
+        */
+       bits = 0x8ULL;
+       if (hwc->config & ARCH_PERFMON_EVENTSEL_USR)
+               bits |= 0x2;
+       if (hwc->config & ARCH_PERFMON_EVENTSEL_OS)
+               bits |= 0x1;
+       bits <<= (idx * 4);
+       mask = 0xfULL << (idx * 4);
+
+       rdmsrl(hwc->config_base, ctrl_val);
+       ctrl_val &= ~mask;
+       ctrl_val |= bits;
+       err = checking_wrmsrl(hwc->config_base, ctrl_val);
+}
+
+static void intel_pmu_enable_counter(struct hw_perf_counter *hwc, int idx)
+{
+       if (unlikely(hwc->config_base == MSR_ARCH_PERFMON_FIXED_CTR_CTRL)) {
+               intel_pmu_enable_fixed(hwc, idx);
+               return;
+       }
+
+       x86_pmu_enable_counter(hwc, idx);
+}
+
+static void amd_pmu_enable_counter(struct hw_perf_counter *hwc, int idx)
+{
+       struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters);
+
+       if (cpuc->enabled)
+               x86_pmu_enable_counter(hwc, idx);
+       else
+               x86_pmu_disable_counter(hwc, idx);
+}
+
+static int
+fixed_mode_idx(struct perf_counter *counter, struct hw_perf_counter *hwc)
+{
+       unsigned int event;
+
+       if (!x86_pmu.num_counters_fixed)
+               return -1;
+
+       event = hwc->config & ARCH_PERFMON_EVENT_MASK;
+
+       if (unlikely(event == x86_pmu.event_map(PERF_COUNT_HW_INSTRUCTIONS)))
+               return X86_PMC_IDX_FIXED_INSTRUCTIONS;
+       if (unlikely(event == x86_pmu.event_map(PERF_COUNT_HW_CPU_CYCLES)))
+               return X86_PMC_IDX_FIXED_CPU_CYCLES;
+       if (unlikely(event == x86_pmu.event_map(PERF_COUNT_HW_BUS_CYCLES)))
+               return X86_PMC_IDX_FIXED_BUS_CYCLES;
+
+       return -1;
+}
+
+/*
+ * Find a PMC slot for the freshly enabled / scheduled in counter:
+ */
+static int x86_pmu_enable(struct perf_counter *counter)
+{
+       struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters);
+       struct hw_perf_counter *hwc = &counter->hw;
+       int idx;
+
+       idx = fixed_mode_idx(counter, hwc);
+       if (idx >= 0) {
+               /*
+                * Try to get the fixed counter, if that is already taken
+                * then try to get a generic counter:
+                */
+               if (test_and_set_bit(idx, cpuc->used_mask))
+                       goto try_generic;
+
+               hwc->config_base = MSR_ARCH_PERFMON_FIXED_CTR_CTRL;
+               /*
+                * We set it so that counter_base + idx in wrmsr/rdmsr maps to
+                * MSR_ARCH_PERFMON_FIXED_CTR0 ... CTR2:
+                */
+               hwc->counter_base =
+                       MSR_ARCH_PERFMON_FIXED_CTR0 - X86_PMC_IDX_FIXED;
+               hwc->idx = idx;
+       } else {
+               idx = hwc->idx;
+               /* Try to get the previous generic counter again */
+               if (test_and_set_bit(idx, cpuc->used_mask)) {
+try_generic:
+                       idx = find_first_zero_bit(cpuc->used_mask,
+                                                 x86_pmu.num_counters);
+                       if (idx == x86_pmu.num_counters)
+                               return -EAGAIN;
+
+                       set_bit(idx, cpuc->used_mask);
+                       hwc->idx = idx;
+               }
+               hwc->config_base  = x86_pmu.eventsel;
+               hwc->counter_base = x86_pmu.perfctr;
+       }
+
+       perf_counters_lapic_init();
+
+       x86_pmu.disable(hwc, idx);
+
+       cpuc->counters[idx] = counter;
+       set_bit(idx, cpuc->active_mask);
+
+       x86_perf_counter_set_period(counter, hwc, idx);
+       x86_pmu.enable(hwc, idx);
+
+       return 0;
+}
+
+static void x86_pmu_unthrottle(struct perf_counter *counter)
+{
+       struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters);
+       struct hw_perf_counter *hwc = &counter->hw;
+
+       if (WARN_ON_ONCE(hwc->idx >= X86_PMC_IDX_MAX ||
+                               cpuc->counters[hwc->idx] != counter))
+               return;
+
+       x86_pmu.enable(hwc, hwc->idx);
+}
+
+void perf_counter_print_debug(void)
+{
+       u64 ctrl, status, overflow, pmc_ctrl, pmc_count, prev_left, fixed;
+       struct cpu_hw_counters *cpuc;
+       unsigned long flags;
+       int cpu, idx;
+
+       if (!x86_pmu.num_counters)
+               return;
+
+       local_irq_save(flags);
+
+       cpu = smp_processor_id();
+       cpuc = &per_cpu(cpu_hw_counters, cpu);
+
+       if (x86_pmu.version >= 2) {
+               rdmsrl(MSR_CORE_PERF_GLOBAL_CTRL, ctrl);
+               rdmsrl(MSR_CORE_PERF_GLOBAL_STATUS, status);
+               rdmsrl(MSR_CORE_PERF_GLOBAL_OVF_CTRL, overflow);
+               rdmsrl(MSR_ARCH_PERFMON_FIXED_CTR_CTRL, fixed);
+
+               pr_info("\n");
+               pr_info("CPU#%d: ctrl:       %016llx\n", cpu, ctrl);
+               pr_info("CPU#%d: status:     %016llx\n", cpu, status);
+               pr_info("CPU#%d: overflow:   %016llx\n", cpu, overflow);
+               pr_info("CPU#%d: fixed:      %016llx\n", cpu, fixed);
+       }
+       pr_info("CPU#%d: used:       %016llx\n", cpu, *(u64 *)cpuc->used_mask);
+
+       for (idx = 0; idx < x86_pmu.num_counters; idx++) {
+               rdmsrl(x86_pmu.eventsel + idx, pmc_ctrl);
+               rdmsrl(x86_pmu.perfctr  + idx, pmc_count);
+
+               prev_left = per_cpu(prev_left[idx], cpu);
+
+               pr_info("CPU#%d:   gen-PMC%d ctrl:  %016llx\n",
+                       cpu, idx, pmc_ctrl);
+               pr_info("CPU#%d:   gen-PMC%d count: %016llx\n",
+                       cpu, idx, pmc_count);
+               pr_info("CPU#%d:   gen-PMC%d left:  %016llx\n",
+                       cpu, idx, prev_left);
+       }
+       for (idx = 0; idx < x86_pmu.num_counters_fixed; idx++) {
+               rdmsrl(MSR_ARCH_PERFMON_FIXED_CTR0 + idx, pmc_count);
+
+               pr_info("CPU#%d: fixed-PMC%d count: %016llx\n",
+                       cpu, idx, pmc_count);
+       }
+       local_irq_restore(flags);
+}
+
+static void x86_pmu_disable(struct perf_counter *counter)
+{
+       struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters);
+       struct hw_perf_counter *hwc = &counter->hw;
+       int idx = hwc->idx;
+
+       /*
+        * Must be done before we disable, otherwise the nmi handler
+        * could reenable again:
+        */
+       clear_bit(idx, cpuc->active_mask);
+       x86_pmu.disable(hwc, idx);
+
+       /*
+        * Make sure the cleared pointer becomes visible before we
+        * (potentially) free the counter:
+        */
+       barrier();
+
+       /*
+        * Drain the remaining delta count out of a counter
+        * that we are disabling:
+        */
+       x86_perf_counter_update(counter, hwc, idx);
+       cpuc->counters[idx] = NULL;
+       clear_bit(idx, cpuc->used_mask);
+}
+
+/*
+ * Save and restart an expired counter. Called by NMI contexts,
+ * so it has to be careful about preempting normal counter ops:
+ */
+static int intel_pmu_save_and_restart(struct perf_counter *counter)
+{
+       struct hw_perf_counter *hwc = &counter->hw;
+       int idx = hwc->idx;
+       int ret;
+
+       x86_perf_counter_update(counter, hwc, idx);
+       ret = x86_perf_counter_set_period(counter, hwc, idx);
+
+       if (counter->state == PERF_COUNTER_STATE_ACTIVE)
+               intel_pmu_enable_counter(hwc, idx);
+
+       return ret;
+}
+
+static void intel_pmu_reset(void)
+{
+       unsigned long flags;
+       int idx;
+
+       if (!x86_pmu.num_counters)
+               return;
+
+       local_irq_save(flags);
+
+       printk("clearing PMU state on CPU#%d\n", smp_processor_id());
+
+       for (idx = 0; idx < x86_pmu.num_counters; idx++) {
+               checking_wrmsrl(x86_pmu.eventsel + idx, 0ull);
+               checking_wrmsrl(x86_pmu.perfctr  + idx, 0ull);
+       }
+       for (idx = 0; idx < x86_pmu.num_counters_fixed; idx++) {
+               checking_wrmsrl(MSR_ARCH_PERFMON_FIXED_CTR0 + idx, 0ull);
+       }
+
+       local_irq_restore(flags);
+}
+
+
+/*
+ * This handler is triggered by the local APIC, so the APIC IRQ handling
+ * rules apply:
+ */
+static int intel_pmu_handle_irq(struct pt_regs *regs)
+{
+       struct perf_sample_data data;
+       struct cpu_hw_counters *cpuc;
+       int bit, cpu, loops;
+       u64 ack, status;
+
+       data.regs = regs;
+       data.addr = 0;
+
+       cpu = smp_processor_id();
+       cpuc = &per_cpu(cpu_hw_counters, cpu);
+
+       perf_disable();
+       status = intel_pmu_get_status();
+       if (!status) {
+               perf_enable();
+               return 0;
+       }
+
+       loops = 0;
+again:
+       if (++loops > 100) {
+               WARN_ONCE(1, "perfcounters: irq loop stuck!\n");
+               perf_counter_print_debug();
+               intel_pmu_reset();
+               perf_enable();
+               return 1;
+       }
+
+       inc_irq_stat(apic_perf_irqs);
+       ack = status;
+       for_each_bit(bit, (unsigned long *)&status, X86_PMC_IDX_MAX) {
+               struct perf_counter *counter = cpuc->counters[bit];
+
+               clear_bit(bit, (unsigned long *) &status);
+               if (!test_bit(bit, cpuc->active_mask))
+                       continue;
+
+               if (!intel_pmu_save_and_restart(counter))
+                       continue;
+
+               if (perf_counter_overflow(counter, 1, &data))
+                       intel_pmu_disable_counter(&counter->hw, bit);
+       }
+
+       intel_pmu_ack_status(ack);
+
+       /*
+        * Repeat if there is more work to be done:
+        */
+       status = intel_pmu_get_status();
+       if (status)
+               goto again;
+
+       perf_enable();
+
+       return 1;
+}
+
+static int amd_pmu_handle_irq(struct pt_regs *regs)
+{
+       struct perf_sample_data data;
+       struct cpu_hw_counters *cpuc;
+       struct perf_counter *counter;
+       struct hw_perf_counter *hwc;
+       int cpu, idx, handled = 0;
+       u64 val;
+
+       data.regs = regs;
+       data.addr = 0;
+
+       cpu = smp_processor_id();
+       cpuc = &per_cpu(cpu_hw_counters, cpu);
+
+       for (idx = 0; idx < x86_pmu.num_counters; idx++) {
+               if (!test_bit(idx, cpuc->active_mask))
+                       continue;
+
+               counter = cpuc->counters[idx];
+               hwc = &counter->hw;
+
+               val = x86_perf_counter_update(counter, hwc, idx);
+               if (val & (1ULL << (x86_pmu.counter_bits - 1)))
+                       continue;
+
+               /*
+                * counter overflow
+                */
+               handled         = 1;
+               data.period     = counter->hw.last_period;
+
+               if (!x86_perf_counter_set_period(counter, hwc, idx))
+                       continue;
+
+               if (perf_counter_overflow(counter, 1, &data))
+                       amd_pmu_disable_counter(hwc, idx);
+       }
+
+       if (handled)
+               inc_irq_stat(apic_perf_irqs);
+
+       return handled;
+}
+
+void smp_perf_pending_interrupt(struct pt_regs *regs)
+{
+       irq_enter();
+       ack_APIC_irq();
+       inc_irq_stat(apic_pending_irqs);
+       perf_counter_do_pending();
+       irq_exit();
+}
+
+void set_perf_counter_pending(void)
+{
+       apic->send_IPI_self(LOCAL_PENDING_VECTOR);
+}
+
+void perf_counters_lapic_init(void)
+{
+       if (!x86_pmu_initialized())
+               return;
+
+       /*
+        * Always use NMI for PMU
+        */
+       apic_write(APIC_LVTPC, APIC_DM_NMI);
+}
+
+static int __kprobes
+perf_counter_nmi_handler(struct notifier_block *self,
+                        unsigned long cmd, void *__args)
+{
+       struct die_args *args = __args;
+       struct pt_regs *regs;
+
+       if (!atomic_read(&active_counters))
+               return NOTIFY_DONE;
+
+       switch (cmd) {
+       case DIE_NMI:
+       case DIE_NMI_IPI:
+               break;
+
+       default:
+               return NOTIFY_DONE;
+       }
+
+       regs = args->regs;
+
+       apic_write(APIC_LVTPC, APIC_DM_NMI);
+       /*
+        * Can't rely on the handled return value to say it was our NMI, two
+        * counters could trigger 'simultaneously' raising two back-to-back NMIs.
+        *
+        * If the first NMI handles both, the latter will be empty and daze
+        * the CPU.
+        */
+       x86_pmu.handle_irq(regs);
+
+       return NOTIFY_STOP;
+}
+
+static __read_mostly struct notifier_block perf_counter_nmi_notifier = {
+       .notifier_call          = perf_counter_nmi_handler,
+       .next                   = NULL,
+       .priority               = 1
+};
+
+static struct x86_pmu intel_pmu = {
+       .name                   = "Intel",
+       .handle_irq             = intel_pmu_handle_irq,
+       .disable_all            = intel_pmu_disable_all,
+       .enable_all             = intel_pmu_enable_all,
+       .enable                 = intel_pmu_enable_counter,
+       .disable                = intel_pmu_disable_counter,
+       .eventsel               = MSR_ARCH_PERFMON_EVENTSEL0,
+       .perfctr                = MSR_ARCH_PERFMON_PERFCTR0,
+       .event_map              = intel_pmu_event_map,
+       .raw_event              = intel_pmu_raw_event,
+       .max_events             = ARRAY_SIZE(intel_perfmon_event_map),
+       /*
+        * Intel PMCs cannot be accessed sanely above 32 bit width,
+        * so we install an artificial 1<<31 period regardless of
+        * the generic counter period:
+        */
+       .max_period             = (1ULL << 31) - 1,
+};
+
+static struct x86_pmu amd_pmu = {
+       .name                   = "AMD",
+       .handle_irq             = amd_pmu_handle_irq,
+       .disable_all            = amd_pmu_disable_all,
+       .enable_all             = amd_pmu_enable_all,
+       .enable                 = amd_pmu_enable_counter,
+       .disable                = amd_pmu_disable_counter,
+       .eventsel               = MSR_K7_EVNTSEL0,
+       .perfctr                = MSR_K7_PERFCTR0,
+       .event_map              = amd_pmu_event_map,
+       .raw_event              = amd_pmu_raw_event,
+       .max_events             = ARRAY_SIZE(amd_perfmon_event_map),
+       .num_counters           = 4,
+       .counter_bits           = 48,
+       .counter_mask           = (1ULL << 48) - 1,
+       /* use highest bit to detect overflow */
+       .max_period             = (1ULL << 47) - 1,
+};
+
+static int intel_pmu_init(void)
+{
+       union cpuid10_edx edx;
+       union cpuid10_eax eax;
+       unsigned int unused;
+       unsigned int ebx;
+       int version;
+
+       if (!cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON))
+               return -ENODEV;
+
+       /*
+        * Check whether the Architectural PerfMon supports
+        * Branch Misses Retired Event or not.
+        */
+       cpuid(10, &eax.full, &ebx, &unused, &edx.full);
+       if (eax.split.mask_length <= ARCH_PERFMON_BRANCH_MISSES_RETIRED)
+               return -ENODEV;
+
+       version = eax.split.version_id;
+       if (version < 2)
+               return -ENODEV;
+
+       x86_pmu                         = intel_pmu;
+       x86_pmu.version                 = version;
+       x86_pmu.num_counters            = eax.split.num_counters;
+       x86_pmu.counter_bits            = eax.split.bit_width;
+       x86_pmu.counter_mask            = (1ULL << eax.split.bit_width) - 1;
+
+       /*
+        * Quirk: v2 perfmon does not report fixed-purpose counters, so
+        * assume at least 3 counters:
+        */
+       x86_pmu.num_counters_fixed      = max((int)edx.split.num_counters_fixed, 3);
+
+       rdmsrl(MSR_CORE_PERF_GLOBAL_CTRL, x86_pmu.intel_ctrl);
+
+       /*
+        * Install the hw-cache-events table:
+        */
+       switch (boot_cpu_data.x86_model) {
+       case 15: /* original 65 nm celeron/pentium/core2/xeon, "Merom"/"Conroe" */
+       case 22: /* single-core 65 nm celeron/core2solo "Merom-L"/"Conroe-L" */
+       case 23: /* current 45 nm celeron/core2/xeon "Penryn"/"Wolfdale" */
+       case 29: /* six-core 45 nm xeon "Dunnington" */
+               memcpy(hw_cache_event_ids, core2_hw_cache_event_ids,
+                      sizeof(hw_cache_event_ids));
+
+               pr_cont("Core2 events, ");
+               break;
+       default:
+       case 26:
+               memcpy(hw_cache_event_ids, nehalem_hw_cache_event_ids,
+                      sizeof(hw_cache_event_ids));
+
+               pr_cont("Nehalem/Corei7 events, ");
+               break;
+       case 28:
+               memcpy(hw_cache_event_ids, atom_hw_cache_event_ids,
+                      sizeof(hw_cache_event_ids));
+
+               pr_cont("Atom events, ");
+               break;
+       }
+       return 0;
+}
+
+static int amd_pmu_init(void)
+{
+       x86_pmu = amd_pmu;
+
+       switch (boot_cpu_data.x86) {
+       case 0x0f:
+       case 0x10:
+       case 0x11:
+               memcpy(hw_cache_event_ids, amd_0f_hw_cache_event_ids,
+                      sizeof(hw_cache_event_ids));
+
+               pr_cont("AMD Family 0f/10/11 events, ");
+               break;
+       }
+       return 0;
+}
+
+void __init init_hw_perf_counters(void)
+{
+       int err;
+
+       pr_info("Performance Counters: ");
+
+       switch (boot_cpu_data.x86_vendor) {
+       case X86_VENDOR_INTEL:
+               err = intel_pmu_init();
+               break;
+       case X86_VENDOR_AMD:
+               err = amd_pmu_init();
+               break;
+       default:
+               return;
+       }
+       if (err != 0) {
+               pr_cont("no PMU driver, software counters only.\n");
+               return;
+       }
+
+       pr_cont("%s PMU driver.\n", x86_pmu.name);
+
+       if (x86_pmu.num_counters > X86_PMC_MAX_GENERIC) {
+               x86_pmu.num_counters = X86_PMC_MAX_GENERIC;
+               WARN(1, KERN_ERR "hw perf counters %d > max(%d), clipping!",
+                    x86_pmu.num_counters, X86_PMC_MAX_GENERIC);
+       }
+       perf_counter_mask = (1 << x86_pmu.num_counters) - 1;
+       perf_max_counters = x86_pmu.num_counters;
+
+       if (x86_pmu.num_counters_fixed > X86_PMC_MAX_FIXED) {
+               x86_pmu.num_counters_fixed = X86_PMC_MAX_FIXED;
+               WARN(1, KERN_ERR "hw perf counters fixed %d > max(%d), clipping!",
+                    x86_pmu.num_counters_fixed, X86_PMC_MAX_FIXED);
+       }
+
+       perf_counter_mask |=
+               ((1LL << x86_pmu.num_counters_fixed)-1) << X86_PMC_IDX_FIXED;
+
+       perf_counters_lapic_init();
+       register_die_notifier(&perf_counter_nmi_notifier);
+
+       pr_info("... version:                 %d\n",     x86_pmu.version);
+       pr_info("... bit width:               %d\n",     x86_pmu.counter_bits);
+       pr_info("... generic counters:        %d\n",     x86_pmu.num_counters);
+       pr_info("... value mask:              %016Lx\n", x86_pmu.counter_mask);
+       pr_info("... max period:              %016Lx\n", x86_pmu.max_period);
+       pr_info("... fixed-purpose counters:  %d\n",     x86_pmu.num_counters_fixed);
+       pr_info("... counter mask:            %016Lx\n", perf_counter_mask);
+}
+
+static inline void x86_pmu_read(struct perf_counter *counter)
+{
+       x86_perf_counter_update(counter, &counter->hw, counter->hw.idx);
+}
+
+static const struct pmu pmu = {
+       .enable         = x86_pmu_enable,
+       .disable        = x86_pmu_disable,
+       .read           = x86_pmu_read,
+       .unthrottle     = x86_pmu_unthrottle,
+};
+
+const struct pmu *hw_perf_counter_init(struct perf_counter *counter)
+{
+       int err;
+
+       err = __hw_perf_counter_init(counter);
+       if (err)
+               return ERR_PTR(err);
+
+       return &pmu;
+}
+
+/*
+ * callchain support
+ */
+
+static inline
+void callchain_store(struct perf_callchain_entry *entry, unsigned long ip)
+{
+       if (entry->nr < MAX_STACK_DEPTH)
+               entry->ip[entry->nr++] = ip;
+}
+
+static DEFINE_PER_CPU(struct perf_callchain_entry, irq_entry);
+static DEFINE_PER_CPU(struct perf_callchain_entry, nmi_entry);
+
+
+static void
+backtrace_warning_symbol(void *data, char *msg, unsigned long symbol)
+{
+       /* Ignore warnings */
+}
+
+static void backtrace_warning(void *data, char *msg)
+{
+       /* Ignore warnings */
+}
+
+static int backtrace_stack(void *data, char *name)
+{
+       /* Don't bother with IRQ stacks for now */
+       return -1;
+}
+
+static void backtrace_address(void *data, unsigned long addr, int reliable)
+{
+       struct perf_callchain_entry *entry = data;
+
+       if (reliable)
+               callchain_store(entry, addr);
+}
+
+static const struct stacktrace_ops backtrace_ops = {
+       .warning                = backtrace_warning,
+       .warning_symbol         = backtrace_warning_symbol,
+       .stack                  = backtrace_stack,
+       .address                = backtrace_address,
+};
+
+static void
+perf_callchain_kernel(struct pt_regs *regs, struct perf_callchain_entry *entry)
+{
+       unsigned long bp;
+       char *stack;
+       int nr = entry->nr;
+
+       callchain_store(entry, instruction_pointer(regs));
+
+       stack = ((char *)regs + sizeof(struct pt_regs));
+#ifdef CONFIG_FRAME_POINTER
+       bp = frame_pointer(regs);
+#else
+       bp = 0;
+#endif
+
+       dump_trace(NULL, regs, (void *)stack, bp, &backtrace_ops, entry);
+
+       entry->kernel = entry->nr - nr;
+}
+
+
+struct stack_frame {
+       const void __user       *next_fp;
+       unsigned long           return_address;
+};
+
+static int copy_stack_frame(const void __user *fp, struct stack_frame *frame)
+{
+       int ret;
+
+       if (!access_ok(VERIFY_READ, fp, sizeof(*frame)))
+               return 0;
+
+       ret = 1;
+       pagefault_disable();
+       if (__copy_from_user_inatomic(frame, fp, sizeof(*frame)))
+               ret = 0;
+       pagefault_enable();
+
+       return ret;
+}
+
+static void
+perf_callchain_user(struct pt_regs *regs, struct perf_callchain_entry *entry)
+{
+       struct stack_frame frame;
+       const void __user *fp;
+       int nr = entry->nr;
+
+       regs = (struct pt_regs *)current->thread.sp0 - 1;
+       fp   = (void __user *)regs->bp;
+
+       callchain_store(entry, regs->ip);
+
+       while (entry->nr < MAX_STACK_DEPTH) {
+               frame.next_fp        = NULL;
+               frame.return_address = 0;
+
+               if (!copy_stack_frame(fp, &frame))
+                       break;
+
+               if ((unsigned long)fp < user_stack_pointer(regs))
+                       break;
+
+               callchain_store(entry, frame.return_address);
+               fp = frame.next_fp;
+       }
+
+       entry->user = entry->nr - nr;
+}
+
+static void
+perf_do_callchain(struct pt_regs *regs, struct perf_callchain_entry *entry)
+{
+       int is_user;
+
+       if (!regs)
+               return;
+
+       is_user = user_mode(regs);
+
+       if (!current || current->pid == 0)
+               return;
+
+       if (is_user && current->state != TASK_RUNNING)
+               return;
+
+       if (!is_user)
+               perf_callchain_kernel(regs, entry);
+
+       if (current->mm)
+               perf_callchain_user(regs, entry);
+}
+
+struct perf_callchain_entry *perf_callchain(struct pt_regs *regs)
+{
+       struct perf_callchain_entry *entry;
+
+       if (in_nmi())
+               entry = &__get_cpu_var(nmi_entry);
+       else
+               entry = &__get_cpu_var(irq_entry);
+
+       entry->nr = 0;
+       entry->hv = 0;
+       entry->kernel = 0;
+       entry->user = 0;
+
+       perf_do_callchain(regs, entry);
+
+       return entry;
+}
diff --git a/arch/x86/kernel/cpu/perfctr-watchdog.c b/arch/x86/kernel/cpu/perfctr-watchdog.c

index f6c70a164e320eeb58ab097ce7ee2d9079e29ce4..d6f5b9fbde3253a2a8405dfaa1070184e8288d72 100644 (file)
--- a/arch/x86/kernel/cpu/perfctr-watchdog.c
+++ b/arch/x86/kernel/cpu/perfctr-watchdog.c
@@ -19,8 +19,8 @@
  #include <linux/nmi.h>
  #include <linux/kprobes.h>
  
-#include <asm/genapic.h>
-#include <asm/intel_arch_perfmon.h>
+#include <asm/apic.h>
+#include <asm/perf_counter.h>
  
  struct nmi_watchdog_ctlblk {
         unsigned int cccr_msr;
diff --git a/arch/x86/kernel/ds.c b/arch/x86/kernel/ds.c

index 87b67e3a765ac212c2c954de3566ec487fdac8da..48bfe1386038c91fc9ee5b99473e67df5a93a432 100644 (file)
--- a/arch/x86/kernel/ds.c
+++ b/arch/x86/kernel/ds.c
@@ -19,45 +19,61 @@
   * Markus Metzger <markus.t.metzger@intel.com>, 2007-2009
   */
  
-
-#include <asm/ds.h>
-
-#include <linux/errno.h>
+#include <linux/kernel.h>
  #include <linux/string.h>
-#include <linux/slab.h>
+#include <linux/errno.h>
  #include <linux/sched.h>
+#include <linux/slab.h>
  #include <linux/mm.h>
-#include <linux/kernel.h>
+#include <linux/trace_clock.h>
+
+#include <asm/ds.h>
  
+#include "ds_selftest.h"
  
  /*
- * The configuration for a particular DS hardware implementation.
+ * The configuration for a particular DS hardware implementation:
   */
  struct ds_configuration {
-       /* the name of the configuration */
-       const char *name;
-       /* the size of one pointer-typed field in the DS structure and
-          in the BTS and PEBS buffers in bytes;
-          this covers the first 8 DS fields related to buffer management. */
-       unsigned char  sizeof_field;
-       /* the size of a BTS/PEBS record in bytes */
-       unsigned char  sizeof_rec[2];
-       /* a series of bit-masks to control various features indexed
-        * by enum ds_feature */
-       unsigned long ctl[dsf_ctl_max];
+       /* The name of the configuration: */
+       const char              *name;
+
+       /* The size of pointer-typed fields in DS, BTS, and PEBS: */
+       unsigned char           sizeof_ptr_field;
+
+       /* The size of a BTS/PEBS record in bytes: */
+       unsigned char           sizeof_rec[2];
+
+       /* The number of pebs counter reset values in the DS structure. */
+       unsigned char           nr_counter_reset;
+
+       /* Control bit-masks indexed by enum ds_feature: */
+       unsigned long           ctl[dsf_ctl_max];
  };
-static DEFINE_PER_CPU(struct ds_configuration, ds_cfg_array);
+static struct ds_configuration ds_cfg __read_mostly;
+
+
+/* Maximal size of a DS configuration: */
+#define MAX_SIZEOF_DS          0x80
  
-#define ds_cfg per_cpu(ds_cfg_array, smp_processor_id())
+/* Maximal size of a BTS record: */
+#define MAX_SIZEOF_BTS         (3 * 8)
  
-#define MAX_SIZEOF_DS (12 * 8) /* maximal size of a DS configuration */
-#define MAX_SIZEOF_BTS (3 * 8) /* maximal size of a BTS record */
-#define DS_ALIGNMENT (1 << 3)  /* BTS and PEBS buffer alignment */
+/* BTS and PEBS buffer alignment: */
+#define DS_ALIGNMENT           (1 << 3)
  
-#define BTS_CONTROL \
- (ds_cfg.ctl[dsf_bts] | ds_cfg.ctl[dsf_bts_kernel] | ds_cfg.ctl[dsf_bts_user] |\
-  ds_cfg.ctl[dsf_bts_overflow])
+/* Number of buffer pointers in DS: */
+#define NUM_DS_PTR_FIELDS      8
  
+/* Size of a pebs reset value in DS: */
+#define PEBS_RESET_FIELD_SIZE  8
+
+/* Mask of control bits in the DS MSR register: */
+#define BTS_CONTROL                              \
+       ( ds_cfg.ctl[dsf_bts]                   | \
+         ds_cfg.ctl[dsf_bts_kernel]            | \
+         ds_cfg.ctl[dsf_bts_user]              | \
+         ds_cfg.ctl[dsf_bts_overflow] )
  
  /*
   * A BTS or PEBS tracer.
@@ -66,29 +82,36 @@ static DEFINE_PER_CPU(struct ds_configuration, ds_cfg_array);
   * to identify tracers.
   */
  struct ds_tracer {
-       /* the DS context (partially) owned by this tracer */
-       struct ds_context *context;
-       /* the buffer provided on ds_request() and its size in bytes */
-       void *buffer;
-       size_t size;
+       /* The DS context (partially) owned by this tracer. */
+       struct ds_context       *context;
+       /* The buffer provided on ds_request() and its size in bytes. */
+       void                    *buffer;
+       size_t                  size;
  };
  
  struct bts_tracer {
-       /* the common DS part */
-       struct ds_tracer ds;
-       /* the trace including the DS configuration */
-       struct bts_trace trace;
-       /* buffer overflow notification function */
-       bts_ovfl_callback_t ovfl;
+       /* The common DS part: */
+       struct ds_tracer        ds;
+
+       /* The trace including the DS configuration: */
+       struct bts_trace        trace;
+
+       /* Buffer overflow notification function: */
+       bts_ovfl_callback_t     ovfl;
+
+       /* Active flags affecting trace collection. */
+       unsigned int            flags;
  };
  
  struct pebs_tracer {
-       /* the common DS part */
-       struct ds_tracer ds;
-       /* the trace including the DS configuration */
-       struct pebs_trace trace;
-       /* buffer overflow notification function */
-       pebs_ovfl_callback_t ovfl;
+       /* The common DS part: */
+       struct ds_tracer        ds;
+
+       /* The trace including the DS configuration: */
+       struct pebs_trace       trace;
+
+       /* Buffer overflow notification function: */
+       pebs_ovfl_callback_t    ovfl;
  };
  
  /*
@@ -97,6 +120,7 @@ struct pebs_tracer {
   *
   * The DS configuration consists of the following fields; different
   * architetures vary in the size of those fields.
+ *
   * - double-word aligned base linear address of the BTS buffer
   * - write pointer into the BTS buffer
   * - end linear address of the BTS buffer (one byte beyond the end of
@@ -135,21 +159,22 @@ enum ds_field {
  };
  
  enum ds_qualifier {
-       ds_bts  = 0,
+       ds_bts = 0,
         ds_pebs
  };
  
-static inline unsigned long ds_get(const unsigned char *base,
-                                  enum ds_qualifier qual, enum ds_field field)
+static inline unsigned long
+ds_get(const unsigned char *base, enum ds_qualifier qual, enum ds_field field)
  {
-       base += (ds_cfg.sizeof_field * (field + (4 * qual)));
+       base += (ds_cfg.sizeof_ptr_field * (field + (4 * qual)));
         return *(unsigned long *)base;
  }
  
-static inline void ds_set(unsigned char *base, enum ds_qualifier qual,
-                         enum ds_field field, unsigned long value)
+static inline void
+ds_set(unsigned char *base, enum ds_qualifier qual, enum ds_field field,
+       unsigned long value)
  {
-       base += (ds_cfg.sizeof_field * (field + (4 * qual)));
+       base += (ds_cfg.sizeof_ptr_field * (field + (4 * qual)));
         (*(unsigned long *)base) = value;
  }
  
@@ -159,7 +184,6 @@ static inline void ds_set(unsigned char *base, enum ds_qualifier qual,
   */
  static DEFINE_SPINLOCK(ds_lock);
  
-
  /*
   * We either support (system-wide) per-cpu or per-thread allocation.
   * We distinguish the two based on the task_struct pointer, where a
@@ -178,12 +202,28 @@ static DEFINE_SPINLOCK(ds_lock);
   */
  static atomic_t tracers = ATOMIC_INIT(0);
  
-static inline void get_tracer(struct task_struct *task)
+static inline int get_tracer(struct task_struct *task)
  {
-       if (task)
+       int error;
+
+       spin_lock_irq(&ds_lock);
+
+       if (task) {
+               error = -EPERM;
+               if (atomic_read(&tracers) < 0)
+                       goto out;
                 atomic_inc(&tracers);
-       else
+       } else {
+               error = -EPERM;
+               if (atomic_read(&tracers) > 0)
+                       goto out;
                 atomic_dec(&tracers);
+       }
+
+       error = 0;
+out:
+       spin_unlock_irq(&ds_lock);
+       return error;
  }
  
  static inline void put_tracer(struct task_struct *task)
@@ -194,14 +234,6 @@ static inline void put_tracer(struct task_struct *task)
                 atomic_inc(&tracers);
  }
  
-static inline int check_tracer(struct task_struct *task)
-{
-       return task ?
-               (atomic_read(&tracers) >= 0) :
-               (atomic_read(&tracers) <= 0);
-}
-
-
  /*
   * The DS context is either attached to a thread or to a cpu:
   * - in the former case, the thread_struct contains a pointer to the
@@ -213,61 +245,58 @@ static inline int check_tracer(struct task_struct *task)
   * deallocated when the last user puts the context.
   */
  struct ds_context {
-       /* pointer to the DS configuration; goes into MSR_IA32_DS_AREA */
-       unsigned char ds[MAX_SIZEOF_DS];
-       /* the owner of the BTS and PEBS configuration, respectively */
-       struct bts_tracer *bts_master;
-       struct pebs_tracer *pebs_master;
-       /* use count */
-       unsigned long count;
-       /* a pointer to the context location inside the thread_struct
-        * or the per_cpu context array */
-       struct ds_context **this;
-       /* a pointer to the task owning this context, or NULL, if the
-        * context is owned by a cpu */
-       struct task_struct *task;
-};
+       /* The DS configuration; goes into MSR_IA32_DS_AREA: */
+       unsigned char           ds[MAX_SIZEOF_DS];
+
+       /* The owner of the BTS and PEBS configuration, respectively: */
+       struct bts_tracer       *bts_master;
+       struct pebs_tracer      *pebs_master;
  
-static DEFINE_PER_CPU(struct ds_context *, system_context_array);
+       /* Use count: */
+       unsigned long           count;
  
-#define system_context per_cpu(system_context_array, smp_processor_id())
+       /* Pointer to the context pointer field: */
+       struct ds_context       **this;
+
+       /* The traced task; NULL for cpu tracing: */
+       struct task_struct      *task;
+
+       /* The traced cpu; only valid if task is NULL: */
+       int                     cpu;
+};
  
+static DEFINE_PER_CPU(struct ds_context *, cpu_context);
  
-static inline struct ds_context *ds_get_context(struct task_struct *task)
+
+static struct ds_context *ds_get_context(struct task_struct *task, int cpu)
  {
         struct ds_context **p_context =
-               (task ? &task->thread.ds_ctx : &system_context);
+               (task ? &task->thread.ds_ctx : &per_cpu(cpu_context, cpu));
         struct ds_context *context = NULL;
         struct ds_context *new_context = NULL;
-       unsigned long irq;
  
         /* Chances are small that we already have a context. */
         new_context = kzalloc(sizeof(*new_context), GFP_KERNEL);
         if (!new_context)
                 return NULL;
  
-       spin_lock_irqsave(&ds_lock, irq);
+       spin_lock_irq(&ds_lock);
  
         context = *p_context;
-       if (!context) {
+       if (likely(!context)) {
                 context = new_context;
  
                 context->this = p_context;
                 context->task = task;
+               context->cpu = cpu;
                 context->count = 0;
  
-               if (task)
-                       set_tsk_thread_flag(task, TIF_DS_AREA_MSR);
-
-               if (!task || (task == current))
-                       wrmsrl(MSR_IA32_DS_AREA, (unsigned long)context->ds);
-
                 *p_context = context;
         }
  
         context->count++;
  
-       spin_unlock_irqrestore(&ds_lock, irq);
+       spin_unlock_irq(&ds_lock);
  
         if (context != new_context)
                 kfree(new_context);
@@ -275,8 +304,9 @@ static inline struct ds_context *ds_get_context(struct task_struct *task)
         return context;
  }
  
-static inline void ds_put_context(struct ds_context *context)
+static void ds_put_context(struct ds_context *context)
  {
+       struct task_struct *task;
         unsigned long irq;
  
         if (!context)
@@ -291,17 +321,55 @@ static inline void ds_put_context(struct ds_context *context)
  
         *(context->this) = NULL;
  
-       if (context->task)
-               clear_tsk_thread_flag(context->task, TIF_DS_AREA_MSR);
+       task = context->task;
+
+       if (task)
+               clear_tsk_thread_flag(task, TIF_DS_AREA_MSR);
  
-       if (!context->task || (context->task == current))
-               wrmsrl(MSR_IA32_DS_AREA, 0);
+       /*
+        * We leave the (now dangling) pointer to the DS configuration in
+        * the DS_AREA msr. This is as good or as bad as replacing it with
+        * NULL - the hardware would crash if we enabled tracing.
+        *
+        * This saves us some problems with having to write an msr on a
+        * different cpu while preventing others from doing the same for the
+        * next context for that same cpu.
+        */
  
         spin_unlock_irqrestore(&ds_lock, irq);
  
+       /* The context might still be in use for context switching. */
+       if (task && (task != current))
+               wait_task_context_switch(task);
+
         kfree(context);
  }
  
+static void ds_install_ds_area(struct ds_context *context)
+{
+       unsigned long ds;
+
+       ds = (unsigned long)context->ds;
+
+       /*
+        * There is a race between the bts master and the pebs master.
+        *
+        * The thread/cpu access is synchronized via get/put_cpu() for
+        * task tracing and via wrmsr_on_cpu for cpu tracing.
+        *
+        * If bts and pebs are collected for the same task or same cpu,
+        * the same confiuration is written twice.
+        */
+       if (context->task) {
+               get_cpu();
+               if (context->task == current)
+                       wrmsrl(MSR_IA32_DS_AREA, ds);
+               set_tsk_thread_flag(context->task, TIF_DS_AREA_MSR);
+               put_cpu();
+       } else
+               wrmsr_on_cpu(context->cpu, MSR_IA32_DS_AREA,
+                            (u32)((u64)ds), (u32)((u64)ds >> 32));
+}
  
  /*
   * Call the tracer's callback on a buffer overflow.
@@ -332,9 +400,9 @@ static void ds_overflow(struct ds_context *context, enum ds_qualifier qual)
   * The remainder of any partially written record is zeroed out.
   *
   * context: the DS context
- * qual: the buffer type
- * record: the data to write
- * size: the size of the data
+ * qual:    the buffer type
+ * record:  the data to write
+ * size:    the size of the data
   */
  static int ds_write(struct ds_context *context, enum ds_qualifier qual,
                     const void *record, size_t size)
@@ -349,14 +417,14 @@ static int ds_write(struct ds_context *context, enum ds_qualifier qual,
                 unsigned long write_size, adj_write_size;
  
                 /*
-                * write as much as possible without producing an
+                * Write as much as possible without producing an
                  * overflow interrupt.
                  *
-                * interrupt_threshold must either be
+                * Interrupt_threshold must either be
                  * - bigger than absolute_maximum or
                  * - point to a record between buffer_base and absolute_maximum
                  *
-                * index points to a valid record.
+                * Index points to a valid record.
                  */
                 base   = ds_get(context->ds, qual, ds_buffer_base);
                 index  = ds_get(context->ds, qual, ds_index);
@@ -365,8 +433,10 @@ static int ds_write(struct ds_context *context, enum ds_qualifier qual,
  
                 write_end = min(end, int_th);
  
-               /* if we are already beyond the interrupt threshold,
-                * we fill the entire buffer */
+               /*
+                * If we are already beyond the interrupt threshold,
+                * we fill the entire buffer.
+                */
                 if (write_end <= index)
                         write_end = end;
  
@@ -383,7 +453,7 @@ static int ds_write(struct ds_context *context, enum ds_qualifier qual,
                 adj_write_size = write_size / ds_cfg.sizeof_rec[qual];
                 adj_write_size *= ds_cfg.sizeof_rec[qual];
  
-               /* zero out trailing bytes */
+               /* Zero out trailing bytes. */
                 memset((char *)index + write_size, 0,
                        adj_write_size - write_size);
                 index += adj_write_size;
@@ -410,7 +480,7 @@ static int ds_write(struct ds_context *context, enum ds_qualifier qual,
   * Later architectures use 64bit pointers throughout, whereas earlier
   * architectures use 32bit pointers in 32bit mode.
   *
- * We compute the base address for the first 8 fields based on:
+ * We compute the base address for the fields based on:
   * - the field size stored in the DS configuration
   * - the relative field position
   *
@@ -431,23 +501,23 @@ enum bts_field {
         bts_to,
         bts_flags,
  
-       bts_qual = bts_from,
-       bts_jiffies = bts_to,
-       bts_pid = bts_flags,
+       bts_qual                = bts_from,
+       bts_clock               = bts_to,
+       bts_pid                 = bts_flags,
  
-       bts_qual_mask = (bts_qual_max - 1),
-       bts_escape = ((unsigned long)-1 & ~bts_qual_mask)
+       bts_qual_mask           = (bts_qual_max - 1),
+       bts_escape              = ((unsigned long)-1 & ~bts_qual_mask)
  };
  
  static inline unsigned long bts_get(const char *base, enum bts_field field)
  {
-       base += (ds_cfg.sizeof_field * field);
+       base += (ds_cfg.sizeof_ptr_field * field);
         return *(unsigned long *)base;
  }
  
  static inline void bts_set(char *base, enum bts_field field, unsigned long val)
  {
-       base += (ds_cfg.sizeof_field * field);;
+       base += (ds_cfg.sizeof_ptr_field * field);;
         (*(unsigned long *)base) = val;
  }
  
@@ -463,8 +533,8 @@ static inline void bts_set(char *base, enum bts_field field, unsigned long val)
   *
   * return: bytes read/written on success; -Eerrno, otherwise
   */
-static int bts_read(struct bts_tracer *tracer, const void *at,
-                   struct bts_struct *out)
+static int
+bts_read(struct bts_tracer *tracer, const void *at, struct bts_struct *out)
  {
         if (!tracer)
                 return -EINVAL;
@@ -478,8 +548,8 @@ static int bts_read(struct bts_tracer *tracer, const void *at,
         memset(out, 0, sizeof(*out));
         if ((bts_get(at, bts_qual) & ~bts_qual_mask) == bts_escape) {
                 out->qualifier = (bts_get(at, bts_qual) & bts_qual_mask);
-               out->variant.timestamp.jiffies = bts_get(at, bts_jiffies);
-               out->variant.timestamp.pid = bts_get(at, bts_pid);
+               out->variant.event.clock = bts_get(at, bts_clock);
+               out->variant.event.pid = bts_get(at, bts_pid);
         } else {
                 out->qualifier = bts_branch;
                 out->variant.lbr.from = bts_get(at, bts_from);
@@ -516,8 +586,8 @@ static int bts_write(struct bts_tracer *tracer, const struct bts_struct *in)
         case bts_task_arrives:
         case bts_task_departs:
                 bts_set(raw, bts_qual, (bts_escape | in->qualifier));
-               bts_set(raw, bts_jiffies, in->variant.timestamp.jiffies);
-               bts_set(raw, bts_pid, in->variant.timestamp.pid);
+               bts_set(raw, bts_clock, in->variant.event.clock);
+               bts_set(raw, bts_pid, in->variant.event.pid);
                 break;
         default:
                 return -EINVAL;
@@ -555,7 +625,8 @@ static void ds_init_ds_trace(struct ds_trace *trace, enum ds_qualifier qual,
                              unsigned int flags) {
         unsigned long buffer, adj;
  
-       /* adjust the buffer address and size to meet alignment
+       /*
+        * Adjust the buffer address and size to meet alignment
          * constraints:
          * - buffer is double-word aligned
          * - size is multiple of record size
@@ -577,9 +648,11 @@ static void ds_init_ds_trace(struct ds_trace *trace, enum ds_qualifier qual,
         trace->begin = (void *)buffer;
         trace->top = trace->begin;
         trace->end = (void *)(buffer + size);
-       /* The value for 'no threshold' is -1, which will set the
+       /*
+        * The value for 'no threshold' is -1, which will set the
          * threshold outside of the buffer, just like we want it.
          */
+       ith *= ds_cfg.sizeof_rec[qual];
         trace->ith = (void *)(buffer + size - ith);
  
         trace->flags = flags;
@@ -588,18 +661,27 @@ static void ds_init_ds_trace(struct ds_trace *trace, enum ds_qualifier qual,
  
  static int ds_request(struct ds_tracer *tracer, struct ds_trace *trace,
                       enum ds_qualifier qual, struct task_struct *task,
-                     void *base, size_t size, size_t th, unsigned int flags)
+                     int cpu, void *base, size_t size, size_t th)
  {
         struct ds_context *context;
         int error;
+       size_t req_size;
+
+       error = -EOPNOTSUPP;
+       if (!ds_cfg.sizeof_rec[qual])
+               goto out;
  
         error = -EINVAL;
         if (!base)
                 goto out;
  
-       /* we require some space to do alignment adjustments below */
+       req_size = ds_cfg.sizeof_rec[qual];
+       /* We might need space for alignment adjustments. */
+       if (!IS_ALIGNED((unsigned long)base, DS_ALIGNMENT))
+               req_size += DS_ALIGNMENT;
+
         error = -EINVAL;
-       if (size < (DS_ALIGNMENT + ds_cfg.sizeof_rec[qual]))
+       if (size < req_size)
                 goto out;
  
         if (th != (size_t)-1) {
@@ -614,182 +696,318 @@ static int ds_request(struct ds_tracer *tracer, struct ds_trace *trace,
         tracer->size = size;
  
         error = -ENOMEM;
-       context = ds_get_context(task);
+       context = ds_get_context(task, cpu);
         if (!context)
                 goto out;
         tracer->context = context;
  
-       ds_init_ds_trace(trace, qual, base, size, th, flags);
+       /*
+        * Defer any tracer-specific initialization work for the context until
+        * context ownership has been clarified.
+        */
  
         error = 0;
   out:
         return error;
  }
  
-struct bts_tracer *ds_request_bts(struct task_struct *task,
-                                 void *base, size_t size,
-                                 bts_ovfl_callback_t ovfl, size_t th,
-                                 unsigned int flags)
+static struct bts_tracer *ds_request_bts(struct task_struct *task, int cpu,
+                                        void *base, size_t size,
+                                        bts_ovfl_callback_t ovfl, size_t th,
+                                        unsigned int flags)
  {
         struct bts_tracer *tracer;
-       unsigned long irq;
         int error;
  
+       /* Buffer overflow notification is not yet implemented. */
         error = -EOPNOTSUPP;
-       if (!ds_cfg.ctl[dsf_bts])
+       if (ovfl)
                 goto out;
  
-       /* buffer overflow notification is not yet implemented */
-       error = -EOPNOTSUPP;
-       if (ovfl)
+       error = get_tracer(task);
+       if (error < 0)
                 goto out;
  
         error = -ENOMEM;
         tracer = kzalloc(sizeof(*tracer), GFP_KERNEL);
         if (!tracer)
-               goto out;
+               goto out_put_tracer;
         tracer->ovfl = ovfl;
  
+       /* Do some more error checking and acquire a tracing context. */
         error = ds_request(&tracer->ds, &tracer->trace.ds,
-                          ds_bts, task, base, size, th, flags);
+                          ds_bts, task, cpu, base, size, th);
         if (error < 0)
                 goto out_tracer;
  
-
-       spin_lock_irqsave(&ds_lock, irq);
-
-       error = -EPERM;
-       if (!check_tracer(task))
-               goto out_unlock;
-       get_tracer(task);
+       /* Claim the bts part of the tracing context we acquired above. */
+       spin_lock_irq(&ds_lock);
  
         error = -EPERM;
         if (tracer->ds.context->bts_master)
-               goto out_put_tracer;
+               goto out_unlock;
         tracer->ds.context->bts_master = tracer;
  
-       spin_unlock_irqrestore(&ds_lock, irq);
+       spin_unlock_irq(&ds_lock);
  
+       /*
+        * Now that we own the bts part of the context, let's complete the
+        * initialization for that part.
+        */
+       ds_init_ds_trace(&tracer->trace.ds, ds_bts, base, size, th, flags);
+       ds_write_config(tracer->ds.context, &tracer->trace.ds, ds_bts);
+       ds_install_ds_area(tracer->ds.context);
  
         tracer->trace.read  = bts_read;
         tracer->trace.write = bts_write;
  
-       ds_write_config(tracer->ds.context, &tracer->trace.ds, ds_bts);
+       /* Start tracing. */
         ds_resume_bts(tracer);
  
         return tracer;
  
- out_put_tracer:
-       put_tracer(task);
   out_unlock:
-       spin_unlock_irqrestore(&ds_lock, irq);
+       spin_unlock_irq(&ds_lock);
         ds_put_context(tracer->ds.context);
   out_tracer:
         kfree(tracer);
+ out_put_tracer:
+       put_tracer(task);
   out:
         return ERR_PTR(error);
  }
  
-struct pebs_tracer *ds_request_pebs(struct task_struct *task,
-                                   void *base, size_t size,
-                                   pebs_ovfl_callback_t ovfl, size_t th,
-                                   unsigned int flags)
+struct bts_tracer *ds_request_bts_task(struct task_struct *task,
+                                      void *base, size_t size,
+                                      bts_ovfl_callback_t ovfl,
+                                      size_t th, unsigned int flags)
+{
+       return ds_request_bts(task, 0, base, size, ovfl, th, flags);
+}
+
+struct bts_tracer *ds_request_bts_cpu(int cpu, void *base, size_t size,
+                                     bts_ovfl_callback_t ovfl,
+                                     size_t th, unsigned int flags)
+{
+       return ds_request_bts(NULL, cpu, base, size, ovfl, th, flags);
+}
+
+static struct pebs_tracer *ds_request_pebs(struct task_struct *task, int cpu,
+                                          void *base, size_t size,
+                                          pebs_ovfl_callback_t ovfl, size_t th,
+                                          unsigned int flags)
  {
         struct pebs_tracer *tracer;
-       unsigned long irq;
         int error;
  
-       /* buffer overflow notification is not yet implemented */
+       /* Buffer overflow notification is not yet implemented. */
         error = -EOPNOTSUPP;
         if (ovfl)
                 goto out;
  
+       error = get_tracer(task);
+       if (error < 0)
+               goto out;
+
         error = -ENOMEM;
         tracer = kzalloc(sizeof(*tracer), GFP_KERNEL);
         if (!tracer)
-               goto out;
+               goto out_put_tracer;
         tracer->ovfl = ovfl;
  
+       /* Do some more error checking and acquire a tracing context. */
         error = ds_request(&tracer->ds, &tracer->trace.ds,
-                          ds_pebs, task, base, size, th, flags);
+                          ds_pebs, task, cpu, base, size, th);
         if (error < 0)
                 goto out_tracer;
  
-       spin_lock_irqsave(&ds_lock, irq);
-
-       error = -EPERM;
-       if (!check_tracer(task))
-               goto out_unlock;
-       get_tracer(task);
+       /* Claim the pebs part of the tracing context we acquired above. */
+       spin_lock_irq(&ds_lock);
  
         error = -EPERM;
         if (tracer->ds.context->pebs_master)
-               goto out_put_tracer;
+               goto out_unlock;
         tracer->ds.context->pebs_master = tracer;
  
-       spin_unlock_irqrestore(&ds_lock, irq);
+       spin_unlock_irq(&ds_lock);
  
+       /*
+        * Now that we own the pebs part of the context, let's complete the
+        * initialization for that part.
+        */
+       ds_init_ds_trace(&tracer->trace.ds, ds_pebs, base, size, th, flags);
         ds_write_config(tracer->ds.context, &tracer->trace.ds, ds_pebs);
+       ds_install_ds_area(tracer->ds.context);
+
+       /* Start tracing. */
         ds_resume_pebs(tracer);
  
         return tracer;
  
- out_put_tracer:
-       put_tracer(task);
   out_unlock:
-       spin_unlock_irqrestore(&ds_lock, irq);
+       spin_unlock_irq(&ds_lock);
         ds_put_context(tracer->ds.context);
   out_tracer:
         kfree(tracer);
+ out_put_tracer:
+       put_tracer(task);
   out:
         return ERR_PTR(error);
  }
  
-void ds_release_bts(struct bts_tracer *tracer)
+struct pebs_tracer *ds_request_pebs_task(struct task_struct *task,
+                                        void *base, size_t size,
+                                        pebs_ovfl_callback_t ovfl,
+                                        size_t th, unsigned int flags)
  {
-       if (!tracer)
-               return;
+       return ds_request_pebs(task, 0, base, size, ovfl, th, flags);
+}
  
-       ds_suspend_bts(tracer);
+struct pebs_tracer *ds_request_pebs_cpu(int cpu, void *base, size_t size,
+                                       pebs_ovfl_callback_t ovfl,
+                                       size_t th, unsigned int flags)
+{
+       return ds_request_pebs(NULL, cpu, base, size, ovfl, th, flags);
+}
+
+static void ds_free_bts(struct bts_tracer *tracer)
+{
+       struct task_struct *task;
+
+       task = tracer->ds.context->task;
  
         WARN_ON_ONCE(tracer->ds.context->bts_master != tracer);
         tracer->ds.context->bts_master = NULL;
  
-       put_tracer(tracer->ds.context->task);
+       /* Make sure tracing stopped and the tracer is not in use. */
+       if (task && (task != current))
+               wait_task_context_switch(task);
+
         ds_put_context(tracer->ds.context);
+       put_tracer(task);
  
         kfree(tracer);
  }
  
+void ds_release_bts(struct bts_tracer *tracer)
+{
+       might_sleep();
+
+       if (!tracer)
+               return;
+
+       ds_suspend_bts(tracer);
+       ds_free_bts(tracer);
+}
+
+int ds_release_bts_noirq(struct bts_tracer *tracer)
+{
+       struct task_struct *task;
+       unsigned long irq;
+       int error;
+
+       if (!tracer)
+               return 0;
+
+       task = tracer->ds.context->task;
+
+       local_irq_save(irq);
+
+       error = -EPERM;
+       if (!task &&
+           (tracer->ds.context->cpu != smp_processor_id()))
+               goto out;
+
+       error = -EPERM;
+       if (task && (task != current))
+               goto out;
+
+       ds_suspend_bts_noirq(tracer);
+       ds_free_bts(tracer);
+
+       error = 0;
+ out:
+       local_irq_restore(irq);
+       return error;
+}
+
+static void update_task_debugctlmsr(struct task_struct *task,
+                                   unsigned long debugctlmsr)
+{
+       task->thread.debugctlmsr = debugctlmsr;
+
+       get_cpu();
+       if (task == current)
+               update_debugctlmsr(debugctlmsr);
+       put_cpu();
+}
+
  void ds_suspend_bts(struct bts_tracer *tracer)
  {
         struct task_struct *task;
+       unsigned long debugctlmsr;
+       int cpu;
  
         if (!tracer)
                 return;
  
+       tracer->flags = 0;
+
         task = tracer->ds.context->task;
+       cpu  = tracer->ds.context->cpu;
  
-       if (!task || (task == current))
-               update_debugctlmsr(get_debugctlmsr() & ~BTS_CONTROL);
+       WARN_ON(!task && irqs_disabled());
  
-       if (task) {
-               task->thread.debugctlmsr &= ~BTS_CONTROL;
+       debugctlmsr = (task ?
+                      task->thread.debugctlmsr :
+                      get_debugctlmsr_on_cpu(cpu));
+       debugctlmsr &= ~BTS_CONTROL;
  
-               if (!task->thread.debugctlmsr)
-                       clear_tsk_thread_flag(task, TIF_DEBUGCTLMSR);
-       }
+       if (task)
+               update_task_debugctlmsr(task, debugctlmsr);
+       else
+               update_debugctlmsr_on_cpu(cpu, debugctlmsr);
  }
  
-void ds_resume_bts(struct bts_tracer *tracer)
+int ds_suspend_bts_noirq(struct bts_tracer *tracer)
  {
         struct task_struct *task;
-       unsigned long control;
+       unsigned long debugctlmsr, irq;
+       int cpu, error = 0;
  
         if (!tracer)
-               return;
+               return 0;
+
+       tracer->flags = 0;
  
         task = tracer->ds.context->task;
+       cpu  = tracer->ds.context->cpu;
+
+       local_irq_save(irq);
+
+       error = -EPERM;
+       if (!task && (cpu != smp_processor_id()))
+               goto out;
+
+       debugctlmsr = (task ?
+                      task->thread.debugctlmsr :
+                      get_debugctlmsr());
+       debugctlmsr &= ~BTS_CONTROL;
+
+       if (task)
+               update_task_debugctlmsr(task, debugctlmsr);
+       else
+               update_debugctlmsr(debugctlmsr);
+
+       error = 0;
+ out:
+       local_irq_restore(irq);
+       return error;
+}
+
+static unsigned long ds_bts_control(struct bts_tracer *tracer)
+{
+       unsigned long control;
  
         control = ds_cfg.ctl[dsf_bts];
         if (!(tracer->trace.ds.flags & BTS_KERNEL))
@@ -797,41 +1015,149 @@ void ds_resume_bts(struct bts_tracer *tracer)
         if (!(tracer->trace.ds.flags & BTS_USER))
                 control |= ds_cfg.ctl[dsf_bts_user];
  
-       if (task) {
-               task->thread.debugctlmsr |= control;
-               set_tsk_thread_flag(task, TIF_DEBUGCTLMSR);
-       }
-
-       if (!task || (task == current))
-               update_debugctlmsr(get_debugctlmsr() | control);
+       return control;
  }
  
-void ds_release_pebs(struct pebs_tracer *tracer)
+void ds_resume_bts(struct bts_tracer *tracer)
  {
+       struct task_struct *task;
+       unsigned long debugctlmsr;
+       int cpu;
+
         if (!tracer)
                 return;
  
-       ds_suspend_pebs(tracer);
+       tracer->flags = tracer->trace.ds.flags;
+
+       task = tracer->ds.context->task;
+       cpu  = tracer->ds.context->cpu;
+
+       WARN_ON(!task && irqs_disabled());
+
+       debugctlmsr = (task ?
+                      task->thread.debugctlmsr :
+                      get_debugctlmsr_on_cpu(cpu));
+       debugctlmsr |= ds_bts_control(tracer);
+
+       if (task)
+               update_task_debugctlmsr(task, debugctlmsr);
+       else
+               update_debugctlmsr_on_cpu(cpu, debugctlmsr);
+}
+
+int ds_resume_bts_noirq(struct bts_tracer *tracer)
+{
+       struct task_struct *task;
+       unsigned long debugctlmsr, irq;
+       int cpu, error = 0;
+
+       if (!tracer)
+               return 0;
+
+       tracer->flags = tracer->trace.ds.flags;
+
+       task = tracer->ds.context->task;
+       cpu  = tracer->ds.context->cpu;
+
+       local_irq_save(irq);
+
+       error = -EPERM;
+       if (!task && (cpu != smp_processor_id()))
+               goto out;
+
+       debugctlmsr = (task ?
+                      task->thread.debugctlmsr :
+                      get_debugctlmsr());
+       debugctlmsr |= ds_bts_control(tracer);
+
+       if (task)
+               update_task_debugctlmsr(task, debugctlmsr);
+       else
+               update_debugctlmsr(debugctlmsr);
+
+       error = 0;
+ out:
+       local_irq_restore(irq);
+       return error;
+}
+
+static void ds_free_pebs(struct pebs_tracer *tracer)
+{
+       struct task_struct *task;
+
+       task = tracer->ds.context->task;
  
         WARN_ON_ONCE(tracer->ds.context->pebs_master != tracer);
         tracer->ds.context->pebs_master = NULL;
  
-       put_tracer(tracer->ds.context->task);
         ds_put_context(tracer->ds.context);
+       put_tracer(task);
  
         kfree(tracer);
  }
  
+void ds_release_pebs(struct pebs_tracer *tracer)
+{
+       might_sleep();
+
+       if (!tracer)
+               return;
+
+       ds_suspend_pebs(tracer);
+       ds_free_pebs(tracer);
+}
+
+int ds_release_pebs_noirq(struct pebs_tracer *tracer)
+{
+       struct task_struct *task;
+       unsigned long irq;
+       int error;
+
+       if (!tracer)
+               return 0;
+
+       task = tracer->ds.context->task;
+
+       local_irq_save(irq);
+
+       error = -EPERM;
+       if (!task &&
+           (tracer->ds.context->cpu != smp_processor_id()))
+               goto out;
+
+       error = -EPERM;
+       if (task && (task != current))
+               goto out;
+
+       ds_suspend_pebs_noirq(tracer);
+       ds_free_pebs(tracer);
+
+       error = 0;
+ out:
+       local_irq_restore(irq);
+       return error;
+}
+
  void ds_suspend_pebs(struct pebs_tracer *tracer)
  {
  
  }
  
+int ds_suspend_pebs_noirq(struct pebs_tracer *tracer)
+{
+       return 0;
+}
+
  void ds_resume_pebs(struct pebs_tracer *tracer)
  {
  
  }
  
+int ds_resume_pebs_noirq(struct pebs_tracer *tracer)
+{
+       return 0;
+}
+
  const struct bts_trace *ds_read_bts(struct bts_tracer *tracer)
  {
         if (!tracer)
@@ -847,8 +1173,12 @@ const struct pebs_trace *ds_read_pebs(struct pebs_tracer *tracer)
                 return NULL;
  
         ds_read_config(tracer->ds.context, &tracer->trace.ds, ds_pebs);
-       tracer->trace.reset_value =
-               *(u64 *)(tracer->ds.context->ds + (ds_cfg.sizeof_field * 8));
+
+       tracer->trace.counters = ds_cfg.nr_counter_reset;
+       memcpy(tracer->trace.counter_reset,
+              tracer->ds.context->ds +
+              (NUM_DS_PTR_FIELDS * ds_cfg.sizeof_ptr_field),
+              ds_cfg.nr_counter_reset * PEBS_RESET_FIELD_SIZE);
  
         return &tracer->trace;
  }
@@ -873,18 +1203,24 @@ int ds_reset_pebs(struct pebs_tracer *tracer)
  
         tracer->trace.ds.top = tracer->trace.ds.begin;
  
-       ds_set(tracer->ds.context->ds, ds_bts, ds_index,
+       ds_set(tracer->ds.context->ds, ds_pebs, ds_index,
                (unsigned long)tracer->trace.ds.top);
  
         return 0;
  }
  
-int ds_set_pebs_reset(struct pebs_tracer *tracer, u64 value)
+int ds_set_pebs_reset(struct pebs_tracer *tracer,
+                     unsigned int counter, u64 value)
  {
         if (!tracer)
                 return -EINVAL;
  
-       *(u64 *)(tracer->ds.context->ds + (ds_cfg.sizeof_field * 8)) = value;
+       if (ds_cfg.nr_counter_reset < counter)
+               return -EINVAL;
+
+       *(u64 *)(tracer->ds.context->ds +
+                (NUM_DS_PTR_FIELDS * ds_cfg.sizeof_ptr_field) +
+                (counter * PEBS_RESET_FIELD_SIZE)) = value;
  
         return 0;
  }
@@ -894,73 +1230,117 @@ static const struct ds_configuration ds_cfg_netburst = {
         .ctl[dsf_bts]           = (1 << 2) | (1 << 3),
         .ctl[dsf_bts_kernel]    = (1 << 5),
         .ctl[dsf_bts_user]      = (1 << 6),
-
-       .sizeof_field           = sizeof(long),
-       .sizeof_rec[ds_bts]     = sizeof(long) * 3,
-#ifdef __i386__
-       .sizeof_rec[ds_pebs]    = sizeof(long) * 10,
-#else
-       .sizeof_rec[ds_pebs]    = sizeof(long) * 18,
-#endif
+       .nr_counter_reset       = 1,
  };
  static const struct ds_configuration ds_cfg_pentium_m = {
         .name = "Pentium M",
         .ctl[dsf_bts]           = (1 << 6) | (1 << 7),
-
-       .sizeof_field           = sizeof(long),
-       .sizeof_rec[ds_bts]     = sizeof(long) * 3,
-#ifdef __i386__
-       .sizeof_rec[ds_pebs]    = sizeof(long) * 10,
-#else
-       .sizeof_rec[ds_pebs]    = sizeof(long) * 18,
-#endif
+       .nr_counter_reset       = 1,
  };
  static const struct ds_configuration ds_cfg_core2_atom = {
         .name = "Core 2/Atom",
         .ctl[dsf_bts]           = (1 << 6) | (1 << 7),
         .ctl[dsf_bts_kernel]    = (1 << 9),
         .ctl[dsf_bts_user]      = (1 << 10),
-
-       .sizeof_field           = 8,
-       .sizeof_rec[ds_bts]     = 8 * 3,
-       .sizeof_rec[ds_pebs]    = 8 * 18,
+       .nr_counter_reset       = 1,
+};
+static const struct ds_configuration ds_cfg_core_i7 = {
+       .name = "Core i7",
+       .ctl[dsf_bts]           = (1 << 6) | (1 << 7),
+       .ctl[dsf_bts_kernel]    = (1 << 9),
+       .ctl[dsf_bts_user]      = (1 << 10),
+       .nr_counter_reset       = 4,
  };
  
  static void
-ds_configure(const struct ds_configuration *cfg)
+ds_configure(const struct ds_configuration *cfg,
+            struct cpuinfo_x86 *cpu)
  {
+       unsigned long nr_pebs_fields = 0;
+
+       printk(KERN_INFO "[ds] using %s configuration\n", cfg->name);
+
+#ifdef __i386__
+       nr_pebs_fields = 10;
+#else
+       nr_pebs_fields = 18;
+#endif
+
+       /*
+        * Starting with version 2, architectural performance
+        * monitoring supports a format specifier.
+        */
+       if ((cpuid_eax(0xa) & 0xff) > 1) {
+               unsigned long perf_capabilities, format;
+
+               rdmsrl(MSR_IA32_PERF_CAPABILITIES, perf_capabilities);
+
+               format = (perf_capabilities >> 8) & 0xf;
+
+               switch (format) {
+               case 0:
+                       nr_pebs_fields = 18;
+                       break;
+               case 1:
+                       nr_pebs_fields = 22;
+                       break;
+               default:
+                       printk(KERN_INFO
+                              "[ds] unknown PEBS format: %lu\n", format);
+                       nr_pebs_fields = 0;
+                       break;
+               }
+       }
+
         memset(&ds_cfg, 0, sizeof(ds_cfg));
         ds_cfg = *cfg;
  
-       printk(KERN_INFO "[ds] using %s configuration\n", ds_cfg.name);
+       ds_cfg.sizeof_ptr_field =
+               (cpu_has(cpu, X86_FEATURE_DTES64) ? 8 : 4);
+
+       ds_cfg.sizeof_rec[ds_bts]  = ds_cfg.sizeof_ptr_field * 3;
+       ds_cfg.sizeof_rec[ds_pebs] = ds_cfg.sizeof_ptr_field * nr_pebs_fields;
  
-       if (!cpu_has_bts) {
-               ds_cfg.ctl[dsf_bts] = 0;
+       if (!cpu_has(cpu, X86_FEATURE_BTS)) {
+               ds_cfg.sizeof_rec[ds_bts] = 0;
                 printk(KERN_INFO "[ds] bts not available\n");
         }
-       if (!cpu_has_pebs)
+       if (!cpu_has(cpu, X86_FEATURE_PEBS)) {
+               ds_cfg.sizeof_rec[ds_pebs] = 0;
                 printk(KERN_INFO "[ds] pebs not available\n");
+       }
+
+       printk(KERN_INFO "[ds] sizes: address: %u bit, ",
+              8 * ds_cfg.sizeof_ptr_field);
+       printk("bts/pebs record: %u/%u bytes\n",
+              ds_cfg.sizeof_rec[ds_bts], ds_cfg.sizeof_rec[ds_pebs]);
  
-       WARN_ON_ONCE(MAX_SIZEOF_DS < (12 * ds_cfg.sizeof_field));
+       WARN_ON_ONCE(MAX_PEBS_COUNTERS < ds_cfg.nr_counter_reset);
  }
  
  void __cpuinit ds_init_intel(struct cpuinfo_x86 *c)
  {
+       /* Only configure the first cpu. Others are identical. */
+       if (ds_cfg.name)
+               return;
+
         switch (c->x86) {
         case 0x6:
                 switch (c->x86_model) {
                 case 0x9:
                 case 0xd: /* Pentium M */
-                       ds_configure(&ds_cfg_pentium_m);
+                       ds_configure(&ds_cfg_pentium_m, c);
                         break;
                 case 0xf:
                 case 0x17: /* Core2 */
                 case 0x1c: /* Atom */
-                       ds_configure(&ds_cfg_core2_atom);
+                       ds_configure(&ds_cfg_core2_atom, c);
+                       break;
+               case 0x1a: /* Core i7 */
+                       ds_configure(&ds_cfg_core_i7, c);
                         break;
-               case 0x1a: /* i7 */
                 default:
-                       /* sorry, don't know about them */
+                       /* Sorry, don't know about them. */
                         break;
                 }
                 break;
@@ -969,64 +1349,89 @@ void __cpuinit ds_init_intel(struct cpuinfo_x86 *c)
                 case 0x0:
                 case 0x1:
                 case 0x2: /* Netburst */
-                       ds_configure(&ds_cfg_netburst);
+                       ds_configure(&ds_cfg_netburst, c);
                         break;
                 default:
-                       /* sorry, don't know about them */
+                       /* Sorry, don't know about them. */
                         break;
                 }
                 break;
         default:
-               /* sorry, don't know about them */
+               /* Sorry, don't know about them. */
                 break;
         }
  }
  
+static inline void ds_take_timestamp(struct ds_context *context,
+                                    enum bts_qualifier qualifier,
+                                    struct task_struct *task)
+{
+       struct bts_tracer *tracer = context->bts_master;
+       struct bts_struct ts;
+
+       /* Prevent compilers from reading the tracer pointer twice. */
+       barrier();
+
+       if (!tracer || !(tracer->flags & BTS_TIMESTAMPS))
+               return;
+
+       memset(&ts, 0, sizeof(ts));
+       ts.qualifier            = qualifier;
+       ts.variant.event.clock  = trace_clock_global();
+       ts.variant.event.pid    = task->pid;
+
+       bts_write(tracer, &ts);
+}
+
  /*
   * Change the DS configuration from tracing prev to tracing next.
   */
  void ds_switch_to(struct task_struct *prev, struct task_struct *next)
  {
-       struct ds_context *prev_ctx = prev->thread.ds_ctx;
-       struct ds_context *next_ctx = next->thread.ds_ctx;
+       struct ds_context *prev_ctx     = prev->thread.ds_ctx;
+       struct ds_context *next_ctx     = next->thread.ds_ctx;
+       unsigned long debugctlmsr       = next->thread.debugctlmsr;
+
+       /* Make sure all data is read before we start. */
+       barrier();
  
         if (prev_ctx) {
                 update_debugctlmsr(0);
  
-               if (prev_ctx->bts_master &&
-                   (prev_ctx->bts_master->trace.ds.flags & BTS_TIMESTAMPS)) {
-                       struct bts_struct ts = {
-                               .qualifier = bts_task_departs,
-                               .variant.timestamp.jiffies = jiffies_64,
-                               .variant.timestamp.pid = prev->pid
-                       };
-                       bts_write(prev_ctx->bts_master, &ts);
-               }
+               ds_take_timestamp(prev_ctx, bts_task_departs, prev);
         }
  
         if (next_ctx) {
-               if (next_ctx->bts_master &&
-                   (next_ctx->bts_master->trace.ds.flags & BTS_TIMESTAMPS)) {
-                       struct bts_struct ts = {
-                               .qualifier = bts_task_arrives,
-                               .variant.timestamp.jiffies = jiffies_64,
-                               .variant.timestamp.pid = next->pid
-                       };
-                       bts_write(next_ctx->bts_master, &ts);
-               }
+               ds_take_timestamp(next_ctx, bts_task_arrives, next);
  
                 wrmsrl(MSR_IA32_DS_AREA, (unsigned long)next_ctx->ds);
         }
  
-       update_debugctlmsr(next->thread.debugctlmsr);
+       update_debugctlmsr(debugctlmsr);
  }
  
-void ds_copy_thread(struct task_struct *tsk, struct task_struct *father)
+static __init int ds_selftest(void)
  {
-       clear_tsk_thread_flag(tsk, TIF_DS_AREA_MSR);
-       tsk->thread.ds_ctx = NULL;
-}
+       if (ds_cfg.sizeof_rec[ds_bts]) {
+               int error;
  
-void ds_exit_thread(struct task_struct *tsk)
-{
+               error = ds_selftest_bts();
+               if (error) {
+                       WARN(1, "[ds] selftest failed. disabling bts.\n");
+                       ds_cfg.sizeof_rec[ds_bts] = 0;
+               }
+       }
+
+       if (ds_cfg.sizeof_rec[ds_pebs]) {
+               int error;
+
+               error = ds_selftest_pebs();
+               if (error) {
+                       WARN(1, "[ds] selftest failed. disabling pebs.\n");
+                       ds_cfg.sizeof_rec[ds_pebs] = 0;
+               }
+       }
+
+       return 0;
  }
+device_initcall(ds_selftest);
diff --git a/arch/x86/kernel/ds_selftest.c b/arch/x86/kernel/ds_selftest.c

new file mode 100644 (file)

index 0000000..6bc7c19
--- /dev/null
+++ b/arch/x86/kernel/ds_selftest.c
@@ -0,0 +1,408 @@
+/*
+ * Debug Store support - selftest
+ *
+ *
+ * Copyright (C) 2009 Intel Corporation.
+ * Markus Metzger <markus.t.metzger@intel.com>, 2009
+ */
+
+#include "ds_selftest.h"
+
+#include <linux/kernel.h>
+#include <linux/string.h>
+#include <linux/smp.h>
+#include <linux/cpu.h>
+
+#include <asm/ds.h>
+
+
+#define BUFFER_SIZE            521     /* Intentionally chose an odd size. */
+#define SMALL_BUFFER_SIZE      24      /* A single bts entry. */
+
+struct ds_selftest_bts_conf {
+       struct bts_tracer *tracer;
+       int error;
+       int (*suspend)(struct bts_tracer *);
+       int (*resume)(struct bts_tracer *);
+};
+
+static int ds_selftest_bts_consistency(const struct bts_trace *trace)
+{
+       int error = 0;
+
+       if (!trace) {
+               printk(KERN_CONT "failed to access trace...");
+               /* Bail out. Other tests are pointless. */
+               return -1;
+       }
+
+       if (!trace->read) {
+               printk(KERN_CONT "bts read not available...");
+               error = -1;
+       }
+
+       /* Do some sanity checks on the trace configuration. */
+       if (!trace->ds.n) {
+               printk(KERN_CONT "empty bts buffer...");
+               error = -1;
+       }
+       if (!trace->ds.size) {
+               printk(KERN_CONT "bad bts trace setup...");
+               error = -1;
+       }
+       if (trace->ds.end !=
+           (char *)trace->ds.begin + (trace->ds.n * trace->ds.size)) {
+               printk(KERN_CONT "bad bts buffer setup...");
+               error = -1;
+       }
+       /*
+        * We allow top in [begin; end], since its not clear when the
+        * overflow adjustment happens: after the increment or before the
+        * write.
+        */
+       if ((trace->ds.top < trace->ds.begin) ||
+           (trace->ds.end < trace->ds.top)) {
+               printk(KERN_CONT "bts top out of bounds...");
+               error = -1;
+       }
+
+       return error;
+}
+
+static int ds_selftest_bts_read(struct bts_tracer *tracer,
+                               const struct bts_trace *trace,
+                               const void *from, const void *to)
+{
+       const unsigned char *at;
+
+       /*
+        * Check a few things which do not belong to this test.
+        * They should be covered by other tests.
+        */
+       if (!trace)
+               return -1;
+
+       if (!trace->read)
+               return -1;
+
+       if (to < from)
+               return -1;
+
+       if (from < trace->ds.begin)
+               return -1;
+
+       if (trace->ds.end < to)
+               return -1;
+
+       if (!trace->ds.size)
+               return -1;
+
+       /* Now to the test itself. */
+       for (at = from; (void *)at < to; at += trace->ds.size) {
+               struct bts_struct bts;
+               unsigned long index;
+               int error;
+
+               if (((void *)at - trace->ds.begin) % trace->ds.size) {
+                       printk(KERN_CONT
+                              "read from non-integer index...");
+                       return -1;
+               }
+               index = ((void *)at - trace->ds.begin) / trace->ds.size;
+
+               memset(&bts, 0, sizeof(bts));
+               error = trace->read(tracer, at, &bts);
+               if (error < 0) {
+                       printk(KERN_CONT
+                              "error reading bts trace at [%lu] (0x%p)...",
+                              index, at);
+                       return error;
+               }
+
+               switch (bts.qualifier) {
+               case BTS_BRANCH:
+                       break;
+               default:
+                       printk(KERN_CONT
+                              "unexpected bts entry %llu at [%lu] (0x%p)...",
+                              bts.qualifier, index, at);
+                       return -1;
+               }
+       }
+
+       return 0;
+}
+
+static void ds_selftest_bts_cpu(void *arg)
+{
+       struct ds_selftest_bts_conf *conf = arg;
+       const struct bts_trace *trace;
+       void *top;
+
+       if (IS_ERR(conf->tracer)) {
+               conf->error = PTR_ERR(conf->tracer);
+               conf->tracer = NULL;
+
+               printk(KERN_CONT
+                      "initialization failed (err: %d)...", conf->error);
+               return;
+       }
+
+       /* We should meanwhile have enough trace. */
+       conf->error = conf->suspend(conf->tracer);
+       if (conf->error < 0)
+               return;
+
+       /* Let's see if we can access the trace. */
+       trace = ds_read_bts(conf->tracer);
+
+       conf->error = ds_selftest_bts_consistency(trace);
+       if (conf->error < 0)
+               return;
+
+       /* If everything went well, we should have a few trace entries. */
+       if (trace->ds.top == trace->ds.begin) {
+               /*
+                * It is possible but highly unlikely that we got a
+                * buffer overflow and end up at exactly the same
+                * position we started from.
+                * Let's issue a warning, but continue.
+                */
+               printk(KERN_CONT "no trace/overflow...");
+       }
+
+       /* Let's try to read the trace we collected. */
+       conf->error =
+               ds_selftest_bts_read(conf->tracer, trace,
+                                    trace->ds.begin, trace->ds.top);
+       if (conf->error < 0)
+               return;
+
+       /*
+        * Let's read the trace again.
+        * Since we suspended tracing, we should get the same result.
+        */
+       top = trace->ds.top;
+
+       trace = ds_read_bts(conf->tracer);
+       conf->error = ds_selftest_bts_consistency(trace);
+       if (conf->error < 0)
+               return;
+
+       if (top != trace->ds.top) {
+               printk(KERN_CONT "suspend not working...");
+               conf->error = -1;
+               return;
+       }
+
+       /* Let's collect some more trace - see if resume is working. */
+       conf->error = conf->resume(conf->tracer);
+       if (conf->error < 0)
+               return;
+
+       conf->error = conf->suspend(conf->tracer);
+       if (conf->error < 0)
+               return;
+
+       trace = ds_read_bts(conf->tracer);
+
+       conf->error = ds_selftest_bts_consistency(trace);
+       if (conf->error < 0)
+               return;
+
+       if (trace->ds.top == top) {
+               /*
+                * It is possible but highly unlikely that we got a
+                * buffer overflow and end up at exactly the same
+                * position we started from.
+                * Let's issue a warning and check the full trace.
+                */
+               printk(KERN_CONT
+                      "no resume progress/overflow...");
+
+               conf->error =
+                       ds_selftest_bts_read(conf->tracer, trace,
+                                            trace->ds.begin, trace->ds.end);
+       } else if (trace->ds.top < top) {
+               /*
+                * We had a buffer overflow - the entire buffer should
+                * contain trace records.
+                */
+               conf->error =
+                       ds_selftest_bts_read(conf->tracer, trace,
+                                            trace->ds.begin, trace->ds.end);
+       } else {
+               /*
+                * It is quite likely that the buffer did not overflow.
+                * Let's just check the delta trace.
+                */
+               conf->error =
+                       ds_selftest_bts_read(conf->tracer, trace, top,
+                                            trace->ds.top);
+       }
+       if (conf->error < 0)
+               return;
+
+       conf->error = 0;
+}
+
+static int ds_suspend_bts_wrap(struct bts_tracer *tracer)
+{
+       ds_suspend_bts(tracer);
+       return 0;
+}
+
+static int ds_resume_bts_wrap(struct bts_tracer *tracer)
+{
+       ds_resume_bts(tracer);
+       return 0;
+}
+
+static void ds_release_bts_noirq_wrap(void *tracer)
+{
+       (void)ds_release_bts_noirq(tracer);
+}
+
+static int ds_selftest_bts_bad_release_noirq(int cpu,
+                                            struct bts_tracer *tracer)
+{
+       int error = -EPERM;
+
+       /* Try to release the tracer on the wrong cpu. */
+       get_cpu();
+       if (cpu != smp_processor_id()) {
+               error = ds_release_bts_noirq(tracer);
+               if (error != -EPERM)
+                       printk(KERN_CONT "release on wrong cpu...");
+       }
+       put_cpu();
+
+       return error ? 0 : -1;
+}
+
+static int ds_selftest_bts_bad_request_cpu(int cpu, void *buffer)
+{
+       struct bts_tracer *tracer;
+       int error;
+
+       /* Try to request cpu tracing while task tracing is active. */
+       tracer = ds_request_bts_cpu(cpu, buffer, BUFFER_SIZE, NULL,
+                                   (size_t)-1, BTS_KERNEL);
+       error = PTR_ERR(tracer);
+       if (!IS_ERR(tracer)) {
+               ds_release_bts(tracer);
+               error = 0;
+       }
+
+       if (error != -EPERM)
+               printk(KERN_CONT "cpu/task tracing overlap...");
+
+       return error ? 0 : -1;
+}
+
+static int ds_selftest_bts_bad_request_task(void *buffer)
+{
+       struct bts_tracer *tracer;
+       int error;
+
+       /* Try to request cpu tracing while task tracing is active. */
+       tracer = ds_request_bts_task(current, buffer, BUFFER_SIZE, NULL,
+                                   (size_t)-1, BTS_KERNEL);
+       error = PTR_ERR(tracer);
+       if (!IS_ERR(tracer)) {
+               error = 0;
+               ds_release_bts(tracer);
+       }
+
+       if (error != -EPERM)
+               printk(KERN_CONT "task/cpu tracing overlap...");
+
+       return error ? 0 : -1;
+}
+
+int ds_selftest_bts(void)
+{
+       struct ds_selftest_bts_conf conf;
+       unsigned char buffer[BUFFER_SIZE], *small_buffer;
+       unsigned long irq;
+       int cpu;
+
+       printk(KERN_INFO "[ds] bts selftest...");
+       conf.error = 0;
+
+       small_buffer = (unsigned char *)ALIGN((unsigned long)buffer, 8) + 8;
+
+       get_online_cpus();
+       for_each_online_cpu(cpu) {
+               conf.suspend = ds_suspend_bts_wrap;
+               conf.resume = ds_resume_bts_wrap;
+               conf.tracer =
+                       ds_request_bts_cpu(cpu, buffer, BUFFER_SIZE,
+                                          NULL, (size_t)-1, BTS_KERNEL);
+               ds_selftest_bts_cpu(&conf);
+               if (conf.error >= 0)
+                       conf.error = ds_selftest_bts_bad_request_task(buffer);
+               ds_release_bts(conf.tracer);
+               if (conf.error < 0)
+                       goto out;
+
+               conf.suspend = ds_suspend_bts_noirq;
+               conf.resume = ds_resume_bts_noirq;
+               conf.tracer =
+                       ds_request_bts_cpu(cpu, buffer, BUFFER_SIZE,
+                                          NULL, (size_t)-1, BTS_KERNEL);
+               smp_call_function_single(cpu, ds_selftest_bts_cpu, &conf, 1);
+               if (conf.error >= 0) {
+                       conf.error =
+                               ds_selftest_bts_bad_release_noirq(cpu,
+                                                                 conf.tracer);
+                       /* We must not release the tracer twice. */
+                       if (conf.error < 0)
+                               conf.tracer = NULL;
+               }
+               if (conf.error >= 0)
+                       conf.error = ds_selftest_bts_bad_request_task(buffer);
+               smp_call_function_single(cpu, ds_release_bts_noirq_wrap,
+                                        conf.tracer, 1);
+               if (conf.error < 0)
+                       goto out;
+       }
+
+       conf.suspend = ds_suspend_bts_wrap;
+       conf.resume = ds_resume_bts_wrap;
+       conf.tracer =
+               ds_request_bts_task(current, buffer, BUFFER_SIZE,
+                                   NULL, (size_t)-1, BTS_KERNEL);
+       ds_selftest_bts_cpu(&conf);
+       if (conf.error >= 0)
+               conf.error = ds_selftest_bts_bad_request_cpu(0, buffer);
+       ds_release_bts(conf.tracer);
+       if (conf.error < 0)
+               goto out;
+
+       conf.suspend = ds_suspend_bts_noirq;
+       conf.resume = ds_resume_bts_noirq;
+       conf.tracer =
+               ds_request_bts_task(current, small_buffer, SMALL_BUFFER_SIZE,
+                                  NULL, (size_t)-1, BTS_KERNEL);
+       local_irq_save(irq);
+       ds_selftest_bts_cpu(&conf);
+       if (conf.error >= 0)
+               conf.error = ds_selftest_bts_bad_request_cpu(0, buffer);
+       ds_release_bts_noirq(conf.tracer);
+       local_irq_restore(irq);
+       if (conf.error < 0)
+               goto out;
+
+       conf.error = 0;
+ out:
+       put_online_cpus();
+       printk(KERN_CONT "%s.\n", (conf.error ? "failed" : "passed"));
+
+       return conf.error;
+}
+
+int ds_selftest_pebs(void)
+{
+       return 0;
+}
diff --git a/arch/x86/kernel/ds_selftest.h b/arch/x86/kernel/ds_selftest.h

new file mode 100644 (file)

index 0000000..2ba8745
--- /dev/null
+++ b/arch/x86/kernel/ds_selftest.h
@@ -0,0 +1,15 @@
+/*
+ * Debug Store support - selftest
+ *
+ *
+ * Copyright (C) 2009 Intel Corporation.
+ * Markus Metzger <markus.t.metzger@intel.com>, 2009
+ */
+
+#ifdef CONFIG_X86_DS_SELFTEST
+extern int ds_selftest_bts(void);
+extern int ds_selftest_pebs(void);
+#else
+static inline int ds_selftest_bts(void) { return 0; }
+static inline int ds_selftest_pebs(void) { return 0; }
+#endif
diff --git a/arch/x86/kernel/dumpstack.h b/arch/x86/kernel/dumpstack.h

index da87590b8698a7c4642ad8e46febe2bf91972599..81086c227ab7cafe28c0c608f1b3eb2bce72b1c0 100644 (file)
--- a/arch/x86/kernel/dumpstack.h
+++ b/arch/x86/kernel/dumpstack.h
@@ -29,7 +29,6 @@ show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs,
                 unsigned long *sp, unsigned long bp, char *log_lvl);
  
  extern unsigned int code_bytes;
-extern int kstack_depth_to_print;
  
  /* The form of the top of the frame on the stack */
  struct stack_frame {
diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c

index 0062813029256a2379d4dc66c42741439227210e..7271fa33d79135edd790f854d7c0c91d05d0c20b 100644 (file)
--- a/arch/x86/kernel/e820.c
+++ b/arch/x86/kernel/e820.c
@@ -617,7 +617,7 @@ __init int e820_search_gap(unsigned long *gapstart, unsigned long *gapsize,
   */
  __init void e820_setup_gap(void)
  {
-       unsigned long gapstart, gapsize, round;
+       unsigned long gapstart, gapsize;
         int found;
  
         gapstart = 0x10000000;
@@ -635,14 +635,9 @@ __init void e820_setup_gap(void)
  #endif
  
         /*
-        * See how much we want to round up: start off with
-        * rounding to the next 1MB area.
+        * e820_reserve_resources_late protect stolen RAM already
          */
-       round = 0x100000;
-       while ((gapsize >> 4) > round)
-               round += round;
-       /* Fun with two's complement */
-       pci_mem_start = (gapstart + round) & -round;
+       pci_mem_start = gapstart;
  
         printk(KERN_INFO
                "Allocating PCI resources starting at %lx (gap: %lx:%lx)\n",
@@ -1371,6 +1366,23 @@ void __init e820_reserve_resources(void)
         }
  }
  
+/* How much should we pad RAM ending depending on where it is? */
+static unsigned long ram_alignment(resource_size_t pos)
+{
+       unsigned long mb = pos >> 20;
+
+       /* To 64kB in the first megabyte */
+       if (!mb)
+               return 64*1024;
+
+       /* To 1MB in the first 16MB */
+       if (mb < 16)
+               return 1024*1024;
+
+       /* To 32MB for anything above that */
+       return 32*1024*1024;
+}
+
  void __init e820_reserve_resources_late(void)
  {
         int i;
@@ -1382,6 +1394,24 @@ void __init e820_reserve_resources_late(void)
                         insert_resource_expand_to_fit(&iomem_resource, res);
                 res++;
         }
+
+       /*
+        * Try to bump up RAM regions to reasonable boundaries to
+        * avoid stolen RAM:
+        */
+       for (i = 0; i < e820.nr_map; i++) {
+               struct e820entry *entry = &e820_saved.map[i];
+               resource_size_t start, end;
+
+               if (entry->type != E820_RAM)
+                       continue;
+               start = entry->addr + entry->size;
+               end = round_up(start, ram_alignment(start));
+               if (start == end)
+                       continue;
+               reserve_region_with_split(&iomem_resource, start,
+                                                 end - 1, "RAM buffer");
+       }
  }
  
  char *__init default_machine_specific_memory_setup(void)
diff --git a/arch/x86/kernel/early-quirks.c b/arch/x86/kernel/early-quirks.c

index 76b8cd953deed9f8a50d572cdc52b5edb68bc3b7..ebdb85cf2686fa36702cd4d50b657f22de85b3bd 100644 (file)
--- a/arch/x86/kernel/early-quirks.c
+++ b/arch/x86/kernel/early-quirks.c
@@ -96,6 +96,7 @@ static void __init nvidia_bugs(int num, int slot, int func)
  
  }
  
+#if defined(CONFIG_ACPI) && defined(CONFIG_X86_IO_APIC)
  #if defined(CONFIG_ACPI) && defined(CONFIG_X86_IO_APIC)
  static u32 __init ati_ixp4x0_rev(int num, int slot, int func)
  {
@@ -114,6 +115,7 @@ static u32 __init ati_ixp4x0_rev(int num, int slot, int func)
         d &= 0xff;
         return d;
  }
+#endif
  
  static void __init ati_bugs(int num, int slot, int func)
  {
diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S

index 38946c6e843388b33fc165779bd7c0aec8e260d7..a4742a340d8d146e71fe8ee684c462b8215839ab 100644 (file)
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -147,27 +147,14 @@ END(ftrace_graph_caller)
  GLOBAL(return_to_handler)
         subq  $80, %rsp
  
+       /* Save the return values */
         movq %rax, (%rsp)
-       movq %rcx, 8(%rsp)
-       movq %rdx, 16(%rsp)
-       movq %rsi, 24(%rsp)
-       movq %rdi, 32(%rsp)
-       movq %r8, 40(%rsp)
-       movq %r9, 48(%rsp)
-       movq %r10, 56(%rsp)
-       movq %r11, 64(%rsp)
+       movq %rdx, 8(%rsp)
  
         call ftrace_return_to_handler
  
         movq %rax, 72(%rsp)
-       movq 64(%rsp), %r11
-       movq 56(%rsp), %r10
-       movq 48(%rsp), %r9
-       movq 40(%rsp), %r8
-       movq 32(%rsp), %rdi
-       movq 24(%rsp), %rsi
-       movq 16(%rsp), %rdx
-       movq 8(%rsp), %rcx
+       movq 8(%rsp), %rdx
         movq (%rsp), %rax
         addq $72, %rsp
         retq
@@ -1025,6 +1012,11 @@ apicinterrupt ERROR_APIC_VECTOR \
  apicinterrupt SPURIOUS_APIC_VECTOR \
         spurious_interrupt smp_spurious_interrupt
  
+#ifdef CONFIG_PERF_COUNTERS
+apicinterrupt LOCAL_PENDING_VECTOR \
+       perf_pending_interrupt smp_perf_pending_interrupt
+#endif
+
  /*
   * Exception entry points.
   */
@@ -1379,6 +1371,11 @@ END(xen_failsafe_callback)
  paranoidzeroentry_ist debug do_debug DEBUG_STACK
  paranoidzeroentry_ist int3 do_int3 DEBUG_STACK
  paranoiderrorentry stack_segment do_stack_segment
+#ifdef CONFIG_XEN
+zeroentry xen_debug do_debug
+zeroentry xen_int3 do_int3
+errorentry xen_stack_segment do_stack_segment
+#endif
  errorentry general_protection do_general_protection
  errorentry page_fault do_page_fault
  #ifdef CONFIG_X86_MCE
diff --git a/arch/x86/kernel/head_32.S b/arch/x86/kernel/head_32.S

index 30683883e0cdcf0bd669cb439206fefaf74301f2..dc5ed4bdd88d36317ba5fc87554d10156a7f9132 100644 (file)
--- a/arch/x86/kernel/head_32.S
+++ b/arch/x86/kernel/head_32.S
@@ -608,13 +608,6 @@ ignore_int:
  ENTRY(initial_code)
         .long i386_start_kernel
  
-.section .text
-/*
- * Real beginning of normal "text" segment
- */
-ENTRY(stext)
-ENTRY(_stext)
-
  /*
   * BSS section
   */
diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c

index c3fe010d74c8e8823793cd9ff816ee02adc8ebde..38287b5f116e10e4e1bba96cac3d0c30bf842890 100644 (file)
--- a/arch/x86/kernel/irq.c
+++ b/arch/x86/kernel/irq.c
@@ -12,6 +12,7 @@
  #include <asm/io_apic.h>
  #include <asm/irq.h>
  #include <asm/idle.h>
+#include <asm/hw_irq.h>
  
  atomic_t irq_err_count;
  
@@ -24,9 +25,9 @@ void (*generic_interrupt_extension)(void) = NULL;
   */
  void ack_bad_irq(unsigned int irq)
  {
-       printk(KERN_ERR "unexpected IRQ trap at vector %02x\n", irq);
+       if (printk_ratelimit())
+               pr_err("unexpected IRQ trap at vector %02x\n", irq);
  
-#ifdef CONFIG_X86_LOCAL_APIC
         /*
          * Currently unexpected vectors happen only on SMP and APIC.
          * We _must_ ack these because every local APIC has only N
@@ -36,9 +37,7 @@ void ack_bad_irq(unsigned int irq)
          * completely.
          * But only ack when the APIC is enabled -AK
          */
-       if (cpu_has_apic)
-               ack_APIC_irq();
-#endif
+       ack_APIC_irq();
  }
  
  #define irq_stats(x)           (&per_cpu(irq_stat, x))
@@ -63,6 +62,14 @@ static int show_other_interrupts(struct seq_file *p, int prec)
         for_each_online_cpu(j)
                 seq_printf(p, "%10u ", irq_stats(j)->irq_spurious_count);
         seq_printf(p, "  Spurious interrupts\n");
+       seq_printf(p, "%*s: ", prec, "CNT");
+       for_each_online_cpu(j)
+               seq_printf(p, "%10u ", irq_stats(j)->apic_perf_irqs);
+       seq_printf(p, "  Performance counter interrupts\n");
+       seq_printf(p, "%*s: ", prec, "PND");
+       for_each_online_cpu(j)
+               seq_printf(p, "%10u ", irq_stats(j)->apic_pending_irqs);
+       seq_printf(p, "  Performance pending work\n");
  #endif
         if (generic_interrupt_extension) {
                 seq_printf(p, "%*s: ", prec, "PLT");
@@ -166,6 +173,8 @@ u64 arch_irq_stat_cpu(unsigned int cpu)
  #ifdef CONFIG_X86_LOCAL_APIC
         sum += irq_stats(cpu)->apic_timer_irqs;
         sum += irq_stats(cpu)->irq_spurious_count;
+       sum += irq_stats(cpu)->apic_perf_irqs;
+       sum += irq_stats(cpu)->apic_pending_irqs;
  #endif
         if (generic_interrupt_extension)
                 sum += irq_stats(cpu)->generic_irqs;
@@ -178,7 +187,7 @@ u64 arch_irq_stat_cpu(unsigned int cpu)
         sum += irq_stats(cpu)->irq_thermal_count;
  # ifdef CONFIG_X86_64
         sum += irq_stats(cpu)->irq_threshold_count;
-#endif
+# endif
  #endif
         return sum;
  }
@@ -213,14 +222,11 @@ unsigned int __irq_entry do_IRQ(struct pt_regs *regs)
         irq = __get_cpu_var(vector_irq)[vector];
  
         if (!handle_irq(irq, regs)) {
-#ifdef CONFIG_X86_64
-               if (!disable_apic)
-                       ack_APIC_irq();
-#endif
+               ack_APIC_irq();
  
                 if (printk_ratelimit())
-                       printk(KERN_EMERG "%s: %d.%d No irq handler for vector (irq %d)\n",
-                              __func__, smp_processor_id(), vector, irq);
+                       pr_emerg("%s: %d.%d No irq handler for vector (irq %d)\n",
+                               __func__, smp_processor_id(), vector, irq);
         }
  
         irq_exit();
diff --git a/arch/x86/kernel/irqinit_32.c b/arch/x86/kernel/irqinit.c

similarity index 68%

rename from arch/x86/kernel/irqinit_32.c

rename to arch/x86/kernel/irqinit.c

index 368b0a8836f902be69531754a72fb75e7b9a4e24..267c6624c77f53dab8be37e59fa97ebd2315d1c5 100644 (file)
--- a/arch/x86/kernel/irqinit_32.c
+++ b/arch/x86/kernel/irqinit.c
@@ -1,20 +1,25 @@
+#include <linux/linkage.h>
  #include <linux/errno.h>
  #include <linux/signal.h>
  #include <linux/sched.h>
  #include <linux/ioport.h>
  #include <linux/interrupt.h>
+#include <linux/timex.h>
  #include <linux/slab.h>
  #include <linux/random.h>
+#include <linux/kprobes.h>
  #include <linux/init.h>
  #include <linux/kernel_stat.h>
  #include <linux/sysdev.h>
  #include <linux/bitops.h>
+#include <linux/acpi.h>
  #include <linux/io.h>
  #include <linux/delay.h>
  
  #include <asm/atomic.h>
  #include <asm/system.h>
  #include <asm/timer.h>
+#include <asm/hw_irq.h>
  #include <asm/pgtable.h>
  #include <asm/desc.h>
  #include <asm/apic.h>
@@ -22,7 +27,23 @@
  #include <asm/i8259.h>
  #include <asm/traps.h>
  
+/*
+ * ISA PIC or low IO-APIC triggered (INTA-cycle or APIC) interrupts:
+ * (these are usually mapped to vectors 0x30-0x3f)
+ */
+
+/*
+ * The IO-APIC gives us many more interrupt sources. Most of these
+ * are unused but an SMP system is supposed to have enough memory ...
+ * sometimes (mostly wrt. hw bugs) we get corrupted vectors all
+ * across the spectrum, so we really want to be prepared to get all
+ * of these. Plus, more powerful systems might have more than 64
+ * IO-APIC registers.
+ *
+ * (these are usually mapped into the 0x30-0xff vector range)
+ */
  
+#ifdef CONFIG_X86_32
  /*
   * Note that on a 486, we don't want to do a SIGFPE on an irq13
   * as the irq is unreliable, and exception 16 works correctly
@@ -52,30 +73,7 @@ static struct irqaction fpu_irq = {
         .handler = math_error_irq,
         .name = "fpu",
  };
-
-void __init init_ISA_irqs(void)
-{
-       int i;
-
-#ifdef CONFIG_X86_LOCAL_APIC
-       init_bsp_APIC();
  #endif
-       init_8259A(0);
-
-       /*
-        * 16 old-style INTA-cycle interrupts:
-        */
-       for (i = 0; i < NR_IRQS_LEGACY; i++) {
-               struct irq_desc *desc = irq_to_desc(i);
-
-               desc->status = IRQ_DISABLED;
-               desc->action = NULL;
-               desc->depth = 1;
-
-               set_irq_chip_and_handler_name(i, &i8259A_chip,
-                                             handle_level_irq, "XT");
-       }
-}
  
  /*
   * IRQ2 is cascade interrupt to second interrupt controller
@@ -118,29 +116,37 @@ int vector_used_by_percpu_irq(unsigned int vector)
         return 0;
  }
  
-/* Overridden in paravirt.c */
-void init_IRQ(void) __attribute__((weak, alias("native_init_IRQ")));
-
-void __init native_init_IRQ(void)
+static void __init init_ISA_irqs(void)
  {
         int i;
  
-       /* Execute any quirks before the call gates are initialised: */
-       x86_quirk_pre_intr_init();
+#if defined(CONFIG_X86_64) || defined(CONFIG_X86_LOCAL_APIC)
+       init_bsp_APIC();
+#endif
+       init_8259A(0);
  
         /*
-        * Cover the whole vector space, no vector can escape
-        * us. (some of these will be overridden and become
-        * 'special' SMP interrupts)
+        * 16 old-style INTA-cycle interrupts:
          */
-       for (i =  FIRST_EXTERNAL_VECTOR; i < NR_VECTORS; i++) {
-               /* SYSCALL_VECTOR was reserved in trap_init. */
-               if (i != SYSCALL_VECTOR)
-                       set_intr_gate(i, interrupt[i-FIRST_EXTERNAL_VECTOR]);
+       for (i = 0; i < NR_IRQS_LEGACY; i++) {
+               struct irq_desc *desc = irq_to_desc(i);
+
+               desc->status = IRQ_DISABLED;
+               desc->action = NULL;
+               desc->depth = 1;
+
+               set_irq_chip_and_handler_name(i, &i8259A_chip,
+                                             handle_level_irq, "XT");
         }
+}
  
+/* Overridden in paravirt.c */
+void init_IRQ(void) __attribute__((weak, alias("native_init_IRQ")));
  
-#if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_SMP)
+static void __init smp_intr_init(void)
+{
+#ifdef CONFIG_SMP
+#if defined(CONFIG_X86_64) || defined(CONFIG_X86_LOCAL_APIC)
         /*
          * The reschedule interrupt is a CPU-to-CPU reschedule-helper
          * IPI, driven by wakeup.
@@ -160,16 +166,32 @@ void __init native_init_IRQ(void)
         /* IPI for generic function call */
         alloc_intr_gate(CALL_FUNCTION_VECTOR, call_function_interrupt);
  
-       /* IPI for single call function */
+       /* IPI for generic single function call */
         alloc_intr_gate(CALL_FUNCTION_SINGLE_VECTOR,
-                                call_function_single_interrupt);
+                       call_function_single_interrupt);
  
         /* Low priority IPI to cleanup after moving an irq */
         set_intr_gate(IRQ_MOVE_CLEANUP_VECTOR, irq_move_cleanup_interrupt);
         set_bit(IRQ_MOVE_CLEANUP_VECTOR, used_vectors);
  #endif
+#endif /* CONFIG_SMP */
+}
+
+static void __init apic_intr_init(void)
+{
+       smp_intr_init();
+
+#ifdef CONFIG_X86_THERMAL_VECTOR
+       alloc_intr_gate(THERMAL_APIC_VECTOR, thermal_interrupt);
+#endif
+#ifdef CONFIG_X86_THRESHOLD
+       alloc_intr_gate(THRESHOLD_APIC_VECTOR, threshold_interrupt);
+#endif
+#if defined(CONFIG_X86_NEW_MCE) && defined(CONFIG_X86_LOCAL_APIC)
+       alloc_intr_gate(MCE_SELF_VECTOR, mce_self_interrupt);
+#endif
  
-#ifdef CONFIG_X86_LOCAL_APIC
+#if defined(CONFIG_X86_64) || defined(CONFIG_X86_LOCAL_APIC)
         /* self generated IPI for local APIC timer */
         alloc_intr_gate(LOCAL_TIMER_VECTOR, apic_timer_interrupt);
  
@@ -179,16 +201,59 @@ void __init native_init_IRQ(void)
         /* IPI vectors for APIC spurious and error interrupts */
         alloc_intr_gate(SPURIOUS_APIC_VECTOR, spurious_interrupt);
         alloc_intr_gate(ERROR_APIC_VECTOR, error_interrupt);
+
+       /* Performance monitoring interrupts: */
+# ifdef CONFIG_PERF_COUNTERS
+       alloc_intr_gate(LOCAL_PENDING_VECTOR, perf_pending_interrupt);
+# endif
+
  #endif
+}
  
-#if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86_MCE_P4THERMAL)
-       /* thermal monitor LVT interrupt */
-       alloc_intr_gate(THERMAL_APIC_VECTOR, thermal_interrupt);
+/**
+ * x86_quirk_pre_intr_init - initialisation prior to setting up interrupt vectors
+ *
+ * Description:
+ *     Perform any necessary interrupt initialisation prior to setting up
+ *     the "ordinary" interrupt call gates.  For legacy reasons, the ISA
+ *     interrupts should be initialised here if the machine emulates a PC
+ *     in any way.
+ **/
+static void __init x86_quirk_pre_intr_init(void)
+{
+#ifdef CONFIG_X86_32
+       if (x86_quirks->arch_pre_intr_init) {
+               if (x86_quirks->arch_pre_intr_init())
+                       return;
+       }
  #endif
+       init_ISA_irqs();
+}
+
+void __init native_init_IRQ(void)
+{
+       int i;
+
+       /* Execute any quirks before the call gates are initialised: */
+       x86_quirk_pre_intr_init();
+
+       apic_intr_init();
+
+       /*
+        * Cover the whole vector space, no vector can escape
+        * us. (some of these will be overridden and become
+        * 'special' SMP interrupts)
+        */
+       for (i = FIRST_EXTERNAL_VECTOR; i < NR_VECTORS; i++) {
+               /* IA32_SYSCALL_VECTOR could be used in trap_init already. */
+               if (!test_bit(i, used_vectors))
+                       set_intr_gate(i, interrupt[i-FIRST_EXTERNAL_VECTOR]);
+       }
  
         if (!acpi_ioapic)
                 setup_irq(2, &irq2);
  
+#ifdef CONFIG_X86_32
         /*
          * Call quirks after call gates are initialised (usually add in
          * the architecture specific gates):
@@ -203,4 +268,5 @@ void __init native_init_IRQ(void)
                 setup_irq(FPU_IRQ, &fpu_irq);
  
         irq_ctx_init(smp_processor_id());
+#endif
  }
diff --git a/arch/x86/kernel/irqinit_64.c b/arch/x86/kernel/irqinit_64.c

deleted file mode 100644 (file)

index 8cd1053..0000000
--- a/arch/x86/kernel/irqinit_64.c
+++ /dev/null
@@ -1,177 +0,0 @@
-#include <linux/linkage.h>
-#include <linux/errno.h>
-#include <linux/signal.h>
-#include <linux/sched.h>
-#include <linux/ioport.h>
-#include <linux/interrupt.h>
-#include <linux/timex.h>
-#include <linux/slab.h>
-#include <linux/random.h>
-#include <linux/init.h>
-#include <linux/kernel_stat.h>
-#include <linux/sysdev.h>
-#include <linux/bitops.h>
-#include <linux/acpi.h>
-#include <linux/io.h>
-#include <linux/delay.h>
-
-#include <asm/atomic.h>
-#include <asm/system.h>
-#include <asm/hw_irq.h>
-#include <asm/pgtable.h>
-#include <asm/desc.h>
-#include <asm/apic.h>
-#include <asm/i8259.h>
-
-/*
- * ISA PIC or low IO-APIC triggered (INTA-cycle or APIC) interrupts:
- * (these are usually mapped to vectors 0x30-0x3f)
- */
-
-/*
- * The IO-APIC gives us many more interrupt sources. Most of these
- * are unused but an SMP system is supposed to have enough memory ...
- * sometimes (mostly wrt. hw bugs) we get corrupted vectors all
- * across the spectrum, so we really want to be prepared to get all
- * of these. Plus, more powerful systems might have more than 64
- * IO-APIC registers.
- *
- * (these are usually mapped into the 0x30-0xff vector range)
- */
-
-/*
- * IRQ2 is cascade interrupt to second interrupt controller
- */
-
-static struct irqaction irq2 = {
-       .handler = no_action,
-       .name = "cascade",
-};
-DEFINE_PER_CPU(vector_irq_t, vector_irq) = {
-       [0 ... IRQ0_VECTOR - 1] = -1,
-       [IRQ0_VECTOR] = 0,
-       [IRQ1_VECTOR] = 1,
-       [IRQ2_VECTOR] = 2,
-       [IRQ3_VECTOR] = 3,
-       [IRQ4_VECTOR] = 4,
-       [IRQ5_VECTOR] = 5,
-       [IRQ6_VECTOR] = 6,
-       [IRQ7_VECTOR] = 7,
-       [IRQ8_VECTOR] = 8,
-       [IRQ9_VECTOR] = 9,
-       [IRQ10_VECTOR] = 10,
-       [IRQ11_VECTOR] = 11,
-       [IRQ12_VECTOR] = 12,
-       [IRQ13_VECTOR] = 13,
-       [IRQ14_VECTOR] = 14,
-       [IRQ15_VECTOR] = 15,
-       [IRQ15_VECTOR + 1 ... NR_VECTORS - 1] = -1
-};
-
-int vector_used_by_percpu_irq(unsigned int vector)
-{
-       int cpu;
-
-       for_each_online_cpu(cpu) {
-               if (per_cpu(vector_irq, cpu)[vector] != -1)
-                       return 1;
-       }
-
-       return 0;
-}
-
-static void __init init_ISA_irqs(void)
-{
-       int i;
-
-       init_bsp_APIC();
-       init_8259A(0);
-
-       for (i = 0; i < NR_IRQS_LEGACY; i++) {
-               struct irq_desc *desc = irq_to_desc(i);
-
-               desc->status = IRQ_DISABLED;
-               desc->action = NULL;
-               desc->depth = 1;
-
-               /*
-                * 16 old-style INTA-cycle interrupts:
-                */
-               set_irq_chip_and_handler_name(i, &i8259A_chip,
-                                                     handle_level_irq, "XT");
-       }
-}
-
-void init_IRQ(void) __attribute__((weak, alias("native_init_IRQ")));
-
-static void __init smp_intr_init(void)
-{
-#ifdef CONFIG_SMP
-       /*
-        * The reschedule interrupt is a CPU-to-CPU reschedule-helper
-        * IPI, driven by wakeup.
-        */
-       alloc_intr_gate(RESCHEDULE_VECTOR, reschedule_interrupt);
-
-       /* IPIs for invalidation */
-       alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+0, invalidate_interrupt0);
-       alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+1, invalidate_interrupt1);
-       alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+2, invalidate_interrupt2);
-       alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+3, invalidate_interrupt3);
-       alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+4, invalidate_interrupt4);
-       alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+5, invalidate_interrupt5);
-       alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+6, invalidate_interrupt6);
-       alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+7, invalidate_interrupt7);
-
-       /* IPI for generic function call */
-       alloc_intr_gate(CALL_FUNCTION_VECTOR, call_function_interrupt);
-
-       /* IPI for generic single function call */
-       alloc_intr_gate(CALL_FUNCTION_SINGLE_VECTOR,
-                       call_function_single_interrupt);
-
-       /* Low priority IPI to cleanup after moving an irq */
-       set_intr_gate(IRQ_MOVE_CLEANUP_VECTOR, irq_move_cleanup_interrupt);
-       set_bit(IRQ_MOVE_CLEANUP_VECTOR, used_vectors);
-#endif
-}
-
-static void __init apic_intr_init(void)
-{
-       smp_intr_init();
-
-       alloc_intr_gate(THERMAL_APIC_VECTOR, thermal_interrupt);
-       alloc_intr_gate(THRESHOLD_APIC_VECTOR, threshold_interrupt);
-
-       /* self generated IPI for local APIC timer */
-       alloc_intr_gate(LOCAL_TIMER_VECTOR, apic_timer_interrupt);
-
-       /* generic IPI for platform specific use */
-       alloc_intr_gate(GENERIC_INTERRUPT_VECTOR, generic_interrupt);
-
-       /* IPI vectors for APIC spurious and error interrupts */
-       alloc_intr_gate(SPURIOUS_APIC_VECTOR, spurious_interrupt);
-       alloc_intr_gate(ERROR_APIC_VECTOR, error_interrupt);
-}
-
-void __init native_init_IRQ(void)
-{
-       int i;
-
-       init_ISA_irqs();
-       /*
-        * Cover the whole vector space, no vector can escape
-        * us. (some of these will be overridden and become
-        * 'special' SMP interrupts)
-        */
-       for (i = 0; i < (NR_VECTORS - FIRST_EXTERNAL_VECTOR); i++) {
-               int vector = FIRST_EXTERNAL_VECTOR + i;
-               if (vector != IA32_SYSCALL_VECTOR)
-                       set_intr_gate(vector, interrupt[i]);
-       }
-
-       apic_intr_init();
-
-       if (!acpi_ioapic)
-               setup_irq(2, &irq2);
-}
diff --git a/arch/x86/kernel/kgdb.c b/arch/x86/kernel/kgdb.c

index b1f4dffb919e8c708421cb8c29af80ad50d6a7c9..8d82a77a3f3b96ea3c0dc37e91551dddc7e10b51 100644 (file)
--- a/arch/x86/kernel/kgdb.c
+++ b/arch/x86/kernel/kgdb.c
@@ -142,7 +142,7 @@ void sleeping_thread_to_gdb_regs(unsigned long *gdb_regs, struct task_struct *p)
         gdb_regs32[GDB_PS]      = *(unsigned long *)(p->thread.sp + 8);
         gdb_regs32[GDB_CS]      = __KERNEL_CS;
         gdb_regs32[GDB_SS]      = __KERNEL_DS;
-       gdb_regs[GDB_PC]        = p->thread.ip;
+       gdb_regs[GDB_PC]        = 0;
         gdb_regs[GDB_R8]        = 0;
         gdb_regs[GDB_R9]        = 0;
         gdb_regs[GDB_R10]       = 0;
diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c

index 33019ddb56b452ce736f4b70fcb5d4efe8d0ae9d..6551dedee20c71a58a8f902bdee88dd0e8c4d7f8 100644 (file)
--- a/arch/x86/kernel/kvm.c
+++ b/arch/x86/kernel/kvm.c
@@ -195,7 +195,7 @@ static void kvm_leave_lazy_mmu(void)
         struct kvm_para_state *state = kvm_para_state();
  
         mmu_queue_flush(state);
-       paravirt_leave_lazy(paravirt_get_lazy_mode());
+       paravirt_leave_lazy_mmu();
         state->mode = paravirt_get_lazy_mode();
  }
  
diff --git a/arch/x86/kernel/microcode_amd.c b/arch/x86/kernel/microcode_amd.c

index 453b5795a5c6a23817f8e833f722d6140c6be644..366baa179913dfc3e3e865480e82008a9ab5071a 100644 (file)
--- a/arch/x86/kernel/microcode_amd.c
+++ b/arch/x86/kernel/microcode_amd.c
@@ -13,25 +13,13 @@
   *  Licensed under the terms of the GNU General Public
   *  License version 2. See file COPYING for details.
   */
-#include <linux/platform_device.h>
-#include <linux/capability.h>
-#include <linux/miscdevice.h>
  #include <linux/firmware.h>
-#include <linux/spinlock.h>
-#include <linux/cpumask.h>
  #include <linux/pci_ids.h>
  #include <linux/uaccess.h>
  #include <linux/vmalloc.h>
  #include <linux/kernel.h>
  #include <linux/module.h>
-#include <linux/mutex.h>
-#include <linux/sched.h>
-#include <linux/init.h>
-#include <linux/slab.h>
-#include <linux/cpu.h>
  #include <linux/pci.h>
-#include <linux/fs.h>
-#include <linux/mm.h>
  
  #include <asm/microcode.h>
  #include <asm/processor.h>
@@ -79,9 +67,6 @@ struct microcode_amd {
  #define UCODE_CONTAINER_SECTION_HDR    8
  #define UCODE_CONTAINER_HEADER_SIZE    12
  
-/* serialize access to the physical write */
-static DEFINE_SPINLOCK(microcode_update_lock);
-
  static struct equiv_cpu_entry *equiv_cpu_table;
  
  static int collect_cpu_info_amd(int cpu, struct cpu_signature *csig)
@@ -144,9 +129,8 @@ static int get_matching_microcode(int cpu, void *mc, int rev)
         return 1;
  }
  
-static void apply_microcode_amd(int cpu)
+static int apply_microcode_amd(int cpu)
  {
-       unsigned long flags;
         u32 rev, dummy;
         int cpu_num = raw_smp_processor_id();
         struct ucode_cpu_info *uci = ucode_cpu_info + cpu_num;
@@ -156,25 +140,25 @@ static void apply_microcode_amd(int cpu)
         BUG_ON(cpu_num != cpu);
  
         if (mc_amd == NULL)
-               return;
+               return 0;
  
-       spin_lock_irqsave(&microcode_update_lock, flags);
         wrmsrl(MSR_AMD64_PATCH_LOADER, (u64)(long)&mc_amd->hdr.data_code);
         /* get patch id after patching */
         rdmsr(MSR_AMD64_PATCH_LEVEL, rev, dummy);
-       spin_unlock_irqrestore(&microcode_update_lock, flags);
  
         /* check current patch id and patch's id for match */
         if (rev != mc_amd->hdr.patch_id) {
                 printk(KERN_ERR "microcode: CPU%d: update failed "
                        "(for patch_level=0x%x)\n", cpu, mc_amd->hdr.patch_id);
-               return;
+               return -1;
         }
  
         printk(KERN_INFO "microcode: CPU%d: updated (new patch_level=0x%x)\n",
                cpu, rev);
  
         uci->cpu_sig.rev = rev;
+
+       return 0;
  }
  
  static int get_ucode_data(void *to, const u8 *from, size_t n)
@@ -257,13 +241,12 @@ static int install_equiv_cpu_table(const u8 *buf)
  
  static void free_equiv_cpu_table(void)
  {
-       if (equiv_cpu_table) {
-               vfree(equiv_cpu_table);
-               equiv_cpu_table = NULL;
-       }
+       vfree(equiv_cpu_table);
+       equiv_cpu_table = NULL;
  }
  
-static int generic_load_microcode(int cpu, const u8 *data, size_t size)
+static enum ucode_state
+generic_load_microcode(int cpu, const u8 *data, size_t size)
  {
         struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
         const u8 *ucode_ptr = data;
@@ -272,12 +255,13 @@ static int generic_load_microcode(int cpu, const u8 *data, size_t size)
         int new_rev = uci->cpu_sig.rev;
         unsigned int leftover;
         unsigned long offset;
+       enum ucode_state state = UCODE_OK;
  
         offset = install_equiv_cpu_table(ucode_ptr);
         if (!offset) {
                 printk(KERN_ERR "microcode: failed to create "
                        "equivalent cpu table\n");
-               return -EINVAL;
+               return UCODE_ERROR;
         }
  
         ucode_ptr += offset;
@@ -293,8 +277,7 @@ static int generic_load_microcode(int cpu, const u8 *data, size_t size)
  
                 mc_header = (struct microcode_header_amd *)mc;
                 if (get_matching_microcode(cpu, mc, new_rev)) {
-                       if (new_mc)
-                               vfree(new_mc);
+                       vfree(new_mc);
                         new_rev = mc_header->patch_id;
                         new_mc  = mc;
                 } else
@@ -306,34 +289,32 @@ static int generic_load_microcode(int cpu, const u8 *data, size_t size)
  
         if (new_mc) {
                 if (!leftover) {
-                       if (uci->mc)
-                               vfree(uci->mc);
+                       vfree(uci->mc);
                         uci->mc = new_mc;
                         pr_debug("microcode: CPU%d found a matching microcode "
                                  "update with version 0x%x (current=0x%x)\n",
                                  cpu, new_rev, uci->cpu_sig.rev);
-               } else
+               } else {
                         vfree(new_mc);
-       }
+                       state = UCODE_ERROR;
+               }
+       } else
+               state = UCODE_NFOUND;
  
         free_equiv_cpu_table();
  
-       return (int)leftover;
+       return state;
  }
  
-static int request_microcode_fw(int cpu, struct device *device)
+static enum ucode_state request_microcode_fw(int cpu, struct device *device)
  {
         const char *fw_name = "amd-ucode/microcode_amd.bin";
         const struct firmware *firmware;
-       int ret;
-
-       /* We should bind the task to the CPU */
-       BUG_ON(cpu != raw_smp_processor_id());
+       enum ucode_state ret;
  
-       ret = request_firmware(&firmware, fw_name, device);
-       if (ret) {
+       if (request_firmware(&firmware, fw_name, device)) {
                 printk(KERN_ERR "microcode: failed to load file %s\n", fw_name);
-               return ret;
+               return UCODE_NFOUND;
         }
  
         ret = generic_load_microcode(cpu, firmware->data, firmware->size);
@@ -343,11 +324,12 @@ static int request_microcode_fw(int cpu, struct device *device)
         return ret;
  }
  
-static int request_microcode_user(int cpu, const void __user *buf, size_t size)
+static enum ucode_state
+request_microcode_user(int cpu, const void __user *buf, size_t size)
  {
         printk(KERN_INFO "microcode: AMD microcode update via "
                "/dev/cpu/microcode not supported\n");
-       return -1;
+       return UCODE_ERROR;
  }
  
  static void microcode_fini_cpu_amd(int cpu)
diff --git a/arch/x86/kernel/microcode_core.c b/arch/x86/kernel/microcode_core.c

index 98c470c069d150f26af3125a7690a25689614362..9c4461501fcbb9618ac3ce12fef93f990b8f3955 100644 (file)
--- a/arch/x86/kernel/microcode_core.c
+++ b/arch/x86/kernel/microcode_core.c
@@ -71,27 +71,18 @@
   *             Thanks to Stuart Swales for pointing out this bug.
   */
  #include <linux/platform_device.h>
-#include <linux/capability.h>
  #include <linux/miscdevice.h>
-#include <linux/firmware.h>
+#include <linux/capability.h>
  #include <linux/smp_lock.h>
-#include <linux/spinlock.h>
-#include <linux/cpumask.h>
-#include <linux/uaccess.h>
-#include <linux/vmalloc.h>
  #include <linux/kernel.h>
  #include <linux/module.h>
  #include <linux/mutex.h>
-#include <linux/sched.h>
-#include <linux/init.h>
-#include <linux/slab.h>
  #include <linux/cpu.h>
  #include <linux/fs.h>
  #include <linux/mm.h>
  
  #include <asm/microcode.h>
  #include <asm/processor.h>
-#include <asm/msr.h>
  
  MODULE_DESCRIPTION("Microcode Update Driver");
  MODULE_AUTHOR("Tigran Aivazian <tigran@aivazian.fsnet.co.uk>");
@@ -101,36 +92,110 @@ MODULE_LICENSE("GPL");
  
  static struct microcode_ops    *microcode_ops;
  
-/* no concurrent ->write()s are allowed on /dev/cpu/microcode */
+/*
+ * Synchronization.
+ *
+ * All non cpu-hotplug-callback call sites use:
+ *
+ * - microcode_mutex to synchronize with each other;
+ * - get/put_online_cpus() to synchronize with
+ *   the cpu-hotplug-callback call sites.
+ *
+ * We guarantee that only a single cpu is being
+ * updated at any particular moment of time.
+ */
  static DEFINE_MUTEX(microcode_mutex);
  
  struct ucode_cpu_info          ucode_cpu_info[NR_CPUS];
  EXPORT_SYMBOL_GPL(ucode_cpu_info);
  
+/*
+ * Operations that are run on a target cpu:
+ */
+
+struct cpu_info_ctx {
+       struct cpu_signature    *cpu_sig;
+       int                     err;
+};
+
+static void collect_cpu_info_local(void *arg)
+{
+       struct cpu_info_ctx *ctx = arg;
+
+       ctx->err = microcode_ops->collect_cpu_info(smp_processor_id(),
+                                                  ctx->cpu_sig);
+}
+
+static int collect_cpu_info_on_target(int cpu, struct cpu_signature *cpu_sig)
+{
+       struct cpu_info_ctx ctx = { .cpu_sig = cpu_sig, .err = 0 };
+       int ret;
+
+       ret = smp_call_function_single(cpu, collect_cpu_info_local, &ctx, 1);
+       if (!ret)
+               ret = ctx.err;
+
+       return ret;
+}
+
+static int collect_cpu_info(int cpu)
+{
+       struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
+       int ret;
+
+       memset(uci, 0, sizeof(*uci));
+
+       ret = collect_cpu_info_on_target(cpu, &uci->cpu_sig);
+       if (!ret)
+               uci->valid = 1;
+
+       return ret;
+}
+
+struct apply_microcode_ctx {
+       int err;
+};
+
+static void apply_microcode_local(void *arg)
+{
+       struct apply_microcode_ctx *ctx = arg;
+
+       ctx->err = microcode_ops->apply_microcode(smp_processor_id());
+}
+
+static int apply_microcode_on_target(int cpu)
+{
+       struct apply_microcode_ctx ctx = { .err = 0 };
+       int ret;
+
+       ret = smp_call_function_single(cpu, apply_microcode_local, &ctx, 1);
+       if (!ret)
+               ret = ctx.err;
+
+       return ret;
+}
+
  #ifdef CONFIG_MICROCODE_OLD_INTERFACE
  static int do_microcode_update(const void __user *buf, size_t size)
  {
-       cpumask_t old;
         int error = 0;
         int cpu;
  
-       old = current->cpus_allowed;
-
         for_each_online_cpu(cpu) {
                 struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
+               enum ucode_state ustate;
  
                 if (!uci->valid)
                         continue;
  
-               set_cpus_allowed_ptr(current, &cpumask_of_cpu(cpu));
-               error = microcode_ops->request_microcode_user(cpu, buf, size);
-               if (error < 0)
-                       goto out;
-               if (!error)
-                       microcode_ops->apply_microcode(cpu);
+               ustate = microcode_ops->request_microcode_user(cpu, buf, size);
+               if (ustate == UCODE_ERROR) {
+                       error = -1;
+                       break;
+               } else if (ustate == UCODE_OK)
+                       apply_microcode_on_target(cpu);
         }
-out:
-       set_cpus_allowed_ptr(current, &old);
+
         return error;
  }
  
@@ -143,19 +208,17 @@ static int microcode_open(struct inode *unused1, struct file *unused2)
  static ssize_t microcode_write(struct file *file, const char __user *buf,
                                size_t len, loff_t *ppos)
  {
-       ssize_t ret;
+       ssize_t ret = -EINVAL;
  
         if ((len >> PAGE_SHIFT) > num_physpages) {
-               printk(KERN_ERR "microcode: too much data (max %ld pages)\n",
-                      num_physpages);
-               return -EINVAL;
+               pr_err("microcode: too much data (max %ld pages)\n", num_physpages);
+               return ret;
         }
  
         get_online_cpus();
         mutex_lock(&microcode_mutex);
  
-       ret = do_microcode_update(buf, len);
-       if (!ret)
+       if (do_microcode_update(buf, len) == 0)
                 ret = (ssize_t)len;
  
         mutex_unlock(&microcode_mutex);
@@ -165,15 +228,15 @@ static ssize_t microcode_write(struct file *file, const char __user *buf,
  }
  
  static const struct file_operations microcode_fops = {
-       .owner          = THIS_MODULE,
-       .write          = microcode_write,
-       .open           = microcode_open,
+       .owner                  = THIS_MODULE,
+       .write                  = microcode_write,
+       .open                   = microcode_open,
  };
  
  static struct miscdevice microcode_dev = {
-       .minor          = MICROCODE_MINOR,
-       .name           = "microcode",
-       .fops           = &microcode_fops,
+       .minor                  = MICROCODE_MINOR,
+       .name                   = "microcode",
+       .fops                   = &microcode_fops,
  };
  
  static int __init microcode_dev_init(void)
@@ -182,9 +245,7 @@ static int __init microcode_dev_init(void)
  
         error = misc_register(&microcode_dev);
         if (error) {
-               printk(KERN_ERR
-                       "microcode: can't misc_register on minor=%d\n",
-                       MICROCODE_MINOR);
+               pr_err("microcode: can't misc_register on minor=%d\n", MICROCODE_MINOR);
                 return error;
         }
  
@@ -205,42 +266,51 @@ MODULE_ALIAS_MISCDEV(MICROCODE_MINOR);
  /* fake device for request_firmware */
  static struct platform_device  *microcode_pdev;
  
-static long reload_for_cpu(void *unused)
+static int reload_for_cpu(int cpu)
  {
-       struct ucode_cpu_info *uci = ucode_cpu_info + smp_processor_id();
+       struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
         int err = 0;
  
         mutex_lock(&microcode_mutex);
         if (uci->valid) {
-               err = microcode_ops->request_microcode_fw(smp_processor_id(),
-                                                         &microcode_pdev->dev);
-               if (!err)
-                       microcode_ops->apply_microcode(smp_processor_id());
+               enum ucode_state ustate;
+
+               ustate = microcode_ops->request_microcode_fw(cpu, &microcode_pdev->dev);
+               if (ustate == UCODE_OK)
+                       apply_microcode_on_target(cpu);
+               else
+                       if (ustate == UCODE_ERROR)
+                               err = -EINVAL;
         }
         mutex_unlock(&microcode_mutex);
+
         return err;
  }
  
  static ssize_t reload_store(struct sys_device *dev,
                             struct sysdev_attribute *attr,
-                           const char *buf, size_t sz)
+                           const char *buf, size_t size)
  {
-       char *end;
-       unsigned long val = simple_strtoul(buf, &end, 0);
-       int err = 0;
+       unsigned long val;
         int cpu = dev->id;
+       int ret = 0;
+       char *end;
  
+       val = simple_strtoul(buf, &end, 0);
         if (end == buf)
                 return -EINVAL;
+
         if (val == 1) {
                 get_online_cpus();
                 if (cpu_online(cpu))
-                       err = work_on_cpu(cpu, reload_for_cpu, NULL);
+                       ret = reload_for_cpu(cpu);
                 put_online_cpus();
         }
-       if (err)
-               return err;
-       return sz;
+
+       if (!ret)
+               ret = size;
+
+       return ret;
  }
  
  static ssize_t version_show(struct sys_device *dev,
@@ -271,11 +341,11 @@ static struct attribute *mc_default_attrs[] = {
  };
  
  static struct attribute_group mc_attr_group = {
-       .attrs          = mc_default_attrs,
-       .name           = "microcode",
+       .attrs                  = mc_default_attrs,
+       .name                   = "microcode",
  };
  
-static void __microcode_fini_cpu(int cpu)
+static void microcode_fini_cpu(int cpu)
  {
         struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
  
@@ -283,103 +353,68 @@ static void __microcode_fini_cpu(int cpu)
         uci->valid = 0;
  }
  
-static void microcode_fini_cpu(int cpu)
-{
-       mutex_lock(&microcode_mutex);
-       __microcode_fini_cpu(cpu);
-       mutex_unlock(&microcode_mutex);
-}
-
-static void collect_cpu_info(int cpu)
+static enum ucode_state microcode_resume_cpu(int cpu)
  {
         struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
  
-       memset(uci, 0, sizeof(*uci));
-       if (!microcode_ops->collect_cpu_info(cpu, &uci->cpu_sig))
-               uci->valid = 1;
+       if (!uci->mc)
+               return UCODE_NFOUND;
+
+       pr_debug("microcode: CPU%d updated upon resume\n", cpu);
+       apply_microcode_on_target(cpu);
+
+       return UCODE_OK;
  }
  
-static int microcode_resume_cpu(int cpu)
+static enum ucode_state microcode_init_cpu(int cpu)
  {
-       struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
-       struct cpu_signature nsig;
+       enum ucode_state ustate;
  
-       pr_debug("microcode: CPU%d resumed\n", cpu);
+       if (collect_cpu_info(cpu))
+               return UCODE_ERROR;
  
-       if (!uci->mc)
-               return 1;
+       /* --dimm. Trigger a delayed update? */
+       if (system_state != SYSTEM_RUNNING)
+               return UCODE_NFOUND;
  
-       /*
-        * Let's verify that the 'cached' ucode does belong
-        * to this cpu (a bit of paranoia):
-        */
-       if (microcode_ops->collect_cpu_info(cpu, &nsig)) {
-               __microcode_fini_cpu(cpu);
-               printk(KERN_ERR "failed to collect_cpu_info for resuming cpu #%d\n",
-                               cpu);
-               return -1;
-       }
+       ustate = microcode_ops->request_microcode_fw(cpu, &microcode_pdev->dev);
  
-       if ((nsig.sig != uci->cpu_sig.sig) || (nsig.pf != uci->cpu_sig.pf)) {
-               __microcode_fini_cpu(cpu);
-               printk(KERN_ERR "cached ucode doesn't match the resuming cpu #%d\n",
-                               cpu);
-               /* Should we look for a new ucode here? */
-               return 1;
+       if (ustate == UCODE_OK) {
+               pr_debug("microcode: CPU%d updated upon init\n", cpu);
+               apply_microcode_on_target(cpu);
         }
  
-       return 0;
+       return ustate;
  }
  
-static long microcode_update_cpu(void *unused)
+static enum ucode_state microcode_update_cpu(int cpu)
  {
-       struct ucode_cpu_info *uci = ucode_cpu_info + smp_processor_id();
-       int err = 0;
+       struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
+       enum ucode_state ustate;
  
-       /*
-        * Check if the system resume is in progress (uci->valid != NULL),
-        * otherwise just request a firmware:
-        */
-       if (uci->valid) {
-               err = microcode_resume_cpu(smp_processor_id());
-       } else {
-               collect_cpu_info(smp_processor_id());
-               if (uci->valid && system_state == SYSTEM_RUNNING)
-                       err = microcode_ops->request_microcode_fw(
-                                       smp_processor_id(),
-                                       &microcode_pdev->dev);
-       }
-       if (!err)
-               microcode_ops->apply_microcode(smp_processor_id());
-       return err;
-}
+       if (uci->valid)
+               ustate = microcode_resume_cpu(cpu);
+       else
+               ustate = microcode_init_cpu(cpu);
  
-static int microcode_init_cpu(int cpu)
-{
-       int err;
-       mutex_lock(&microcode_mutex);
-       err = work_on_cpu(cpu, microcode_update_cpu, NULL);
-       mutex_unlock(&microcode_mutex);
-
-       return err;
+       return ustate;
  }
  
  static int mc_sysdev_add(struct sys_device *sys_dev)
  {
         int err, cpu = sys_dev->id;
-       struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
  
         if (!cpu_online(cpu))
                 return 0;
  
         pr_debug("microcode: CPU%d added\n", cpu);
-       memset(uci, 0, sizeof(*uci));
  
         err = sysfs_create_group(&sys_dev->kobj, &mc_attr_group);
         if (err)
                 return err;
  
-       err = microcode_init_cpu(cpu);
+       if (microcode_init_cpu(cpu) == UCODE_ERROR)
+               err = -EINVAL;
  
         return err;
  }
@@ -400,19 +435,30 @@ static int mc_sysdev_remove(struct sys_device *sys_dev)
  static int mc_sysdev_resume(struct sys_device *dev)
  {
         int cpu = dev->id;
+       struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
  
         if (!cpu_online(cpu))
                 return 0;
  
-       /* only CPU 0 will apply ucode here */
-       microcode_update_cpu(NULL);
+       /*
+        * All non-bootup cpus are still disabled,
+        * so only CPU 0 will apply ucode here.
+        *
+        * Moreover, there can be no concurrent
+        * updates from any other places at this point.
+        */
+       WARN_ON(cpu != 0);
+
+       if (uci->valid && uci->mc)
+               microcode_ops->apply_microcode(cpu);
+
         return 0;
  }
  
  static struct sysdev_driver mc_sysdev_driver = {
-       .add            = mc_sysdev_add,
-       .remove         = mc_sysdev_remove,
-       .resume         = mc_sysdev_resume,
+       .add                    = mc_sysdev_add,
+       .remove                 = mc_sysdev_remove,
+       .resume                 = mc_sysdev_resume,
  };
  
  static __cpuinit int
@@ -425,15 +471,12 @@ mc_cpu_callback(struct notifier_block *nb, unsigned long action, void *hcpu)
         switch (action) {
         case CPU_ONLINE:
         case CPU_ONLINE_FROZEN:
-               if (microcode_init_cpu(cpu))
-                       printk(KERN_ERR "microcode: failed to init CPU%d\n",
-                              cpu);
+               microcode_update_cpu(cpu);
         case CPU_DOWN_FAILED:
         case CPU_DOWN_FAILED_FROZEN:
                 pr_debug("microcode: CPU%d added\n", cpu);
                 if (sysfs_create_group(&sys_dev->kobj, &mc_attr_group))
-                       printk(KERN_ERR "microcode: Failed to create the sysfs "
-                               "group for CPU%d\n", cpu);
+                       pr_err("microcode: Failed to create group for CPU%d\n", cpu);
                 break;
         case CPU_DOWN_PREPARE:
         case CPU_DOWN_PREPARE_FROZEN:
@@ -465,13 +508,10 @@ static int __init microcode_init(void)
                 microcode_ops = init_amd_microcode();
  
         if (!microcode_ops) {
-               printk(KERN_ERR "microcode: no support for this CPU vendor\n");
+               pr_err("microcode: no support for this CPU vendor\n");
                 return -ENODEV;
         }
  
-       error = microcode_dev_init();
-       if (error)
-               return error;
         microcode_pdev = platform_device_register_simple("microcode", -1,
                                                          NULL, 0);
         if (IS_ERR(microcode_pdev)) {
@@ -480,23 +520,31 @@ static int __init microcode_init(void)
         }
  
         get_online_cpus();
+       mutex_lock(&microcode_mutex);
+
         error = sysdev_driver_register(&cpu_sysdev_class, &mc_sysdev_driver);
+
+       mutex_unlock(&microcode_mutex);
         put_online_cpus();
+
         if (error) {
-               microcode_dev_exit();
                 platform_device_unregister(microcode_pdev);
                 return error;
         }
  
+       error = microcode_dev_init();
+       if (error)
+               return error;
+
         register_hotcpu_notifier(&mc_cpu_notifier);
  
-       printk(KERN_INFO
-              "Microcode Update Driver: v" MICROCODE_VERSION
+       pr_info("Microcode Update Driver: v" MICROCODE_VERSION
                " <tigran@aivazian.fsnet.co.uk>,"
                " Peter Oruba\n");
  
         return 0;
  }
+module_init(microcode_init);
  
  static void __exit microcode_exit(void)
  {
@@ -505,16 +553,17 @@ static void __exit microcode_exit(void)
         unregister_hotcpu_notifier(&mc_cpu_notifier);
  
         get_online_cpus();
+       mutex_lock(&microcode_mutex);
+
         sysdev_driver_unregister(&cpu_sysdev_class, &mc_sysdev_driver);
+
+       mutex_unlock(&microcode_mutex);
         put_online_cpus();
  
         platform_device_unregister(microcode_pdev);
  
         microcode_ops = NULL;
  
-       printk(KERN_INFO
-              "Microcode Update Driver: v" MICROCODE_VERSION " removed.\n");
+       pr_info("Microcode Update Driver: v" MICROCODE_VERSION " removed.\n");
  }
-
-module_init(microcode_init);
  module_exit(microcode_exit);
diff --git a/arch/x86/kernel/microcode_intel.c b/arch/x86/kernel/microcode_intel.c

index 149b9ec7c1ab72fcd20c2f2ab84cc511b3d707e7..0d334ddd0a9642a2c05485e9e31394f8b1e98a3b 100644 (file)
--- a/arch/x86/kernel/microcode_intel.c
+++ b/arch/x86/kernel/microcode_intel.c
@@ -70,24 +70,11 @@
   *             Fix sigmatch() macro to handle old CPUs with pf == 0.
   *             Thanks to Stuart Swales for pointing out this bug.
   */
-#include <linux/platform_device.h>
-#include <linux/capability.h>
-#include <linux/miscdevice.h>
  #include <linux/firmware.h>
-#include <linux/smp_lock.h>
-#include <linux/spinlock.h>
-#include <linux/cpumask.h>
  #include <linux/uaccess.h>
-#include <linux/vmalloc.h>
  #include <linux/kernel.h>
  #include <linux/module.h>
-#include <linux/mutex.h>
-#include <linux/sched.h>
-#include <linux/init.h>
-#include <linux/slab.h>
-#include <linux/cpu.h>
-#include <linux/fs.h>
-#include <linux/mm.h>
+#include <linux/vmalloc.h>
  
  #include <asm/microcode.h>
  #include <asm/processor.h>
@@ -150,13 +137,9 @@ struct extended_sigtable {
  
  #define exttable_size(et) ((et)->count * EXT_SIGNATURE_SIZE + EXT_HEADER_SIZE)
  
-/* serialize access to the physical write to MSR 0x79 */
-static DEFINE_SPINLOCK(microcode_update_lock);
-
  static int collect_cpu_info(int cpu_num, struct cpu_signature *csig)
  {
         struct cpuinfo_x86 *c = &cpu_data(cpu_num);
-       unsigned long flags;
         unsigned int val[2];
  
         memset(csig, 0, sizeof(*csig));
@@ -176,18 +159,14 @@ static int collect_cpu_info(int cpu_num, struct cpu_signature *csig)
                 csig->pf = 1 << ((val[1] >> 18) & 7);
         }
  
-       /* serialize access to the physical write to MSR 0x79 */
-       spin_lock_irqsave(&microcode_update_lock, flags);
-
         wrmsr(MSR_IA32_UCODE_REV, 0, 0);
         /* see notes above for revision 1.07.  Apparent chip bug */
         sync_core();
         /* get the current revision from MSR 0x8B */
         rdmsr(MSR_IA32_UCODE_REV, val[0], csig->rev);
-       spin_unlock_irqrestore(&microcode_update_lock, flags);
  
-       pr_debug("microcode: collect_cpu_info : sig=0x%x, pf=0x%x, rev=0x%x\n",
-                       csig->sig, csig->pf, csig->rev);
+       printk(KERN_INFO "microcode: CPU%d sig=0x%x, pf=0x%x, revision=0x%x\n",
+                       cpu_num, csig->sig, csig->pf, csig->rev);
  
         return 0;
  }
@@ -318,11 +297,10 @@ get_matching_microcode(struct cpu_signature *cpu_sig, void *mc, int rev)
         return 0;
  }
  
-static void apply_microcode(int cpu)
+static int apply_microcode(int cpu)
  {
         struct microcode_intel *mc_intel;
         struct ucode_cpu_info *uci;
-       unsigned long flags;
         unsigned int val[2];
         int cpu_num;
  
@@ -334,10 +312,7 @@ static void apply_microcode(int cpu)
         BUG_ON(cpu_num != cpu);
  
         if (mc_intel == NULL)
-               return;
-
-       /* serialize access to the physical write to MSR 0x79 */
-       spin_lock_irqsave(&microcode_update_lock, flags);
+               return 0;
  
         /* write microcode via MSR 0x79 */
         wrmsr(MSR_IA32_UCODE_WRITE,
@@ -351,30 +326,32 @@ static void apply_microcode(int cpu)
         /* get the current revision from MSR 0x8B */
         rdmsr(MSR_IA32_UCODE_REV, val[0], val[1]);
  
-       spin_unlock_irqrestore(&microcode_update_lock, flags);
         if (val[1] != mc_intel->hdr.rev) {
-               printk(KERN_ERR "microcode: CPU%d update from revision "
-                               "0x%x to 0x%x failed\n",
-                       cpu_num, uci->cpu_sig.rev, val[1]);
-               return;
+               printk(KERN_ERR "microcode: CPU%d update "
+                               "to revision 0x%x failed\n",
+                       cpu_num, mc_intel->hdr.rev);
+               return -1;
         }
-       printk(KERN_INFO "microcode: CPU%d updated from revision "
-                        "0x%x to 0x%x, date = %04x-%02x-%02x \n",
-               cpu_num, uci->cpu_sig.rev, val[1],
+       printk(KERN_INFO "microcode: CPU%d updated to revision "
+                        "0x%x, date = %04x-%02x-%02x \n",
+               cpu_num, val[1],
                 mc_intel->hdr.date & 0xffff,
                 mc_intel->hdr.date >> 24,
                 (mc_intel->hdr.date >> 16) & 0xff);
  
         uci->cpu_sig.rev = val[1];
+
+       return 0;
  }
  
-static int generic_load_microcode(int cpu, void *data, size_t size,
-               int (*get_ucode_data)(void *, const void *, size_t))
+static enum ucode_state generic_load_microcode(int cpu, void *data, size_t size,
+                               int (*get_ucode_data)(void *, const void *, size_t))
  {
         struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
         u8 *ucode_ptr = data, *new_mc = NULL, *mc;
         int new_rev = uci->cpu_sig.rev;
         unsigned int leftover = size;
+       enum ucode_state state = UCODE_OK;
  
         while (leftover) {
                 struct microcode_header_intel mc_header;
@@ -412,11 +389,15 @@ static int generic_load_microcode(int cpu, void *data, size_t size,
                 leftover  -= mc_size;
         }
  
-       if (!new_mc)
+       if (leftover) {
+               if (new_mc)
+                       vfree(new_mc);
+               state = UCODE_ERROR;
                 goto out;
+       }
  
-       if (leftover) {
-               vfree(new_mc);
+       if (!new_mc) {
+               state = UCODE_NFOUND;
                 goto out;
         }
  
@@ -427,9 +408,8 @@ static int generic_load_microcode(int cpu, void *data, size_t size,
         pr_debug("microcode: CPU%d found a matching microcode update with"
                  " version 0x%x (current=0x%x)\n",
                         cpu, new_rev, uci->cpu_sig.rev);
-
- out:
-       return (int)leftover;
+out:
+       return state;
  }
  
  static int get_ucode_fw(void *to, const void *from, size_t n)
@@ -438,21 +418,19 @@ static int get_ucode_fw(void *to, const void *from, size_t n)
         return 0;
  }
  
-static int request_microcode_fw(int cpu, struct device *device)
+static enum ucode_state request_microcode_fw(int cpu, struct device *device)
  {
         char name[30];
         struct cpuinfo_x86 *c = &cpu_data(cpu);
         const struct firmware *firmware;
-       int ret;
+       enum ucode_state ret;
  
-       /* We should bind the task to the CPU */
-       BUG_ON(cpu != raw_smp_processor_id());
         sprintf(name, "intel-ucode/%02x-%02x-%02x",
                 c->x86, c->x86_model, c->x86_mask);
-       ret = request_firmware(&firmware, name, device);
-       if (ret) {
+
+       if (request_firmware(&firmware, name, device)) {
                 pr_debug("microcode: data file %s load failed\n", name);
-               return ret;
+               return UCODE_NFOUND;
         }
  
         ret = generic_load_microcode(cpu, (void *)firmware->data,
@@ -468,11 +446,9 @@ static int get_ucode_user(void *to, const void *from, size_t n)
         return copy_from_user(to, from, n);
  }
  
-static int request_microcode_user(int cpu, const void __user *buf, size_t size)
+static enum ucode_state
+request_microcode_user(int cpu, const void __user *buf, size_t size)
  {
-       /* We should bind the task to the CPU */
-       BUG_ON(cpu != raw_smp_processor_id());
-
         return generic_load_microcode(cpu, (void *)buf, size, &get_ucode_user);
  }
  
diff --git a/arch/x86/kernel/mpparse.c b/arch/x86/kernel/mpparse.c

index 70fd7e414c1544aadf5de5573e67294e88cac16c..651c93b28862a5ea04f3fdfc883533ff843b2583 100644 (file)
--- a/arch/x86/kernel/mpparse.c
+++ b/arch/x86/kernel/mpparse.c
@@ -17,6 +17,7 @@
  #include <linux/acpi.h>
  #include <linux/module.h>
  #include <linux/smp.h>
+#include <linux/pci.h>
  
  #include <asm/mtrr.h>
  #include <asm/mpspec.h>
@@ -870,24 +871,17 @@ static
  inline void __init check_irq_src(struct mpc_intsrc *m, int *nr_m_spare) {}
  #endif /* CONFIG_X86_IO_APIC */
  
-static int check_slot(unsigned long mpc_new_phys, unsigned long mpc_new_length,
-                     int count)
+static int
+check_slot(unsigned long mpc_new_phys, unsigned long mpc_new_length, int count)
  {
-       if (!mpc_new_phys) {
-               pr_info("No spare slots, try to append...take your risk, "
-                       "new mpc_length %x\n", count);
-       } else {
-               if (count <= mpc_new_length)
-                       pr_info("No spare slots, try to append..., "
-                               "new mpc_length %x\n", count);
-               else {
-                       pr_err("mpc_new_length %lx is too small\n",
-                               mpc_new_length);
-                       return -1;
-               }
+       int ret = 0;
+
+       if (!mpc_new_phys || count <= mpc_new_length) {
+               WARN(1, "update_mptable: No spare slots (length: %x)\n", count);
+               return -1;
         }
  
-       return 0;
+       return ret;
  }
  
  static int  __init replace_intsrc_all(struct mpc_table *mpc,
@@ -946,7 +940,7 @@ static int  __init replace_intsrc_all(struct mpc_table *mpc,
                 } else {
                         struct mpc_intsrc *m = (struct mpc_intsrc *)mpt;
                         count += sizeof(struct mpc_intsrc);
-                       if (!check_slot(mpc_new_phys, mpc_new_length, count))
+                       if (check_slot(mpc_new_phys, mpc_new_length, count) < 0)
                                 goto out;
                         assign_to_mpc_intsrc(&mp_irqs[i], m);
                         mpc->length = count;
@@ -963,11 +957,14 @@ out:
         return 0;
  }
  
-static int __initdata enable_update_mptable;
+int enable_update_mptable;
  
  static int __init update_mptable_setup(char *str)
  {
         enable_update_mptable = 1;
+#ifdef CONFIG_PCI
+       pci_routeirq = 1;
+#endif
         return 0;
  }
  early_param("update_mptable", update_mptable_setup);
@@ -980,6 +977,9 @@ static int __initdata alloc_mptable;
  static int __init parse_alloc_mptable_opt(char *p)
  {
         enable_update_mptable = 1;
+#ifdef CONFIG_PCI
+       pci_routeirq = 1;
+#endif
         alloc_mptable = 1;
         if (!p)
                 return 0;
diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c

index 9faf43bea3361cf178f53942e8f37a8842737afc..70ec9b951d76e0eb21a79f124b9adebe8a4b60be 100644 (file)
--- a/arch/x86/kernel/paravirt.c
+++ b/arch/x86/kernel/paravirt.c
@@ -248,18 +248,16 @@ static DEFINE_PER_CPU(enum paravirt_lazy_mode, paravirt_lazy_mode) = PARAVIRT_LA
  
  static inline void enter_lazy(enum paravirt_lazy_mode mode)
  {
-       BUG_ON(__get_cpu_var(paravirt_lazy_mode) != PARAVIRT_LAZY_NONE);
-       BUG_ON(preemptible());
+       BUG_ON(percpu_read(paravirt_lazy_mode) != PARAVIRT_LAZY_NONE);
  
-       __get_cpu_var(paravirt_lazy_mode) = mode;
+       percpu_write(paravirt_lazy_mode, mode);
  }
  
-void paravirt_leave_lazy(enum paravirt_lazy_mode mode)
+static void leave_lazy(enum paravirt_lazy_mode mode)
  {
-       BUG_ON(__get_cpu_var(paravirt_lazy_mode) != mode);
-       BUG_ON(preemptible());
+       BUG_ON(percpu_read(paravirt_lazy_mode) != mode);
  
-       __get_cpu_var(paravirt_lazy_mode) = PARAVIRT_LAZY_NONE;
+       percpu_write(paravirt_lazy_mode, PARAVIRT_LAZY_NONE);
  }
  
  void paravirt_enter_lazy_mmu(void)
@@ -269,22 +267,36 @@ void paravirt_enter_lazy_mmu(void)
  
  void paravirt_leave_lazy_mmu(void)
  {
-       paravirt_leave_lazy(PARAVIRT_LAZY_MMU);
+       leave_lazy(PARAVIRT_LAZY_MMU);
  }
  
-void paravirt_enter_lazy_cpu(void)
+void paravirt_start_context_switch(struct task_struct *prev)
  {
+       BUG_ON(preemptible());
+
+       if (percpu_read(paravirt_lazy_mode) == PARAVIRT_LAZY_MMU) {
+               arch_leave_lazy_mmu_mode();
+               set_ti_thread_flag(task_thread_info(prev), TIF_LAZY_MMU_UPDATES);
+       }
         enter_lazy(PARAVIRT_LAZY_CPU);
  }
  
-void paravirt_leave_lazy_cpu(void)
+void paravirt_end_context_switch(struct task_struct *next)
  {
-       paravirt_leave_lazy(PARAVIRT_LAZY_CPU);
+       BUG_ON(preemptible());
+
+       leave_lazy(PARAVIRT_LAZY_CPU);
+
+       if (test_and_clear_ti_thread_flag(task_thread_info(next), TIF_LAZY_MMU_UPDATES))
+               arch_enter_lazy_mmu_mode();
  }
  
  enum paravirt_lazy_mode paravirt_get_lazy_mode(void)
  {
-       return __get_cpu_var(paravirt_lazy_mode);
+       if (in_interrupt())
+               return PARAVIRT_LAZY_NONE;
+
+       return percpu_read(paravirt_lazy_mode);
  }
  
  void arch_flush_lazy_mmu_mode(void)
@@ -292,7 +304,6 @@ void arch_flush_lazy_mmu_mode(void)
         preempt_disable();
  
         if (paravirt_get_lazy_mode() == PARAVIRT_LAZY_MMU) {
-               WARN_ON(preempt_count() == 1);
                 arch_leave_lazy_mmu_mode();
                 arch_enter_lazy_mmu_mode();
         }
@@ -300,19 +311,6 @@ void arch_flush_lazy_mmu_mode(void)
         preempt_enable();
  }
  
-void arch_flush_lazy_cpu_mode(void)
-{
-       preempt_disable();
-
-       if (paravirt_get_lazy_mode() == PARAVIRT_LAZY_CPU) {
-               WARN_ON(preempt_count() == 1);
-               arch_leave_lazy_cpu_mode();
-               arch_enter_lazy_cpu_mode();
-       }
-
-       preempt_enable();
-}
-
  struct pv_info pv_info = {
         .name = "bare hardware",
         .paravirt_enabled = 0,
@@ -404,10 +402,8 @@ struct pv_cpu_ops pv_cpu_ops = {
         .set_iopl_mask = native_set_iopl_mask,
         .io_delay = native_io_delay,
  
-       .lazy_mode = {
-               .enter = paravirt_nop,
-               .leave = paravirt_nop,
-       },
+       .start_context_switch = paravirt_nop,
+       .end_context_switch = paravirt_nop,
  };
  
  struct pv_apic_ops pv_apic_ops = {
diff --git a/arch/x86/kernel/pci-calgary_64.c b/arch/x86/kernel/pci-calgary_64.c

index 755c21e906f3f04821755c703bb99e26b118e0f7..971a3bec47a8644fdd3cbbcebb833a0ad2cc3044 100644 (file)
--- a/arch/x86/kernel/pci-calgary_64.c
+++ b/arch/x86/kernel/pci-calgary_64.c
@@ -186,37 +186,6 @@ static struct cal_chipset_ops calioc2_chip_ops = {
  
  static struct calgary_bus_info bus_info[MAX_PHB_BUS_NUM] = { { NULL, 0, 0 }, };
  
-/* enable this to stress test the chip's TCE cache */
-#ifdef CONFIG_IOMMU_DEBUG
-static int debugging = 1;
-
-static inline unsigned long verify_bit_range(unsigned long* bitmap,
-       int expected, unsigned long start, unsigned long end)
-{
-       unsigned long idx = start;
-
-       BUG_ON(start >= end);
-
-       while (idx < end) {
-               if (!!test_bit(idx, bitmap) != expected)
-                       return idx;
-               ++idx;
-       }
-
-       /* all bits have the expected value */
-       return ~0UL;
-}
-#else /* debugging is disabled */
-static int debugging;
-
-static inline unsigned long verify_bit_range(unsigned long* bitmap,
-       int expected, unsigned long start, unsigned long end)
-{
-       return ~0UL;
-}
-
-#endif /* CONFIG_IOMMU_DEBUG */
-
  static inline int translation_enabled(struct iommu_table *tbl)
  {
         /* only PHBs with translation enabled have an IOMMU table */
@@ -228,7 +197,6 @@ static void iommu_range_reserve(struct iommu_table *tbl,
  {
         unsigned long index;
         unsigned long end;
-       unsigned long badbit;
         unsigned long flags;
  
         index = start_addr >> PAGE_SHIFT;
@@ -243,14 +211,6 @@ static void iommu_range_reserve(struct iommu_table *tbl,
  
         spin_lock_irqsave(&tbl->it_lock, flags);
  
-       badbit = verify_bit_range(tbl->it_map, 0, index, end);
-       if (badbit != ~0UL) {
-               if (printk_ratelimit())
-                       printk(KERN_ERR "Calgary: entry already allocated at "
-                              "0x%lx tbl %p dma 0x%lx npages %u\n",
-                              badbit, tbl, start_addr, npages);
-       }
-
         iommu_area_reserve(tbl->it_map, index, npages);
  
         spin_unlock_irqrestore(&tbl->it_lock, flags);
@@ -326,7 +286,6 @@ static void iommu_free(struct iommu_table *tbl, dma_addr_t dma_addr,
         unsigned int npages)
  {
         unsigned long entry;
-       unsigned long badbit;
         unsigned long badend;
         unsigned long flags;
  
@@ -346,14 +305,6 @@ static void iommu_free(struct iommu_table *tbl, dma_addr_t dma_addr,
  
         spin_lock_irqsave(&tbl->it_lock, flags);
  
-       badbit = verify_bit_range(tbl->it_map, 1, entry, entry + npages);
-       if (badbit != ~0UL) {
-               if (printk_ratelimit())
-                       printk(KERN_ERR "Calgary: bit is off at 0x%lx "
-                              "tbl %p dma 0x%Lx entry 0x%lx npages %u\n",
-                              badbit, tbl, dma_addr, entry, npages);
-       }
-
         iommu_area_free(tbl->it_map, entry, npages);
  
         spin_unlock_irqrestore(&tbl->it_lock, flags);
@@ -1488,9 +1439,8 @@ void __init detect_calgary(void)
                 iommu_detected = 1;
                 calgary_detected = 1;
                 printk(KERN_INFO "PCI-DMA: Calgary IOMMU detected.\n");
-               printk(KERN_INFO "PCI-DMA: Calgary TCE table spec is %d, "
-                      "CONFIG_IOMMU_DEBUG is %s.\n", specified_table_size,
-                      debugging ? "enabled" : "disabled");
+               printk(KERN_INFO "PCI-DMA: Calgary TCE table spec is %d\n",
+                      specified_table_size);
  
                 /* swiotlb for devices that aren't behind the Calgary. */
                 if (max_pfn > MAX_DMA32_PFN)
diff --git a/arch/x86/kernel/pci-gart_64.c b/arch/x86/kernel/pci-gart_64.c

index b284b58c035ccdd8fc850604cced3557ae7de2c5..cfd9f90638967e03277ad510625905213322bfd4 100644 (file)
--- a/arch/x86/kernel/pci-gart_64.c
+++ b/arch/x86/kernel/pci-gart_64.c
@@ -144,48 +144,21 @@ static void flush_gart(void)
  }
  
  #ifdef CONFIG_IOMMU_LEAK
-
-#define SET_LEAK(x)                                                    \
-       do {                                                            \
-               if (iommu_leak_tab)                                     \
-                       iommu_leak_tab[x] = __builtin_return_address(0);\
-       } while (0)
-
-#define CLEAR_LEAK(x)                                                  \
-       do {                                                            \
-               if (iommu_leak_tab)                                     \
-                       iommu_leak_tab[x] = NULL;                       \
-       } while (0)
-
  /* Debugging aid for drivers that don't free their IOMMU tables */
-static void **iommu_leak_tab;
  static int leak_trace;
  static int iommu_leak_pages = 20;
  
  static void dump_leak(void)
  {
-       int i;
         static int dump;
  
-       if (dump || !iommu_leak_tab)
+       if (dump)
                 return;
         dump = 1;
-       show_stack(NULL, NULL);
  
-       /* Very crude. dump some from the end of the table too */
-       printk(KERN_DEBUG "Dumping %d pages from end of IOMMU:\n",
-              iommu_leak_pages);
-       for (i = 0; i < iommu_leak_pages; i += 2) {
-               printk(KERN_DEBUG "%lu: ", iommu_pages-i);
-               printk_address((unsigned long) iommu_leak_tab[iommu_pages-i],
-                               0);
-               printk(KERN_CONT "%c", (i+1)%2 == 0 ? '\n' : ' ');
-       }
-       printk(KERN_DEBUG "\n");
+       show_stack(NULL, NULL);
+       debug_dma_dump_mappings(NULL);
  }
-#else
-# define SET_LEAK(x)
-# define CLEAR_LEAK(x)
  #endif
  
  static void iommu_full(struct device *dev, size_t size, int dir)
@@ -248,7 +221,6 @@ static dma_addr_t dma_map_area(struct device *dev, dma_addr_t phys_mem,
  
         for (i = 0; i < npages; i++) {
                 iommu_gatt_base[iommu_page + i] = GPTE_ENCODE(phys_mem);
-               SET_LEAK(iommu_page + i);
                 phys_mem += PAGE_SIZE;
         }
         return iommu_bus_base + iommu_page*PAGE_SIZE + (phys_mem & ~PAGE_MASK);
@@ -294,7 +266,6 @@ static void gart_unmap_page(struct device *dev, dma_addr_t dma_addr,
         npages = iommu_num_pages(dma_addr, size, PAGE_SIZE);
         for (i = 0; i < npages; i++) {
                 iommu_gatt_base[iommu_page + i] = gart_unmapped_entry;
-               CLEAR_LEAK(iommu_page + i);
         }
         free_iommu(iommu_page, npages);
  }
@@ -377,7 +348,6 @@ static int __dma_map_cont(struct device *dev, struct scatterlist *start,
                 pages = iommu_num_pages(s->offset, s->length, PAGE_SIZE);
                 while (pages--) {
                         iommu_gatt_base[iommu_page] = GPTE_ENCODE(addr);
-                       SET_LEAK(iommu_page);
                         addr += PAGE_SIZE;
                         iommu_page++;
                 }
@@ -688,8 +658,6 @@ static __init int init_k8_gatt(struct agp_kern_info *info)
  
         agp_gatt_table = gatt;
  
-       enable_gart_translations();
-
         error = sysdev_class_register(&gart_sysdev_class);
         if (!error)
                 error = sysdev_register(&device_gart);
@@ -801,11 +769,12 @@ void __init gart_iommu_init(void)
  
  #ifdef CONFIG_IOMMU_LEAK
         if (leak_trace) {
-               iommu_leak_tab = (void *)__get_free_pages(GFP_KERNEL|__GFP_ZERO,
-                                 get_order(iommu_pages*sizeof(void *)));
-               if (!iommu_leak_tab)
+               int ret;
+
+               ret = dma_debug_resize_entries(iommu_pages);
+               if (ret)
                         printk(KERN_DEBUG
-                              "PCI-DMA: Cannot allocate leak trace area\n");
+                              "PCI-DMA: Cannot trace all the entries\n");
         }
  #endif
  
@@ -845,6 +814,14 @@ void __init gart_iommu_init(void)
          * the pages as Not-Present:
          */
         wbinvd();
+       
+       /*
+        * Now all caches are flushed and we can safely enable
+        * GART hardware.  Doing it early leaves the possibility
+        * of stale cache entries that can lead to GART PTE
+        * errors.
+        */
+       enable_gart_translations();
  
         /*
          * Try to workaround a bug (thanks to BenH):
diff --git a/arch/x86/kernel/pci-swiotlb.c b/arch/x86/kernel/pci-swiotlb.c

index 221a3853e2684b111c36669ee688f62ba51cdb10..a1712f2b50f1b974a1a346bd4d8db1ec21f02068 100644 (file)
--- a/arch/x86/kernel/pci-swiotlb.c
+++ b/arch/x86/kernel/pci-swiotlb.c
@@ -28,7 +28,7 @@ dma_addr_t swiotlb_phys_to_bus(struct device *hwdev, phys_addr_t paddr)
         return paddr;
  }
  
-phys_addr_t swiotlb_bus_to_phys(dma_addr_t baddr)
+phys_addr_t swiotlb_bus_to_phys(struct device *hwdev, dma_addr_t baddr)
  {
         return baddr;
  }
diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c

index ca989158e847936e893c7110b4a328ea1549d452..3bb2be1649bddb5b3ba9870553b72ce73e41b460 100644 (file)
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -8,12 +8,15 @@
  #include <linux/module.h>
  #include <linux/pm.h>
  #include <linux/clockchips.h>
+#include <linux/random.h>
  #include <trace/power.h>
  #include <asm/system.h>
  #include <asm/apic.h>
+#include <asm/syscalls.h>
  #include <asm/idle.h>
  #include <asm/uaccess.h>
  #include <asm/i387.h>
+#include <asm/ds.h>
  
  unsigned long idle_halt;
  EXPORT_SYMBOL(idle_halt);
@@ -45,6 +48,8 @@ void free_thread_xstate(struct task_struct *tsk)
                 kmem_cache_free(task_xstate_cachep, tsk->thread.xstate);
                 tsk->thread.xstate = NULL;
         }
+
+       WARN(tsk->thread.ds_ctx, "leaking DS context\n");
  }
  
  void free_thread_info(struct thread_info *ti)
@@ -83,8 +88,6 @@ void exit_thread(void)
                 put_cpu();
                 kfree(bp);
         }
-
-       ds_exit_thread(current);
  }
  
  void flush_thread(void)
@@ -613,3 +616,16 @@ static int __init idle_setup(char *str)
  }
  early_param("idle", idle_setup);
  
+unsigned long arch_align_stack(unsigned long sp)
+{
+       if (!(current->personality & ADDR_NO_RANDOMIZE) && randomize_va_space)
+               sp -= get_random_int() % 8192;
+       return sp & ~0xf;
+}
+
+unsigned long arch_randomize_brk(struct mm_struct *mm)
+{
+       unsigned long range_end = mm->brk + 0x02000000;
+       return randomize_range(mm->brk, range_end, 0) ? : mm->brk;
+}
+
diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c

index 76f8f84043a2a4693123648d92d08c22ca7f5f25..59f4524984afacd10b456dcdca4abbc2237d7ecb 100644 (file)
--- a/arch/x86/kernel/process_32.c
+++ b/arch/x86/kernel/process_32.c
@@ -9,8 +9,6 @@
   * This file handles the architecture-dependent parts of process handling..
   */
  
-#include <stdarg.h>
-
  #include <linux/stackprotector.h>
  #include <linux/cpu.h>
  #include <linux/errno.h>
@@ -33,7 +31,6 @@
  #include <linux/module.h>
  #include <linux/kallsyms.h>
  #include <linux/ptrace.h>
-#include <linux/random.h>
  #include <linux/personality.h>
  #include <linux/tick.h>
  #include <linux/percpu.h>
@@ -290,7 +287,8 @@ int copy_thread(unsigned long clone_flags, unsigned long sp,
                 p->thread.io_bitmap_max = 0;
         }
  
-       ds_copy_thread(p, current);
+       clear_tsk_thread_flag(p, TIF_DS_AREA_MSR);
+       p->thread.ds_ctx = NULL;
  
         clear_tsk_thread_flag(p, TIF_DEBUGCTLMSR);
         p->thread.debugctlmsr = 0;
@@ -407,7 +405,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
          * done before math_state_restore, so the TS bit is up
          * to date.
          */
-       arch_leave_lazy_cpu_mode();
+       arch_end_context_switch(next_p);
  
         /* If the task has used fpu the last 5 timeslices, just do a full
          * restore of the math state immediately to avoid the trap; the
@@ -497,15 +495,3 @@ unsigned long get_wchan(struct task_struct *p)
         return 0;
  }
  
-unsigned long arch_align_stack(unsigned long sp)
-{
-       if (!(current->personality & ADDR_NO_RANDOMIZE) && randomize_va_space)
-               sp -= get_random_int() % 8192;
-       return sp & ~0xf;
-}
-
-unsigned long arch_randomize_brk(struct mm_struct *mm)
-{
-       unsigned long range_end = mm->brk + 0x02000000;
-       return randomize_range(mm->brk, range_end, 0) ? : mm->brk;
-}
diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c

index b751a41392b1b997d3c9a3235ba85b2372bdf7a3..ebefb5407b9d36f5ccbb1c458708fb8a647ff19b 100644 (file)
--- a/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@ -14,8 +14,6 @@
   * This file handles the architecture-dependent parts of process handling..
   */
  
-#include <stdarg.h>
-
  #include <linux/stackprotector.h>
  #include <linux/cpu.h>
  #include <linux/errno.h>
@@ -32,7 +30,6 @@
  #include <linux/delay.h>
  #include <linux/module.h>
  #include <linux/ptrace.h>
-#include <linux/random.h>
  #include <linux/notifier.h>
  #include <linux/kprobes.h>
  #include <linux/kdebug.h>
@@ -335,7 +332,8 @@ int copy_thread(unsigned long clone_flags, unsigned long sp,
                         goto out;
         }
  
-       ds_copy_thread(p, me);
+       clear_tsk_thread_flag(p, TIF_DS_AREA_MSR);
+       p->thread.ds_ctx = NULL;
  
         clear_tsk_thread_flag(p, TIF_DEBUGCTLMSR);
         p->thread.debugctlmsr = 0;
@@ -428,7 +426,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
          * done before math_state_restore, so the TS bit is up
          * to date.
          */
-       arch_leave_lazy_cpu_mode();
+       arch_end_context_switch(next_p);
  
         /*
          * Switch FS and GS.
@@ -660,15 +658,3 @@ long sys_arch_prctl(int code, unsigned long addr)
         return do_arch_prctl(current, code, addr);
  }
  
-unsigned long arch_align_stack(unsigned long sp)
-{
-       if (!(current->personality & ADDR_NO_RANDOMIZE) && randomize_va_space)
-               sp -= get_random_int() % 8192;
-       return sp & ~0xf;
-}
-
-unsigned long arch_randomize_brk(struct mm_struct *mm)
-{
-       unsigned long range_end = mm->brk + 0x02000000;
-       return randomize_range(mm->brk, range_end, 0) ? : mm->brk;
-}
diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c

index 23b7c8f017e2afa74194c81c7720724a2604751b..09ecbde91c1354e036751f71e2d5612057fe3626 100644 (file)
--- a/arch/x86/kernel/ptrace.c
+++ b/arch/x86/kernel/ptrace.c
@@ -21,6 +21,7 @@
  #include <linux/audit.h>
  #include <linux/seccomp.h>
  #include <linux/signal.h>
+#include <linux/workqueue.h>
  
  #include <asm/uaccess.h>
  #include <asm/pgtable.h>
@@ -578,17 +579,130 @@ static int ioperm_get(struct task_struct *target,
  }
  
  #ifdef CONFIG_X86_PTRACE_BTS
+/*
+ * A branch trace store context.
+ *
+ * Contexts may only be installed by ptrace_bts_config() and only for
+ * ptraced tasks.
+ *
+ * Contexts are destroyed when the tracee is detached from the tracer.
+ * The actual destruction work requires interrupts enabled, so the
+ * work is deferred and will be scheduled during __ptrace_unlink().
+ *
+ * Contexts hold an additional task_struct reference on the traced
+ * task, as well as a reference on the tracer's mm.
+ *
+ * Ptrace already holds a task_struct for the duration of ptrace operations,
+ * but since destruction is deferred, it may be executed after both
+ * tracer and tracee exited.
+ */
+struct bts_context {
+       /* The branch trace handle. */
+       struct bts_tracer       *tracer;
+
+       /* The buffer used to store the branch trace and its size. */
+       void                    *buffer;
+       unsigned int            size;
+
+       /* The mm that paid for the above buffer. */
+       struct mm_struct        *mm;
+
+       /* The task this context belongs to. */
+       struct task_struct      *task;
+
+       /* The signal to send on a bts buffer overflow. */
+       unsigned int            bts_ovfl_signal;
+
+       /* The work struct to destroy a context. */
+       struct work_struct      work;
+};
+
+static int alloc_bts_buffer(struct bts_context *context, unsigned int size)
+{
+       void *buffer = NULL;
+       int err = -ENOMEM;
+
+       err = account_locked_memory(current->mm, current->signal->rlim, size);
+       if (err < 0)
+               return err;
+
+       buffer = kzalloc(size, GFP_KERNEL);
+       if (!buffer)
+               goto out_refund;
+
+       context->buffer = buffer;
+       context->size = size;
+       context->mm = get_task_mm(current);
+
+       return 0;
+
+ out_refund:
+       refund_locked_memory(current->mm, size);
+       return err;
+}
+
+static inline void free_bts_buffer(struct bts_context *context)
+{
+       if (!context->buffer)
+               return;
+
+       kfree(context->buffer);
+       context->buffer = NULL;
+
+       refund_locked_memory(context->mm, context->size);
+       context->size = 0;
+
+       mmput(context->mm);
+       context->mm = NULL;
+}
+
+static void free_bts_context_work(struct work_struct *w)
+{
+       struct bts_context *context;
+
+       context = container_of(w, struct bts_context, work);
+
+       ds_release_bts(context->tracer);
+       put_task_struct(context->task);
+       free_bts_buffer(context);
+       kfree(context);
+}
+
+static inline void free_bts_context(struct bts_context *context)
+{
+       INIT_WORK(&context->work, free_bts_context_work);
+       schedule_work(&context->work);
+}
+
+static inline struct bts_context *alloc_bts_context(struct task_struct *task)
+{
+       struct bts_context *context = kzalloc(sizeof(*context), GFP_KERNEL);
+       if (context) {
+               context->task = task;
+               task->bts = context;
+
+               get_task_struct(task);
+       }
+
+       return context;
+}
+
  static int ptrace_bts_read_record(struct task_struct *child, size_t index,
                                   struct bts_struct __user *out)
  {
+       struct bts_context *context;
         const struct bts_trace *trace;
         struct bts_struct bts;
         const unsigned char *at;
         int error;
  
-       trace = ds_read_bts(child->bts);
+       context = child->bts;
+       if (!context)
+               return -ESRCH;
+
+       trace = ds_read_bts(context->tracer);
         if (!trace)
-               return -EPERM;
+               return -ESRCH;
  
         at = trace->ds.top - ((index + 1) * trace->ds.size);
         if ((void *)at < trace->ds.begin)
@@ -597,7 +711,7 @@ static int ptrace_bts_read_record(struct task_struct *child, size_t index,
         if (!trace->read)
                 return -EOPNOTSUPP;
  
-       error = trace->read(child->bts, at, &bts);
+       error = trace->read(context->tracer, at, &bts);
         if (error < 0)
                 return error;
  
@@ -611,13 +725,18 @@ static int ptrace_bts_drain(struct task_struct *child,
                             long size,
                             struct bts_struct __user *out)
  {
+       struct bts_context *context;
         const struct bts_trace *trace;
         const unsigned char *at;
         int error, drained = 0;
  
-       trace = ds_read_bts(child->bts);
+       context = child->bts;
+       if (!context)
+               return -ESRCH;
+
+       trace = ds_read_bts(context->tracer);
         if (!trace)
-               return -EPERM;
+               return -ESRCH;
  
         if (!trace->read)
                 return -EOPNOTSUPP;
@@ -628,9 +747,8 @@ static int ptrace_bts_drain(struct task_struct *child,
         for (at = trace->ds.begin; (void *)at < trace->ds.top;
              out++, drained++, at += trace->ds.size) {
                 struct bts_struct bts;
-               int error;
  
-               error = trace->read(child->bts, at, &bts);
+               error = trace->read(context->tracer, at, &bts);
                 if (error < 0)
                         return error;
  
@@ -640,35 +758,18 @@ static int ptrace_bts_drain(struct task_struct *child,
  
         memset(trace->ds.begin, 0, trace->ds.n * trace->ds.size);
  
-       error = ds_reset_bts(child->bts);
+       error = ds_reset_bts(context->tracer);
         if (error < 0)
                 return error;
  
         return drained;
  }
  
-static int ptrace_bts_allocate_buffer(struct task_struct *child, size_t size)
-{
-       child->bts_buffer = alloc_locked_buffer(size);
-       if (!child->bts_buffer)
-               return -ENOMEM;
-
-       child->bts_size = size;
-
-       return 0;
-}
-
-static void ptrace_bts_free_buffer(struct task_struct *child)
-{
-       free_locked_buffer(child->bts_buffer, child->bts_size);
-       child->bts_buffer = NULL;
-       child->bts_size = 0;
-}
-
  static int ptrace_bts_config(struct task_struct *child,
                              long cfg_size,
                              const struct ptrace_bts_config __user *ucfg)
  {
+       struct bts_context *context;
         struct ptrace_bts_config cfg;
         unsigned int flags = 0;
  
@@ -678,28 +779,33 @@ static int ptrace_bts_config(struct task_struct *child,
         if (copy_from_user(&cfg, ucfg, sizeof(cfg)))
                 return -EFAULT;
  
-       if (child->bts) {
-               ds_release_bts(child->bts);
-               child->bts = NULL;
-       }
+       context = child->bts;
+       if (!context)
+               context = alloc_bts_context(child);
+       if (!context)
+               return -ENOMEM;
  
         if (cfg.flags & PTRACE_BTS_O_SIGNAL) {
                 if (!cfg.signal)
                         return -EINVAL;
  
-               child->thread.bts_ovfl_signal = cfg.signal;
                 return -EOPNOTSUPP;
+               context->bts_ovfl_signal = cfg.signal;
         }
  
-       if ((cfg.flags & PTRACE_BTS_O_ALLOC) &&
-           (cfg.size != child->bts_size)) {
-               int error;
+       ds_release_bts(context->tracer);
+       context->tracer = NULL;
  
-               ptrace_bts_free_buffer(child);
+       if ((cfg.flags & PTRACE_BTS_O_ALLOC) && (cfg.size != context->size)) {
+               int err;
  
-               error = ptrace_bts_allocate_buffer(child, cfg.size);
-               if (error < 0)
-                       return error;
+               free_bts_buffer(context);
+               if (!cfg.size)
+                       return 0;
+
+               err = alloc_bts_buffer(context, cfg.size);
+               if (err < 0)
+                       return err;
         }
  
         if (cfg.flags & PTRACE_BTS_O_TRACE)
@@ -708,15 +814,14 @@ static int ptrace_bts_config(struct task_struct *child,
         if (cfg.flags & PTRACE_BTS_O_SCHED)
                 flags |= BTS_TIMESTAMPS;
  
-       child->bts = ds_request_bts(child, child->bts_buffer, child->bts_size,
-                                   /* ovfl = */ NULL, /* th = */ (size_t)-1,
-                                   flags);
-       if (IS_ERR(child->bts)) {
-               int error = PTR_ERR(child->bts);
-
-               ptrace_bts_free_buffer(child);
-               child->bts = NULL;
+       context->tracer =
+               ds_request_bts_task(child, context->buffer, context->size,
+                                   NULL, (size_t)-1, flags);
+       if (unlikely(IS_ERR(context->tracer))) {
+               int error = PTR_ERR(context->tracer);
  
+               free_bts_buffer(context);
+               context->tracer = NULL;
                 return error;
         }
  
@@ -727,20 +832,25 @@ static int ptrace_bts_status(struct task_struct *child,
                              long cfg_size,
                              struct ptrace_bts_config __user *ucfg)
  {
+       struct bts_context *context;
         const struct bts_trace *trace;
         struct ptrace_bts_config cfg;
  
+       context = child->bts;
+       if (!context)
+               return -ESRCH;
+
         if (cfg_size < sizeof(cfg))
                 return -EIO;
  
-       trace = ds_read_bts(child->bts);
+       trace = ds_read_bts(context->tracer);
         if (!trace)
-               return -EPERM;
+               return -ESRCH;
  
         memset(&cfg, 0, sizeof(cfg));
-       cfg.size = trace->ds.end - trace->ds.begin;
-       cfg.signal = child->thread.bts_ovfl_signal;
-       cfg.bts_size = sizeof(struct bts_struct);
+       cfg.size        = trace->ds.end - trace->ds.begin;
+       cfg.signal      = context->bts_ovfl_signal;
+       cfg.bts_size    = sizeof(struct bts_struct);
  
         if (cfg.signal)
                 cfg.flags |= PTRACE_BTS_O_SIGNAL;
@@ -759,80 +869,51 @@ static int ptrace_bts_status(struct task_struct *child,
  
  static int ptrace_bts_clear(struct task_struct *child)
  {
+       struct bts_context *context;
         const struct bts_trace *trace;
  
-       trace = ds_read_bts(child->bts);
+       context = child->bts;
+       if (!context)
+               return -ESRCH;
+
+       trace = ds_read_bts(context->tracer);
         if (!trace)
-               return -EPERM;
+               return -ESRCH;
  
         memset(trace->ds.begin, 0, trace->ds.n * trace->ds.size);
  
-       return ds_reset_bts(child->bts);
+       return ds_reset_bts(context->tracer);
  }
  
  static int ptrace_bts_size(struct task_struct *child)
  {
+       struct bts_context *context;
         const struct bts_trace *trace;
  
-       trace = ds_read_bts(child->bts);
+       context = child->bts;
+       if (!context)
+               return -ESRCH;
+
+       trace = ds_read_bts(context->tracer);
         if (!trace)
-               return -EPERM;
+               return -ESRCH;
  
         return (trace->ds.top - trace->ds.begin) / trace->ds.size;
  }
  
-static void ptrace_bts_fork(struct task_struct *tsk)
-{
-       tsk->bts = NULL;
-       tsk->bts_buffer = NULL;
-       tsk->bts_size = 0;
-       tsk->thread.bts_ovfl_signal = 0;
-}
-
-static void ptrace_bts_untrace(struct task_struct *child)
+/*
+ * Called from __ptrace_unlink() after the child has been moved back
+ * to its original parent.
+ */
+void ptrace_bts_untrace(struct task_struct *child)
  {
         if (unlikely(child->bts)) {
-               ds_release_bts(child->bts);
+               free_bts_context(child->bts);
                 child->bts = NULL;
-
-               /* We cannot update total_vm and locked_vm since
-                  child's mm is already gone. But we can reclaim the
-                  memory. */
-               kfree(child->bts_buffer);
-               child->bts_buffer = NULL;
-               child->bts_size = 0;
         }
  }
-
-static void ptrace_bts_detach(struct task_struct *child)
-{
-       /*
-        * Ptrace_detach() races with ptrace_untrace() in case
-        * the child dies and is reaped by another thread.
-        *
-        * We only do the memory accounting at this point and
-        * leave the buffer deallocation and the bts tracer
-        * release to ptrace_bts_untrace() which will be called
-        * later on with tasklist_lock held.
-        */
-       release_locked_buffer(child->bts_buffer, child->bts_size);
-}
-#else
-static inline void ptrace_bts_fork(struct task_struct *tsk) {}
-static inline void ptrace_bts_detach(struct task_struct *child) {}
-static inline void ptrace_bts_untrace(struct task_struct *child) {}
  #endif /* CONFIG_X86_PTRACE_BTS */
  
-void x86_ptrace_fork(struct task_struct *child, unsigned long clone_flags)
-{
-       ptrace_bts_fork(child);
-}
-
-void x86_ptrace_untrace(struct task_struct *child)
-{
-       ptrace_bts_untrace(child);
-}
-
  /*
   * Called by kernel/ptrace.c when detaching..
   *
@@ -844,7 +925,6 @@ void ptrace_disable(struct task_struct *child)
  #ifdef TIF_SYSCALL_EMU
         clear_tsk_thread_flag(child, TIF_SYSCALL_EMU);
  #endif
-       ptrace_bts_detach(child);
  }
  
  #if defined CONFIG_X86_32 || defined CONFIG_IA32_EMULATION
diff --git a/arch/x86/kernel/quirks.c b/arch/x86/kernel/quirks.c

index 7563b31b4f0349f45bf324053a67ecb506a4fced..af71d06624bf5b70b5173ba29238c40b6c21c801 100644 (file)
--- a/arch/x86/kernel/quirks.c
+++ b/arch/x86/kernel/quirks.c
@@ -491,5 +491,42 @@ void force_hpet_resume(void)
                 break;
         }
  }
+#endif
+
+#if defined(CONFIG_PCI) && defined(CONFIG_NUMA)
+/* Set correct numa_node information for AMD NB functions */
+static void __init quirk_amd_nb_node(struct pci_dev *dev)
+{
+       struct pci_dev *nb_ht;
+       unsigned int devfn;
+       u32 val;
+
+       devfn = PCI_DEVFN(PCI_SLOT(dev->devfn), 0);
+       nb_ht = pci_get_slot(dev->bus, devfn);
+       if (!nb_ht)
+               return;
+
+       pci_read_config_dword(nb_ht, 0x60, &val);
+       set_dev_node(&dev->dev, val & 7);
+       pci_dev_put(dev);
+}
  
+DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_K8_NB,
+                       quirk_amd_nb_node);
+DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_K8_NB_ADDRMAP,
+                       quirk_amd_nb_node);
+DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_K8_NB_MEMCTL,
+                       quirk_amd_nb_node);
+DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_K8_NB_MISC,
+                       quirk_amd_nb_node);
+DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_10H_NB_HT,
+                       quirk_amd_nb_node);
+DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_10H_NB_MAP,
+                       quirk_amd_nb_node);
+DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_10H_NB_DRAM,
+                       quirk_amd_nb_node);
+DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_10H_NB_MISC,
+                       quirk_amd_nb_node);
+DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_10H_NB_LINK,
+                       quirk_amd_nb_node);
  #endif
diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c

index 667188e0b5a0bfac320d69a0f734dda03b5bee88..d2d1ce8170f06c8573ac022f4e49dfc59a371c9b 100644 (file)
--- a/arch/x86/kernel/reboot.c
+++ b/arch/x86/kernel/reboot.c
@@ -192,6 +192,15 @@ static struct dmi_system_id __initdata reboot_dmi_table[] = {
                         DMI_MATCH(DMI_BOARD_NAME, "0KP561"),
                 },
         },
+       {   /* Handle problems with rebooting on Dell Optiplex 360 with 0T656F */
+               .callback = set_bios_reboot,
+               .ident = "Dell OptiPlex 360",
+               .matches = {
+                       DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."),
+                       DMI_MATCH(DMI_PRODUCT_NAME, "OptiPlex 360"),
+                       DMI_MATCH(DMI_BOARD_NAME, "0T656F"),
+               },
+       },
         {       /* Handle problems with rebooting on Dell 2400's */
                 .callback = set_bios_reboot,
                 .ident = "Dell PowerEdge 2400",
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c

index b4158439bf634d254852cceab2c30d26f943f7ea..d1c636bf31a71018d73f7e36616d2c7d533707ea 100644 (file)
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -112,6 +112,14 @@
  #define ARCH_SETUP
  #endif
  
+/*
+ * end_pfn only includes RAM, while max_pfn_mapped includes all e820 entries.
+ * The direct mapping extends to max_pfn_mapped, so that we can directly access
+ * apertures, ACPI and other tables without having to play with fixmaps.
+ */
+unsigned long max_low_pfn_mapped;
+unsigned long max_pfn_mapped;
+
  RESERVE_BRK(dmi_alloc, 65536);
  
  unsigned int boot_cpu_id __read_mostly;
@@ -214,8 +222,8 @@ unsigned long mmu_cr4_features;
  unsigned long mmu_cr4_features = X86_CR4_PAE;
  #endif
  
-/* Boot loader ID as an integer, for the benefit of proc_dointvec */
-int bootloader_type;
+/* Boot loader ID and version as integers, for the benefit of proc_dointvec */
+int bootloader_type, bootloader_version;
  
  /*
   * Setup options
@@ -706,6 +714,12 @@ void __init setup_arch(char **cmdline_p)
  #endif
         saved_video_mode = boot_params.hdr.vid_mode;
         bootloader_type = boot_params.hdr.type_of_loader;
+       if ((bootloader_type >> 4) == 0xe) {
+               bootloader_type &= 0xf;
+               bootloader_type |= (boot_params.hdr.ext_loader_type+0x10) << 4;
+       }
+       bootloader_version  = bootloader_type & 0xf;
+       bootloader_version |= boot_params.hdr.ext_loader_ver << 4;
  
  #ifdef CONFIG_BLK_DEV_RAM
         rd_image_start = boot_params.hdr.ram_size & RAMDISK_IMAGE_START_MASK;
@@ -854,12 +868,16 @@ void __init setup_arch(char **cmdline_p)
                 max_low_pfn = max_pfn;
  
         high_memory = (void *)__va(max_pfn * PAGE_SIZE - 1) + 1;
+       max_pfn_mapped = KERNEL_IMAGE_SIZE >> PAGE_SHIFT;
  #endif
  
  #ifdef CONFIG_X86_CHECK_BIOS_CORRUPTION
         setup_bios_corruption_check();
  #endif
  
+       printk(KERN_DEBUG "initial memory mapped : 0 - %08lx\n",
+                       max_pfn_mapped<<PAGE_SHIFT);
+
         reserve_brk();
  
         /* max_pfn_mapped is updated here */
@@ -996,24 +1014,6 @@ void __init setup_arch(char **cmdline_p)
  
  #ifdef CONFIG_X86_32
  
-/**
- * x86_quirk_pre_intr_init - initialisation prior to setting up interrupt vectors
- *
- * Description:
- *     Perform any necessary interrupt initialisation prior to setting up
- *     the "ordinary" interrupt call gates.  For legacy reasons, the ISA
- *     interrupts should be initialised here if the machine emulates a PC
- *     in any way.
- **/
-void __init x86_quirk_pre_intr_init(void)
-{
-       if (x86_quirks->arch_pre_intr_init) {
-               if (x86_quirks->arch_pre_intr_init())
-                       return;
-       }
-       init_ISA_irqs();
-}
-
  /**
   * x86_quirk_intr_init - post gate setup interrupt initialisation
   *
diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c

index 8f0e13be36b31d1a80c94dec2692504d2a9a86db..9c3f0823e6aa00ea93bec0babdfd2c0b19971d13 100644 (file)
--- a/arch/x86/kernel/setup_percpu.c
+++ b/arch/x86/kernel/setup_percpu.c
@@ -425,6 +425,14 @@ void __init setup_per_cpu_areas(void)
         early_per_cpu_ptr(x86_cpu_to_node_map) = NULL;
  #endif
  
+#if defined(CONFIG_X86_64) && defined(CONFIG_NUMA)
+       /*
+        * make sure boot cpu node_number is right, when boot cpu is on the
+        * node that doesn't have mem installed
+        */
+       per_cpu(node_number, boot_cpu_id) = cpu_to_node(boot_cpu_id);
+#endif
+
         /* Setup node to cpumask map */
         setup_node_to_cpumask_map();
  
diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c

index 14425166b8e3f4838073ce6052aef507aa62510a..0a813b17b172437760b6d8beae71104f6fc878ba 100644 (file)
--- a/arch/x86/kernel/signal.c
+++ b/arch/x86/kernel/signal.c
@@ -6,7 +6,6 @@
   *  2000-06-20  Pentium III FXSR, SSE support by Gareth Hughes
   *  2000-2002   x86-64 support by Andi Kleen
   */
-
  #include <linux/sched.h>
  #include <linux/mm.h>
  #include <linux/smp.h>
diff --git a/arch/x86/kernel/smp.c b/arch/x86/kernel/smp.c

index 13f33ea8ccaaf1ae8e23cc2a39f57d84d6d3c910..f6db48c405b81fca925988fa0fc408f078793722 100644 (file)
--- a/arch/x86/kernel/smp.c
+++ b/arch/x86/kernel/smp.c
@@ -193,19 +193,19 @@ void smp_call_function_single_interrupt(struct pt_regs *regs)
  }
  
  struct smp_ops smp_ops = {
-       .smp_prepare_boot_cpu = native_smp_prepare_boot_cpu,
-       .smp_prepare_cpus = native_smp_prepare_cpus,
-       .smp_cpus_done = native_smp_cpus_done,
+       .smp_prepare_boot_cpu   = native_smp_prepare_boot_cpu,
+       .smp_prepare_cpus       = native_smp_prepare_cpus,
+       .smp_cpus_done          = native_smp_cpus_done,
  
-       .smp_send_stop = native_smp_send_stop,
-       .smp_send_reschedule = native_smp_send_reschedule,
+       .smp_send_stop          = native_smp_send_stop,
+       .smp_send_reschedule    = native_smp_send_reschedule,
  
-       .cpu_up = native_cpu_up,
-       .cpu_die = native_cpu_die,
-       .cpu_disable = native_cpu_disable,
-       .play_dead = native_play_dead,
+       .cpu_up                 = native_cpu_up,
+       .cpu_die                = native_cpu_die,
+       .cpu_disable            = native_cpu_disable,
+       .play_dead              = native_play_dead,
  
-       .send_call_func_ipi = native_send_call_func_ipi,
+       .send_call_func_ipi     = native_send_call_func_ipi,
         .send_call_func_single_ipi = native_send_call_func_single_ipi,
  };
  EXPORT_SYMBOL_GPL(smp_ops);
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c

index 58d24ef917d8b46e7c7faedc4dd3fc79266ad0cb..7c80007ea5f7fb28ce9e61c4569f3c3432981b1d 100644 (file)
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -504,7 +504,7 @@ void __inquire_remote_apic(int apicid)
   * INIT, INIT, STARTUP sequence will reset the chip hard for us, and this
   * won't ... remember to clear down the APIC, etc later.
   */
-int __devinit
+int __cpuinit
  wakeup_secondary_cpu_via_nmi(int logical_apicid, unsigned long start_eip)
  {
         unsigned long send_status, accept_status = 0;
@@ -538,7 +538,7 @@ wakeup_secondary_cpu_via_nmi(int logical_apicid, unsigned long start_eip)
         return (send_status | accept_status);
  }
  
-int __devinit
+static int __cpuinit
  wakeup_secondary_cpu_via_init(int phys_apicid, unsigned long start_eip)
  {
         unsigned long send_status, accept_status = 0;
@@ -822,10 +822,12 @@ do_rest:
         /* mark "stuck" area as not stuck */
         *((volatile unsigned long *)trampoline_base) = 0;
  
-       /*
-        * Cleanup possible dangling ends...
-        */
-       smpboot_restore_warm_reset_vector();
+       if (get_uv_system_type() != UV_NON_UNIQUE_APIC) {
+               /*
+                * Cleanup possible dangling ends...
+                */
+               smpboot_restore_warm_reset_vector();
+       }
  
         return boot_error;
  }
@@ -990,10 +992,12 @@ static int __init smp_sanity_check(unsigned max_cpus)
          */
         if (APIC_INTEGRATED(apic_version[boot_cpu_physical_apicid]) &&
             !cpu_has_apic) {
-               printk(KERN_ERR "BIOS bug, local APIC #%d not detected!...\n",
-                       boot_cpu_physical_apicid);
-               printk(KERN_ERR "... forcing use of dummy APIC emulation."
+               if (!disable_apic) {
+                       pr_err("BIOS bug, local APIC #%d not detected!...\n",
+                               boot_cpu_physical_apicid);
+                       pr_err("... forcing use of dummy APIC emulation."
                                 "(tell your hw vendor)\n");
+               }
                 smpboot_clear_io_apic();
                 arch_disable_smp_support();
                 return -1;
diff --git a/arch/x86/kernel/stacktrace.c b/arch/x86/kernel/stacktrace.c

index f7bddc2e37d1bbf19a86f03b9e3a3a0da70ab199..4aaf7e48394fb562343f27d811e7a9f329be2b76 100644 (file)
--- a/arch/x86/kernel/stacktrace.c
+++ b/arch/x86/kernel/stacktrace.c
@@ -20,7 +20,7 @@ save_stack_warning_symbol(void *data, char *msg, unsigned long symbol)
  
  static int save_stack_stack(void *data, char *name)
  {
-       return -1;
+       return 0;
  }
  
  static void save_stack_address(void *data, unsigned long addr, int reliable)
diff --git a/arch/x86/kernel/syscall_table_32.S b/arch/x86/kernel/syscall_table_32.S

index ff5c8736b491b8ff2c4835c5a19a51ad63633de1..d51321ddafda88885ddbbd03f9c1ef5a482037d9 100644 (file)
--- a/arch/x86/kernel/syscall_table_32.S
+++ b/arch/x86/kernel/syscall_table_32.S
@@ -334,3 +334,5 @@ ENTRY(sys_call_table)
         .long sys_inotify_init1
         .long sys_preadv
         .long sys_pwritev
+       .long sys_rt_tgsigqueueinfo     /* 335 */
+       .long sys_perf_counter_open
diff --git a/arch/x86/kernel/tlb_uv.c b/arch/x86/kernel/tlb_uv.c

index 8c7b03b0cfcb47b1585187e95a67890137218c5d..124d40c575df376e989d5b00229fda865f0bb682 100644 (file)
--- a/arch/x86/kernel/tlb_uv.c
+++ b/arch/x86/kernel/tlb_uv.c
@@ -715,7 +715,12 @@ uv_activation_descriptor_init(int node, int pnode)
         struct bau_desc *adp;
         struct bau_desc *ad2;
  
-       adp = (struct bau_desc *)kmalloc_node(16384, GFP_KERNEL, node);
+       /*
+        * each bau_desc is 64 bytes; there are 8 (UV_ITEMS_PER_DESCRIPTOR)
+        * per cpu; and up to 32 (UV_ADP_SIZE) cpu's per blade
+        */
+       adp = (struct bau_desc *)kmalloc_node(sizeof(struct bau_desc)*
+               UV_ADP_SIZE*UV_ITEMS_PER_DESCRIPTOR, GFP_KERNEL, node);
         BUG_ON(!adp);
  
         pa = uv_gpa(adp); /* need the real nasid*/
@@ -729,7 +734,13 @@ uv_activation_descriptor_init(int node, int pnode)
                                       (n << UV_DESC_BASE_PNODE_SHIFT | m));
         }
  
-       for (i = 0, ad2 = adp; i < UV_ACTIVATION_DESCRIPTOR_SIZE; i++, ad2++) {
+       /*
+        * initializing all 8 (UV_ITEMS_PER_DESCRIPTOR) descriptors for each
+        * cpu even though we only use the first one; one descriptor can
+        * describe a broadcast to 256 nodes.
+        */
+       for (i = 0, ad2 = adp; i < (UV_ADP_SIZE*UV_ITEMS_PER_DESCRIPTOR);
+               i++, ad2++) {
                 memset(ad2, 0, sizeof(struct bau_desc));
                 ad2->header.sw_ack_flag = 1;
                 /*
diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c

index a1d288327ff0ff0f152241bf296c742778f61e00..07d60c870ce20e37df710bb7bbb883e911fb043d 100644 (file)
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -839,9 +839,6 @@ asmlinkage void math_state_restore(void)
         }
  
         clts();                         /* Allow maths ops (or we recurse) */
-#ifdef CONFIG_X86_32
-       restore_fpu(tsk);
-#else
         /*
          * Paranoid restore. send a SIGSEGV if we fail to restore the state.
          */
@@ -850,7 +847,7 @@ asmlinkage void math_state_restore(void)
                 force_sig(SIGSEGV, tsk);
                 return;
         }
-#endif
+
         thread->status |= TS_USEDFPU;   /* So we fnsave on switch_to() */
         tsk->fpu_counter++;
  }
@@ -945,8 +942,13 @@ void __init trap_init(void)
  #endif
         set_intr_gate(19, &simd_coprocessor_error);
  
+       /* Reserve all the builtin and the syscall vector: */
+       for (i = 0; i < FIRST_EXTERNAL_VECTOR; i++)
+               set_bit(i, used_vectors);
+
  #ifdef CONFIG_IA32_EMULATION
         set_system_intr_gate(IA32_SYSCALL_VECTOR, ia32_syscall);
+       set_bit(IA32_SYSCALL_VECTOR, used_vectors);
  #endif
  
  #ifdef CONFIG_X86_32
@@ -963,17 +965,9 @@ void __init trap_init(void)
         }
  
         set_system_trap_gate(SYSCALL_VECTOR, &system_call);
-#endif
-
-       /* Reserve all the builtin and the syscall vector: */
-       for (i = 0; i < FIRST_EXTERNAL_VECTOR; i++)
-               set_bit(i, used_vectors);
-
-#ifdef CONFIG_X86_64
-       set_bit(IA32_SYSCALL_VECTOR, used_vectors);
-#else
         set_bit(SYSCALL_VECTOR, used_vectors);
  #endif
+
         /*
          * Should be a barrier for any external CPU state:
          */
diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c

index d57de05dc43093e0831ebb544a2baddf3d72f184..3e1c057e98fe039325eed2056d8960fa52f023c4 100644 (file)
--- a/arch/x86/kernel/tsc.c
+++ b/arch/x86/kernel/tsc.c
@@ -384,13 +384,13 @@ unsigned long native_calibrate_tsc(void)
  {
         u64 tsc1, tsc2, delta, ref1, ref2;
         unsigned long tsc_pit_min = ULONG_MAX, tsc_ref_min = ULONG_MAX;
-       unsigned long flags, latch, ms, fast_calibrate, tsc_khz;
+       unsigned long flags, latch, ms, fast_calibrate, hv_tsc_khz;
         int hpet = is_hpet_enabled(), i, loopmin;
  
-       tsc_khz = get_hypervisor_tsc_freq();
-       if (tsc_khz) {
+       hv_tsc_khz = get_hypervisor_tsc_freq();
+       if (hv_tsc_khz) {
                 printk(KERN_INFO "TSC: Frequency read from the hypervisor\n");
-               return tsc_khz;
+               return hv_tsc_khz;
         }
  
         local_irq_save(flags);
@@ -710,7 +710,16 @@ static cycle_t read_tsc(struct clocksource *cs)
  #ifdef CONFIG_X86_64
  static cycle_t __vsyscall_fn vread_tsc(void)
  {
-       cycle_t ret = (cycle_t)vget_cycles();
+       cycle_t ret;
+
+       /*
+        * Surround the RDTSC by barriers, to make sure it's not
+        * speculated to outside the seqlock critical section and
+        * does not cause time warps:
+        */
+       rdtsc_barrier();
+       ret = (cycle_t)vget_cycles();
+       rdtsc_barrier();
  
         return ret >= __vsyscall_gtod_data.clock.cycle_last ?
                 ret : __vsyscall_gtod_data.clock.cycle_last;
diff --git a/arch/x86/kernel/tsc_sync.c b/arch/x86/kernel/tsc_sync.c

index bf36328f6ef9289c4b1a1a7d0ffd9c67878f3f32..027b5b498993b6f0606367e2803f6fc9f15dca2c 100644 (file)
--- a/arch/x86/kernel/tsc_sync.c
+++ b/arch/x86/kernel/tsc_sync.c
@@ -34,6 +34,7 @@ static __cpuinitdata atomic_t stop_count;
   * of a critical section, to be able to prove TSC time-warps:
   */
  static __cpuinitdata raw_spinlock_t sync_lock = __RAW_SPIN_LOCK_UNLOCKED;
+
  static __cpuinitdata cycles_t last_tsc;
  static __cpuinitdata cycles_t max_warp;
  static __cpuinitdata int nr_warps;
@@ -113,13 +114,12 @@ void __cpuinit check_tsc_sync_source(int cpu)
                 return;
  
         if (boot_cpu_has(X86_FEATURE_TSC_RELIABLE)) {
-               printk(KERN_INFO
-                      "Skipping synchronization checks as TSC is reliable.\n");
+               pr_info("Skipping synchronization checks as TSC is reliable.\n");
                 return;
         }
  
-       printk(KERN_INFO "checking TSC synchronization [CPU#%d -> CPU#%d]:",
-                         smp_processor_id(), cpu);
+       pr_info("checking TSC synchronization [CPU#%d -> CPU#%d]:",
+               smp_processor_id(), cpu);
  
         /*
          * Reset it - in case this is a second bootup:
@@ -143,8 +143,8 @@ void __cpuinit check_tsc_sync_source(int cpu)
  
         if (nr_warps) {
                 printk("\n");
-               printk(KERN_WARNING "Measured %Ld cycles TSC warp between CPUs,"
-                                   " turning off TSC clock.\n", max_warp);
+               pr_warning("Measured %Ld cycles TSC warp between CPUs, "
+                          "turning off TSC clock.\n", max_warp);
                 mark_tsc_unstable("check_tsc_sync_source failed");
         } else {
                 printk(" passed.\n");
@@ -195,5 +195,3 @@ void __cpuinit check_tsc_sync_target(void)
         while (atomic_read(&stop_count) != cpus)
                 cpu_relax();
  }
-#undef NR_LOOPS
-
diff --git a/arch/x86/kernel/vm86_32.c b/arch/x86/kernel/vm86_32.c

index d7ac84e7fc1c73f7cfcfd8ca6b66c5abed2219ce..9c4e625390589ac69e984edded55d13d6200a031 100644 (file)
--- a/arch/x86/kernel/vm86_32.c
+++ b/arch/x86/kernel/vm86_32.c
@@ -287,10 +287,9 @@ static void do_sys_vm86(struct kernel_vm86_struct *info, struct task_struct *tsk
         info->regs.pt.ds = 0;
         info->regs.pt.es = 0;
         info->regs.pt.fs = 0;
-
-/* we are clearing gs later just before "jmp resume_userspace",
- * because it is not saved/restored.
- */
+#ifndef CONFIG_X86_32_LAZY_GS
+       info->regs.pt.gs = 0;
+#endif
  
  /*
   * The flags register is also special: we cannot trust that the user
@@ -318,9 +317,9 @@ static void do_sys_vm86(struct kernel_vm86_struct *info, struct task_struct *tsk
         }
  
  /*
- * Save old state, set default return value (%ax) to 0
+ * Save old state, set default return value (%ax) to 0 (VM86_SIGNAL)
   */
-       info->regs32->ax = 0;
+       info->regs32->ax = VM86_SIGNAL;
         tsk->thread.saved_sp0 = tsk->thread.sp0;
         tsk->thread.saved_fs = info->regs32->fs;
         tsk->thread.saved_gs = get_user_gs(info->regs32);
@@ -343,7 +342,9 @@ static void do_sys_vm86(struct kernel_vm86_struct *info, struct task_struct *tsk
         __asm__ __volatile__(
                 "movl %0,%%esp\n\t"
                 "movl %1,%%ebp\n\t"
+#ifdef CONFIG_X86_32_LAZY_GS
                 "mov  %2, %%gs\n\t"
+#endif
                 "jmp resume_userspace"
                 : /* no outputs */
                 :"r" (&info->regs), "r" (task_thread_info(tsk)), "r" (0));
diff --git a/arch/x86/kernel/vmi_32.c b/arch/x86/kernel/vmi_32.c

index 95deb9f2211e98decb5572e4757bc2cc48f3da18..b263423fbe2ae971424c5bd99c112ac984413383 100644 (file)
--- a/arch/x86/kernel/vmi_32.c
+++ b/arch/x86/kernel/vmi_32.c
@@ -462,22 +462,28 @@ vmi_startup_ipi_hook(int phys_apicid, unsigned long start_eip,
  }
  #endif
  
-static void vmi_enter_lazy_cpu(void)
+static void vmi_start_context_switch(struct task_struct *prev)
  {
-       paravirt_enter_lazy_cpu();
+       paravirt_start_context_switch(prev);
         vmi_ops.set_lazy_mode(2);
  }
  
+static void vmi_end_context_switch(struct task_struct *next)
+{
+       vmi_ops.set_lazy_mode(0);
+       paravirt_end_context_switch(next);
+}
+
  static void vmi_enter_lazy_mmu(void)
  {
         paravirt_enter_lazy_mmu();
         vmi_ops.set_lazy_mode(1);
  }
  
-static void vmi_leave_lazy(void)
+static void vmi_leave_lazy_mmu(void)
  {
-       paravirt_leave_lazy(paravirt_get_lazy_mode());
         vmi_ops.set_lazy_mode(0);
+       paravirt_leave_lazy_mmu();
  }
  
  static inline int __init check_vmi_rom(struct vrom_header *rom)
@@ -711,14 +717,14 @@ static inline int __init activate_vmi(void)
         para_fill(pv_cpu_ops.set_iopl_mask, SetIOPLMask);
         para_fill(pv_cpu_ops.io_delay, IODelay);
  
-       para_wrap(pv_cpu_ops.lazy_mode.enter, vmi_enter_lazy_cpu,
+       para_wrap(pv_cpu_ops.start_context_switch, vmi_start_context_switch,
                   set_lazy_mode, SetLazyMode);
-       para_wrap(pv_cpu_ops.lazy_mode.leave, vmi_leave_lazy,
+       para_wrap(pv_cpu_ops.end_context_switch, vmi_end_context_switch,
                   set_lazy_mode, SetLazyMode);
  
         para_wrap(pv_mmu_ops.lazy_mode.enter, vmi_enter_lazy_mmu,
                   set_lazy_mode, SetLazyMode);
-       para_wrap(pv_mmu_ops.lazy_mode.leave, vmi_leave_lazy,
+       para_wrap(pv_mmu_ops.lazy_mode.leave, vmi_leave_lazy_mmu,
                   set_lazy_mode, SetLazyMode);
  
         /* user and kernel flush are just handled with different flags to FlushTLB */
diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S

index 849ee611f01388684ff9102838bef8a4fe38464b..4c85b2e2bb652873da1ee5367549e070b363bf8a 100644 (file)
--- a/arch/x86/kernel/vmlinux.lds.S
+++ b/arch/x86/kernel/vmlinux.lds.S
@@ -1,5 +1,431 @@
+/*
+ * ld script for the x86 kernel
+ *
+ * Historic 32-bit version written by Martin Mares <mj@atrey.karlin.mff.cuni.cz>
+ *
+ * Modernisation, unification and other changes and fixes:
+ *   Copyright (C) 2007-2009  Sam Ravnborg <sam@ravnborg.org>
+ *
+ *
+ * Don't define absolute symbols until and unless you know that symbol
+ * value is should remain constant even if kernel image is relocated
+ * at run time. Absolute symbols are not relocated. If symbol value should
+ * change if kernel is relocated, make the symbol section relative and
+ * put it inside the section definition.
+ */
+
  #ifdef CONFIG_X86_32
-# include "vmlinux_32.lds.S"
+#define LOAD_OFFSET __PAGE_OFFSET
  #else
-# include "vmlinux_64.lds.S"
+#define LOAD_OFFSET __START_KERNEL_map
  #endif
+
+#include <asm-generic/vmlinux.lds.h>
+#include <asm/asm-offsets.h>
+#include <asm/thread_info.h>
+#include <asm/page_types.h>
+#include <asm/cache.h>
+#include <asm/boot.h>
+
+#undef i386     /* in case the preprocessor is a 32bit one */
+
+OUTPUT_FORMAT(CONFIG_OUTPUT_FORMAT, CONFIG_OUTPUT_FORMAT, CONFIG_OUTPUT_FORMAT)
+
+#ifdef CONFIG_X86_32
+OUTPUT_ARCH(i386)
+ENTRY(phys_startup_32)
+jiffies = jiffies_64;
+#else
+OUTPUT_ARCH(i386:x86-64)
+ENTRY(phys_startup_64)
+jiffies_64 = jiffies;
+#endif
+
+PHDRS {
+       text PT_LOAD FLAGS(5);          /* R_E */
+       data PT_LOAD FLAGS(7);          /* RWE */
+#ifdef CONFIG_X86_64
+       user PT_LOAD FLAGS(7);          /* RWE */
+       data.init PT_LOAD FLAGS(7);     /* RWE */
+#ifdef CONFIG_SMP
+       percpu PT_LOAD FLAGS(7);        /* RWE */
+#endif
+       data.init2 PT_LOAD FLAGS(7);    /* RWE */
+#endif
+       note PT_NOTE FLAGS(0);          /* ___ */
+}
+
+SECTIONS
+{
+#ifdef CONFIG_X86_32
+        . = LOAD_OFFSET + LOAD_PHYSICAL_ADDR;
+        phys_startup_32 = startup_32 - LOAD_OFFSET;
+#else
+        . = __START_KERNEL;
+        phys_startup_64 = startup_64 - LOAD_OFFSET;
+#endif
+
+       /* Text and read-only data */
+
+       /* bootstrapping code */
+       .text.head : AT(ADDR(.text.head) - LOAD_OFFSET) {
+               _text = .;
+               *(.text.head)
+       } :text = 0x9090
+
+       /* The rest of the text */
+       .text :  AT(ADDR(.text) - LOAD_OFFSET) {
+#ifdef CONFIG_X86_32
+               /* not really needed, already page aligned */
+               . = ALIGN(PAGE_SIZE);
+               *(.text.page_aligned)
+#endif
+               . = ALIGN(8);
+               _stext = .;
+               TEXT_TEXT
+               SCHED_TEXT
+               LOCK_TEXT
+               KPROBES_TEXT
+               IRQENTRY_TEXT
+               *(.fixup)
+               *(.gnu.warning)
+               /* End of text section */
+               _etext = .;
+       } :text = 0x9090
+
+       NOTES :text :note
+
+       /* Exception table */
+       . = ALIGN(16);
+       __ex_table : AT(ADDR(__ex_table) - LOAD_OFFSET) {
+               __start___ex_table = .;
+               *(__ex_table)
+               __stop___ex_table = .;
+       } :text = 0x9090
+
+       RODATA
+
+       /* Data */
+       . = ALIGN(PAGE_SIZE);
+       .data : AT(ADDR(.data) - LOAD_OFFSET) {
+               DATA_DATA
+               CONSTRUCTORS
+
+#ifdef CONFIG_X86_64
+               /* End of data section */
+               _edata = .;
+#endif
+       } :data
+
+#ifdef CONFIG_X86_32
+       /* 32 bit has nosave before _edata */
+       . = ALIGN(PAGE_SIZE);
+       .data_nosave : AT(ADDR(.data_nosave) - LOAD_OFFSET) {
+               __nosave_begin = .;
+               *(.data.nosave)
+               . = ALIGN(PAGE_SIZE);
+               __nosave_end = .;
+       }
+#endif
+
+       . = ALIGN(PAGE_SIZE);
+       .data.page_aligned : AT(ADDR(.data.page_aligned) - LOAD_OFFSET) {
+               *(.data.page_aligned)
+               *(.data.idt)
+       }
+
+#ifdef CONFIG_X86_32
+       . = ALIGN(32);
+#else
+       . = ALIGN(PAGE_SIZE);
+       . = ALIGN(CONFIG_X86_L1_CACHE_BYTES);
+#endif
+       .data.cacheline_aligned :
+               AT(ADDR(.data.cacheline_aligned) - LOAD_OFFSET) {
+               *(.data.cacheline_aligned)
+       }
+
+       /* rarely changed data like cpu maps */
+#ifdef CONFIG_X86_32
+       . = ALIGN(32);
+#else
+       . = ALIGN(CONFIG_X86_INTERNODE_CACHE_BYTES);
+#endif
+       .data.read_mostly : AT(ADDR(.data.read_mostly) - LOAD_OFFSET) {
+               *(.data.read_mostly)
+
+#ifdef CONFIG_X86_32
+               /* End of data section */
+               _edata = .;
+#endif
+       }
+
+#ifdef CONFIG_X86_64
+
+#define VSYSCALL_ADDR (-10*1024*1024)
+#define VSYSCALL_PHYS_ADDR ((LOADADDR(.data.read_mostly) + \
+                            SIZEOF(.data.read_mostly) + 4095) & ~(4095))
+#define VSYSCALL_VIRT_ADDR ((ADDR(.data.read_mostly) + \
+                            SIZEOF(.data.read_mostly) + 4095) & ~(4095))
+
+#define VLOAD_OFFSET (VSYSCALL_ADDR - VSYSCALL_PHYS_ADDR)
+#define VLOAD(x) (ADDR(x) - VLOAD_OFFSET)
+
+#define VVIRT_OFFSET (VSYSCALL_ADDR - VSYSCALL_VIRT_ADDR)
+#define VVIRT(x) (ADDR(x) - VVIRT_OFFSET)
+
+       . = VSYSCALL_ADDR;
+       .vsyscall_0 : AT(VSYSCALL_PHYS_ADDR) {
+               *(.vsyscall_0)
+       } :user
+
+       __vsyscall_0 = VSYSCALL_VIRT_ADDR;
+
+       . = ALIGN(CONFIG_X86_L1_CACHE_BYTES);
+       .vsyscall_fn : AT(VLOAD(.vsyscall_fn)) {
+               *(.vsyscall_fn)
+       }
+
+       . = ALIGN(CONFIG_X86_L1_CACHE_BYTES);
+       .vsyscall_gtod_data : AT(VLOAD(.vsyscall_gtod_data)) {
+               *(.vsyscall_gtod_data)
+       }
+
+       vsyscall_gtod_data = VVIRT(.vsyscall_gtod_data);
+       .vsyscall_clock : AT(VLOAD(.vsyscall_clock)) {
+               *(.vsyscall_clock)
+       }
+       vsyscall_clock = VVIRT(.vsyscall_clock);
+
+
+       .vsyscall_1 ADDR(.vsyscall_0) + 1024: AT(VLOAD(.vsyscall_1)) {
+               *(.vsyscall_1)
+       }
+       .vsyscall_2 ADDR(.vsyscall_0) + 2048: AT(VLOAD(.vsyscall_2)) {
+               *(.vsyscall_2)
+       }
+
+       .vgetcpu_mode : AT(VLOAD(.vgetcpu_mode)) {
+               *(.vgetcpu_mode)
+       }
+       vgetcpu_mode = VVIRT(.vgetcpu_mode);
+
+       . = ALIGN(CONFIG_X86_L1_CACHE_BYTES);
+       .jiffies : AT(VLOAD(.jiffies)) {
+               *(.jiffies)
+       }
+       jiffies = VVIRT(.jiffies);
+
+       .vsyscall_3 ADDR(.vsyscall_0) + 3072: AT(VLOAD(.vsyscall_3)) {
+               *(.vsyscall_3)
+       }
+
+       . = VSYSCALL_VIRT_ADDR + PAGE_SIZE;
+
+#undef VSYSCALL_ADDR
+#undef VSYSCALL_PHYS_ADDR
+#undef VSYSCALL_VIRT_ADDR
+#undef VLOAD_OFFSET
+#undef VLOAD
+#undef VVIRT_OFFSET
+#undef VVIRT
+
+#endif /* CONFIG_X86_64 */
+
+       /* init_task */
+       . = ALIGN(THREAD_SIZE);
+       .data.init_task : AT(ADDR(.data.init_task) - LOAD_OFFSET) {
+               *(.data.init_task)
+       }
+#ifdef CONFIG_X86_64
+        :data.init
+#endif
+
+       /*
+        * smp_locks might be freed after init
+        * start/end must be page aligned
+        */
+       . = ALIGN(PAGE_SIZE);
+       .smp_locks : AT(ADDR(.smp_locks) - LOAD_OFFSET) {
+               __smp_locks = .;
+               *(.smp_locks)
+               __smp_locks_end = .;
+               . = ALIGN(PAGE_SIZE);
+       }
+
+       /* Init code and data - will be freed after init */
+       . = ALIGN(PAGE_SIZE);
+       .init.text : AT(ADDR(.init.text) - LOAD_OFFSET) {
+               __init_begin = .; /* paired with __init_end */
+               _sinittext = .;
+               INIT_TEXT
+               _einittext = .;
+       }
+
+       .init.data : AT(ADDR(.init.data) - LOAD_OFFSET) {
+               INIT_DATA
+       }
+
+       . = ALIGN(16);
+       .init.setup : AT(ADDR(.init.setup) - LOAD_OFFSET) {
+               __setup_start = .;
+               *(.init.setup)
+               __setup_end = .;
+       }
+       .initcall.init : AT(ADDR(.initcall.init) - LOAD_OFFSET) {
+               __initcall_start = .;
+               INITCALLS
+               __initcall_end = .;
+       }
+
+       .con_initcall.init : AT(ADDR(.con_initcall.init) - LOAD_OFFSET) {
+               __con_initcall_start = .;
+               *(.con_initcall.init)
+               __con_initcall_end = .;
+       }
+
+       .x86_cpu_dev.init : AT(ADDR(.x86_cpu_dev.init) - LOAD_OFFSET) {
+               __x86_cpu_dev_start = .;
+               *(.x86_cpu_dev.init)
+               __x86_cpu_dev_end = .;
+       }
+
+       SECURITY_INIT
+
+       . = ALIGN(8);
+       .parainstructions : AT(ADDR(.parainstructions) - LOAD_OFFSET) {
+               __parainstructions = .;
+               *(.parainstructions)
+               __parainstructions_end = .;
+       }
+
+       . = ALIGN(8);
+       .altinstructions : AT(ADDR(.altinstructions) - LOAD_OFFSET) {
+               __alt_instructions = .;
+               *(.altinstructions)
+               __alt_instructions_end = .;
+       }
+
+       .altinstr_replacement : AT(ADDR(.altinstr_replacement) - LOAD_OFFSET) {
+               *(.altinstr_replacement)
+       }
+
+       /*
+        * .exit.text is discard at runtime, not link time, to deal with
+        *  references from .altinstructions and .eh_frame
+        */
+       .exit.text : AT(ADDR(.exit.text) - LOAD_OFFSET) {
+               EXIT_TEXT
+       }
+
+       .exit.data : AT(ADDR(.exit.data) - LOAD_OFFSET) {
+               EXIT_DATA
+       }
+
+#ifdef CONFIG_BLK_DEV_INITRD
+       . = ALIGN(PAGE_SIZE);
+       .init.ramfs : AT(ADDR(.init.ramfs) - LOAD_OFFSET) {
+               __initramfs_start = .;
+               *(.init.ramfs)
+               __initramfs_end = .;
+       }
+#endif
+
+#if defined(CONFIG_X86_64) && defined(CONFIG_SMP)
+       /*
+        * percpu offsets are zero-based on SMP.  PERCPU_VADDR() changes the
+        * output PHDR, so the next output section - __data_nosave - should
+        * start another section data.init2.  Also, pda should be at the head of
+        * percpu area.  Preallocate it and define the percpu offset symbol
+        * so that it can be accessed as a percpu variable.
+        */
+       . = ALIGN(PAGE_SIZE);
+       PERCPU_VADDR(0, :percpu)
+#else
+       PERCPU(PAGE_SIZE)
+#endif
+
+       . = ALIGN(PAGE_SIZE);
+
+       /* freed after init ends here */
+       .init.end : AT(ADDR(.init.end) - LOAD_OFFSET) {
+               __init_end = .;
+       }
+
+#ifdef CONFIG_X86_64
+       .data_nosave : AT(ADDR(.data_nosave) - LOAD_OFFSET) {
+               . = ALIGN(PAGE_SIZE);
+               __nosave_begin = .;
+               *(.data.nosave)
+               . = ALIGN(PAGE_SIZE);
+               __nosave_end = .;
+       } :data.init2
+       /* use another section data.init2, see PERCPU_VADDR() above */
+#endif
+
+       /* BSS */
+       . = ALIGN(PAGE_SIZE);
+       .bss : AT(ADDR(.bss) - LOAD_OFFSET) {
+               __bss_start = .;
+               *(.bss.page_aligned)
+               *(.bss)
+               . = ALIGN(4);
+               __bss_stop = .;
+       }
+
+       . = ALIGN(PAGE_SIZE);
+       .brk : AT(ADDR(.brk) - LOAD_OFFSET) {
+               __brk_base = .;
+               . += 64 * 1024;         /* 64k alignment slop space */
+               *(.brk_reservation)     /* areas brk users have reserved */
+               __brk_limit = .;
+       }
+
+       .end : AT(ADDR(.end) - LOAD_OFFSET) {
+               _end = .;
+       }
+
+       /* Sections to be discarded */
+       /DISCARD/ : {
+               *(.exitcall.exit)
+               *(.eh_frame)
+               *(.discard)
+       }
+
+        STABS_DEBUG
+        DWARF_DEBUG
+}
+
+
+#ifdef CONFIG_X86_32
+ASSERT((_end - LOAD_OFFSET <= KERNEL_IMAGE_SIZE),
+        "kernel image bigger than KERNEL_IMAGE_SIZE")
+#else
+/*
+ * Per-cpu symbols which need to be offset from __per_cpu_load
+ * for the boot processor.
+ */
+#define INIT_PER_CPU(x) init_per_cpu__##x = per_cpu__##x + __per_cpu_load
+INIT_PER_CPU(gdt_page);
+INIT_PER_CPU(irq_stack_union);
+
+/*
+ * Build-time check on the image size:
+ */
+ASSERT((_end - _text <= KERNEL_IMAGE_SIZE),
+       "kernel image bigger than KERNEL_IMAGE_SIZE")
+
+#ifdef CONFIG_SMP
+ASSERT((per_cpu__irq_stack_union == 0),
+        "irq_stack_union is not at start of per-cpu area");
+#endif
+
+#endif /* CONFIG_X86_32 */
+
+#ifdef CONFIG_KEXEC
+#include <asm/kexec.h>
+
+ASSERT(kexec_control_code_size <= KEXEC_CONTROL_CODE_MAX_SIZE,
+       "kexec control code size is too big")
+#endif
+
diff --git a/arch/x86/kernel/vmlinux_32.lds.S b/arch/x86/kernel/vmlinux_32.lds.S

deleted file mode 100644 (file)

index 62ad500..0000000
--- a/arch/x86/kernel/vmlinux_32.lds.S
+++ /dev/null
@@ -1,229 +0,0 @@
-/* ld script to make i386 Linux kernel
- * Written by Martin Mares <mj@atrey.karlin.mff.cuni.cz>;
- *
- * Don't define absolute symbols until and unless you know that symbol
- * value is should remain constant even if kernel image is relocated
- * at run time. Absolute symbols are not relocated. If symbol value should
- * change if kernel is relocated, make the symbol section relative and
- * put it inside the section definition.
- */
-
-#define LOAD_OFFSET __PAGE_OFFSET
-
-#include <asm-generic/vmlinux.lds.h>
-#include <asm/thread_info.h>
-#include <asm/page_types.h>
-#include <asm/cache.h>
-#include <asm/boot.h>
-
-OUTPUT_FORMAT("elf32-i386", "elf32-i386", "elf32-i386")
-OUTPUT_ARCH(i386)
-ENTRY(phys_startup_32)
-jiffies = jiffies_64;
-
-PHDRS {
-       text PT_LOAD FLAGS(5);  /* R_E */
-       data PT_LOAD FLAGS(7);  /* RWE */
-       note PT_NOTE FLAGS(0);  /* ___ */
-}
-SECTIONS
-{
-  . = LOAD_OFFSET + LOAD_PHYSICAL_ADDR;
-  phys_startup_32 = startup_32 - LOAD_OFFSET;
-
-  .text.head : AT(ADDR(.text.head) - LOAD_OFFSET) {
-       _text = .;                      /* Text and read-only data */
-       *(.text.head)
-  } :text = 0x9090
-
-  /* read-only */
-  .text : AT(ADDR(.text) - LOAD_OFFSET) {
-       . = ALIGN(PAGE_SIZE); /* not really needed, already page aligned */
-       *(.text.page_aligned)
-       TEXT_TEXT
-       SCHED_TEXT
-       LOCK_TEXT
-       KPROBES_TEXT
-       IRQENTRY_TEXT
-       *(.fixup)
-       *(.gnu.warning)
-       _etext = .;                     /* End of text section */
-  } :text = 0x9090
-
-  NOTES :text :note
-
-  . = ALIGN(16);               /* Exception table */
-  __ex_table : AT(ADDR(__ex_table) - LOAD_OFFSET) {
-       __start___ex_table = .;
-        *(__ex_table)
-       __stop___ex_table = .;
-  } :text = 0x9090
-
-  RODATA
-
-  /* writeable */
-  . = ALIGN(PAGE_SIZE);
-  .data : AT(ADDR(.data) - LOAD_OFFSET) {      /* Data */
-       DATA_DATA
-       CONSTRUCTORS
-       } :data
-
-  . = ALIGN(PAGE_SIZE);
-  .data_nosave : AT(ADDR(.data_nosave) - LOAD_OFFSET) {
-       __nosave_begin = .;
-       *(.data.nosave)
-       . = ALIGN(PAGE_SIZE);
-       __nosave_end = .;
-  }
-
-  . = ALIGN(PAGE_SIZE);
-  .data.page_aligned : AT(ADDR(.data.page_aligned) - LOAD_OFFSET) {
-       *(.data.page_aligned)
-       *(.data.idt)
-  }
-
-  . = ALIGN(32);
-  .data.cacheline_aligned : AT(ADDR(.data.cacheline_aligned) - LOAD_OFFSET) {
-       *(.data.cacheline_aligned)
-  }
-
-  /* rarely changed data like cpu maps */
-  . = ALIGN(32);
-  .data.read_mostly : AT(ADDR(.data.read_mostly) - LOAD_OFFSET) {
-       *(.data.read_mostly)
-       _edata = .;             /* End of data section */
-  }
-
-  . = ALIGN(THREAD_SIZE);      /* init_task */
-  .data.init_task : AT(ADDR(.data.init_task) - LOAD_OFFSET) {
-       *(.data.init_task)
-  }
-
-  /* might get freed after init */
-  . = ALIGN(PAGE_SIZE);
-  .smp_locks : AT(ADDR(.smp_locks) - LOAD_OFFSET) {
-       __smp_locks = .;
-       *(.smp_locks)
-       __smp_locks_end = .;
-  }
-  /* will be freed after init
-   * Following ALIGN() is required to make sure no other data falls on the
-   * same page where __smp_alt_end is pointing as that page might be freed
-   * after boot. Always make sure that ALIGN() directive is present after
-   * the section which contains __smp_alt_end.
-   */
-  . = ALIGN(PAGE_SIZE);
-
-  /* will be freed after init */
-  . = ALIGN(PAGE_SIZE);                /* Init code and data */
-  .init.text : AT(ADDR(.init.text) - LOAD_OFFSET) {
-       __init_begin = .;
-       _sinittext = .;
-       INIT_TEXT
-       _einittext = .;
-  }
-  .init.data : AT(ADDR(.init.data) - LOAD_OFFSET) {
-       INIT_DATA
-  }
-  . = ALIGN(16);
-  .init.setup : AT(ADDR(.init.setup) - LOAD_OFFSET) {
-       __setup_start = .;
-       *(.init.setup)
-       __setup_end = .;
-   }
-  .initcall.init : AT(ADDR(.initcall.init) - LOAD_OFFSET) {
-       __initcall_start = .;
-       INITCALLS
-       __initcall_end = .;
-  }
-  .con_initcall.init : AT(ADDR(.con_initcall.init) - LOAD_OFFSET) {
-       __con_initcall_start = .;
-       *(.con_initcall.init)
-       __con_initcall_end = .;
-  }
-  .x86_cpu_dev.init : AT(ADDR(.x86_cpu_dev.init) - LOAD_OFFSET) {
-       __x86_cpu_dev_start = .;
-       *(.x86_cpu_dev.init)
-       __x86_cpu_dev_end = .;
-  }
-  SECURITY_INIT
-  . = ALIGN(4);
-  .altinstructions : AT(ADDR(.altinstructions) - LOAD_OFFSET) {
-       __alt_instructions = .;
-       *(.altinstructions)
-       __alt_instructions_end = .;
-  }
-  .altinstr_replacement : AT(ADDR(.altinstr_replacement) - LOAD_OFFSET) {
-       *(.altinstr_replacement)
-  }
-  . = ALIGN(4);
-  .parainstructions : AT(ADDR(.parainstructions) - LOAD_OFFSET) {
-       __parainstructions = .;
-       *(.parainstructions)
-       __parainstructions_end = .;
-  }
-  /* .exit.text is discard at runtime, not link time, to deal with references
-     from .altinstructions and .eh_frame */
-  .exit.text : AT(ADDR(.exit.text) - LOAD_OFFSET) {
-       EXIT_TEXT
-  }
-  .exit.data : AT(ADDR(.exit.data) - LOAD_OFFSET) {
-       EXIT_DATA
-  }
-#if defined(CONFIG_BLK_DEV_INITRD)
-  . = ALIGN(PAGE_SIZE);
-  .init.ramfs : AT(ADDR(.init.ramfs) - LOAD_OFFSET) {
-       __initramfs_start = .;
-       *(.init.ramfs)
-       __initramfs_end = .;
-  }
-#endif
-  PERCPU(PAGE_SIZE)
-  . = ALIGN(PAGE_SIZE);
-  /* freed after init ends here */
-
-  .bss : AT(ADDR(.bss) - LOAD_OFFSET) {
-       __init_end = .;
-       __bss_start = .;                /* BSS */
-       *(.bss.page_aligned)
-       *(.bss)
-       . = ALIGN(4);
-       __bss_stop = .;
-  }
-
-  .brk : AT(ADDR(.brk) - LOAD_OFFSET) {
-       . = ALIGN(PAGE_SIZE);
-       __brk_base = . ;
-       . += 64 * 1024 ;        /* 64k alignment slop space */
-       *(.brk_reservation)     /* areas brk users have reserved */
-       __brk_limit = . ;
-  }
-
-  .end : AT(ADDR(.end) - LOAD_OFFSET) {
-       _end = . ;
-  }
-
-  /* Sections to be discarded */
-  /DISCARD/ : {
-       *(.exitcall.exit)
-       *(.discard)
-       }
-
-  STABS_DEBUG
-
-  DWARF_DEBUG
-}
-
-/*
- * Build-time check on the image size:
- */
-ASSERT((_end - LOAD_OFFSET <= KERNEL_IMAGE_SIZE),
-       "kernel image bigger than KERNEL_IMAGE_SIZE")
-
-#ifdef CONFIG_KEXEC
-/* Link time checks */
-#include <asm/kexec.h>
-
-ASSERT(kexec_control_code_size <= KEXEC_CONTROL_CODE_MAX_SIZE,
-       "kexec control code size is too big")
-#endif
diff --git a/arch/x86/kernel/vmlinux_64.lds.S b/arch/x86/kernel/vmlinux_64.lds.S

deleted file mode 100644 (file)

index c874250..0000000
--- a/arch/x86/kernel/vmlinux_64.lds.S
+++ /dev/null
@@ -1,298 +0,0 @@
-/* ld script to make x86-64 Linux kernel
- * Written by Martin Mares <mj@atrey.karlin.mff.cuni.cz>;
- */
-
-#define LOAD_OFFSET __START_KERNEL_map
-
-#include <asm-generic/vmlinux.lds.h>
-#include <asm/asm-offsets.h>
-#include <asm/page_types.h>
-
-#undef i386    /* in case the preprocessor is a 32bit one */
-
-OUTPUT_FORMAT("elf64-x86-64", "elf64-x86-64", "elf64-x86-64")
-OUTPUT_ARCH(i386:x86-64)
-ENTRY(phys_startup_64)
-jiffies_64 = jiffies;
-PHDRS {
-       text PT_LOAD FLAGS(5);  /* R_E */
-       data PT_LOAD FLAGS(7);  /* RWE */
-       user PT_LOAD FLAGS(7);  /* RWE */
-       data.init PT_LOAD FLAGS(7);     /* RWE */
-#ifdef CONFIG_SMP
-       percpu PT_LOAD FLAGS(7);        /* RWE */
-#endif
-       data.init2 PT_LOAD FLAGS(7);    /* RWE */
-       note PT_NOTE FLAGS(0);  /* ___ */
-}
-SECTIONS
-{
-  . = __START_KERNEL;
-  phys_startup_64 = startup_64 - LOAD_OFFSET;
-  .text :  AT(ADDR(.text) - LOAD_OFFSET) {
-       _text = .;                      /* Text and read-only data */
-       /* First the code that has to be first for bootstrapping */
-       *(.text.head)
-       _stext = .;
-       /* Then the rest */
-       TEXT_TEXT
-       SCHED_TEXT
-       LOCK_TEXT
-       KPROBES_TEXT
-       IRQENTRY_TEXT
-       *(.fixup)
-       *(.gnu.warning)
-       _etext = .;             /* End of text section */
-  } :text = 0x9090
-
-  NOTES :text :note
-
-  . = ALIGN(16);               /* Exception table */
-  __ex_table : AT(ADDR(__ex_table) - LOAD_OFFSET) {
-       __start___ex_table = .;
-        *(__ex_table)
-       __stop___ex_table = .;
-  } :text = 0x9090
-
-  RODATA
-
-  . = ALIGN(PAGE_SIZE);                /* Align data segment to page size boundary */
-                               /* Data */
-  .data : AT(ADDR(.data) - LOAD_OFFSET) {
-       DATA_DATA
-       CONSTRUCTORS
-       _edata = .;                     /* End of data section */
-       } :data
-
-
-  .data.cacheline_aligned : AT(ADDR(.data.cacheline_aligned) - LOAD_OFFSET) {
-       . = ALIGN(PAGE_SIZE);
-       . = ALIGN(CONFIG_X86_L1_CACHE_BYTES);
-       *(.data.cacheline_aligned)
-  }
-  . = ALIGN(CONFIG_X86_INTERNODE_CACHE_BYTES);
-  .data.read_mostly : AT(ADDR(.data.read_mostly) - LOAD_OFFSET) {
-       *(.data.read_mostly)
-  }
-
-#define VSYSCALL_ADDR (-10*1024*1024)
-#define VSYSCALL_PHYS_ADDR ((LOADADDR(.data.read_mostly) + SIZEOF(.data.read_mostly) + 4095) & ~(4095))
-#define VSYSCALL_VIRT_ADDR ((ADDR(.data.read_mostly) + SIZEOF(.data.read_mostly) + 4095) & ~(4095))
-
-#define VLOAD_OFFSET (VSYSCALL_ADDR - VSYSCALL_PHYS_ADDR)
-#define VLOAD(x) (ADDR(x) - VLOAD_OFFSET)
-
-#define VVIRT_OFFSET (VSYSCALL_ADDR - VSYSCALL_VIRT_ADDR)
-#define VVIRT(x) (ADDR(x) - VVIRT_OFFSET)
-
-  . = VSYSCALL_ADDR;
-  .vsyscall_0 :         AT(VSYSCALL_PHYS_ADDR) { *(.vsyscall_0) } :user
-  __vsyscall_0 = VSYSCALL_VIRT_ADDR;
-
-  . = ALIGN(CONFIG_X86_L1_CACHE_BYTES);
-  .vsyscall_fn : AT(VLOAD(.vsyscall_fn)) { *(.vsyscall_fn) }
-  . = ALIGN(CONFIG_X86_L1_CACHE_BYTES);
-  .vsyscall_gtod_data : AT(VLOAD(.vsyscall_gtod_data))
-               { *(.vsyscall_gtod_data) }
-  vsyscall_gtod_data = VVIRT(.vsyscall_gtod_data);
-  .vsyscall_clock : AT(VLOAD(.vsyscall_clock))
-               { *(.vsyscall_clock) }
-  vsyscall_clock = VVIRT(.vsyscall_clock);
-
-
-  .vsyscall_1 ADDR(.vsyscall_0) + 1024: AT(VLOAD(.vsyscall_1))
-               { *(.vsyscall_1) }
-  .vsyscall_2 ADDR(.vsyscall_0) + 2048: AT(VLOAD(.vsyscall_2))
-               { *(.vsyscall_2) }
-
-  .vgetcpu_mode : AT(VLOAD(.vgetcpu_mode)) { *(.vgetcpu_mode) }
-  vgetcpu_mode = VVIRT(.vgetcpu_mode);
-
-  . = ALIGN(CONFIG_X86_L1_CACHE_BYTES);
-  .jiffies : AT(VLOAD(.jiffies)) { *(.jiffies) }
-  jiffies = VVIRT(.jiffies);
-
-  .vsyscall_3 ADDR(.vsyscall_0) + 3072: AT(VLOAD(.vsyscall_3))
-               { *(.vsyscall_3) }
-
-  . = VSYSCALL_VIRT_ADDR + PAGE_SIZE;
-
-#undef VSYSCALL_ADDR
-#undef VSYSCALL_PHYS_ADDR
-#undef VSYSCALL_VIRT_ADDR
-#undef VLOAD_OFFSET
-#undef VLOAD
-#undef VVIRT_OFFSET
-#undef VVIRT
-
-  .data.init_task : AT(ADDR(.data.init_task) - LOAD_OFFSET) {
-       . = ALIGN(THREAD_SIZE); /* init_task */
-       *(.data.init_task)
-  }:data.init
-
-  .data.page_aligned : AT(ADDR(.data.page_aligned) - LOAD_OFFSET) {
-       . = ALIGN(PAGE_SIZE);
-       *(.data.page_aligned)
-  }
-
-  .smp_locks : AT(ADDR(.smp_locks) - LOAD_OFFSET) {
-       /* might get freed after init */
-       . = ALIGN(PAGE_SIZE);
-       __smp_alt_begin = .;
-       __smp_locks = .;
-       *(.smp_locks)
-       __smp_locks_end = .;
-       . = ALIGN(PAGE_SIZE);
-       __smp_alt_end = .;
-  }
-
-  . = ALIGN(PAGE_SIZE);                /* Init code and data */
-  __init_begin = .;    /* paired with __init_end */
-  .init.text : AT(ADDR(.init.text) - LOAD_OFFSET) {
-       _sinittext = .;
-       INIT_TEXT
-       _einittext = .;
-  }
-  .init.data : AT(ADDR(.init.data) - LOAD_OFFSET) {
-       __initdata_begin = .;
-       INIT_DATA
-       __initdata_end = .;
-   }
-
-  .init.setup : AT(ADDR(.init.setup) - LOAD_OFFSET) {
-       . = ALIGN(16);
-       __setup_start = .;
-       *(.init.setup)
-       __setup_end = .;
-  }
-  .initcall.init : AT(ADDR(.initcall.init) - LOAD_OFFSET) {
-       __initcall_start = .;
-       INITCALLS
-       __initcall_end = .;
-  }
-  .con_initcall.init : AT(ADDR(.con_initcall.init) - LOAD_OFFSET) {
-       __con_initcall_start = .;
-       *(.con_initcall.init)
-       __con_initcall_end = .;
-  }
-  .x86_cpu_dev.init : AT(ADDR(.x86_cpu_dev.init) - LOAD_OFFSET) {
-       __x86_cpu_dev_start = .;
-       *(.x86_cpu_dev.init)
-       __x86_cpu_dev_end = .;
-  }
-  SECURITY_INIT
-
-  . = ALIGN(8);
-  .parainstructions : AT(ADDR(.parainstructions) - LOAD_OFFSET) {
-       __parainstructions = .;
-       *(.parainstructions)
-       __parainstructions_end = .;
-  }
-
-  .altinstructions : AT(ADDR(.altinstructions) - LOAD_OFFSET) {
-       . = ALIGN(8);
-       __alt_instructions = .;
-       *(.altinstructions)
-       __alt_instructions_end = .;
-  }
-  .altinstr_replacement : AT(ADDR(.altinstr_replacement) - LOAD_OFFSET) {
-       *(.altinstr_replacement)
-  }
-  /* .exit.text is discard at runtime, not link time, to deal with references
-     from .altinstructions and .eh_frame */
-  .exit.text : AT(ADDR(.exit.text) - LOAD_OFFSET) {
-       EXIT_TEXT
-  }
-  .exit.data : AT(ADDR(.exit.data) - LOAD_OFFSET) {
-       EXIT_DATA
-  }
-
-#ifdef CONFIG_BLK_DEV_INITRD
-  . = ALIGN(PAGE_SIZE);
-  .init.ramfs : AT(ADDR(.init.ramfs) - LOAD_OFFSET) {
-       __initramfs_start = .;
-       *(.init.ramfs)
-       __initramfs_end = .;
-  }
-#endif
-
-#ifdef CONFIG_SMP
-  /*
-   * percpu offsets are zero-based on SMP.  PERCPU_VADDR() changes the
-   * output PHDR, so the next output section - __data_nosave - should
-   * start another section data.init2.  Also, pda should be at the head of
-   * percpu area.  Preallocate it and define the percpu offset symbol
-   * so that it can be accessed as a percpu variable.
-   */
-  . = ALIGN(PAGE_SIZE);
-  PERCPU_VADDR(0, :percpu)
-#else
-  PERCPU(PAGE_SIZE)
-#endif
-
-  . = ALIGN(PAGE_SIZE);
-  __init_end = .;
-
-  .data_nosave : AT(ADDR(.data_nosave) - LOAD_OFFSET) {
-       . = ALIGN(PAGE_SIZE);
-       __nosave_begin = .;
-       *(.data.nosave)
-       . = ALIGN(PAGE_SIZE);
-       __nosave_end = .;
-  } :data.init2 /* use another section data.init2, see PERCPU_VADDR() above */
-
-  .bss : AT(ADDR(.bss) - LOAD_OFFSET) {
-       . = ALIGN(PAGE_SIZE);
-       __bss_start = .;                /* BSS */
-       *(.bss.page_aligned)
-       *(.bss)
-       __bss_stop = .;
-  }
-
-  .brk : AT(ADDR(.brk) - LOAD_OFFSET) {
-       . = ALIGN(PAGE_SIZE);
-       __brk_base = . ;
-       . += 64 * 1024 ;        /* 64k alignment slop space */
-       *(.brk_reservation)     /* areas brk users have reserved */
-       __brk_limit = . ;
-  }
-
-  _end = . ;
-
-  /* Sections to be discarded */
-  /DISCARD/ : {
-       *(.exitcall.exit)
-       *(.eh_frame)
-       *(.discard)
-       }
-
-  STABS_DEBUG
-
-  DWARF_DEBUG
-}
-
- /*
-  * Per-cpu symbols which need to be offset from __per_cpu_load
-  * for the boot processor.
-  */
-#define INIT_PER_CPU(x) init_per_cpu__##x = per_cpu__##x + __per_cpu_load
-INIT_PER_CPU(gdt_page);
-INIT_PER_CPU(irq_stack_union);
-
-/*
- * Build-time check on the image size:
- */
-ASSERT((_end - _text <= KERNEL_IMAGE_SIZE),
-       "kernel image bigger than KERNEL_IMAGE_SIZE")
-
-#ifdef CONFIG_SMP
-ASSERT((per_cpu__irq_stack_union == 0),
-        "irq_stack_union is not at start of per-cpu area");
-#endif
-
-#ifdef CONFIG_KEXEC
-#include <asm/kexec.h>
-
-ASSERT(kexec_control_code_size <= KEXEC_CONTROL_CODE_MAX_SIZE,
-       "kexec control code size is too big")
-#endif
diff --git a/arch/x86/kernel/vsyscall_64.c b/arch/x86/kernel/vsyscall_64.c

index 44153afc9067558cef387ba3237ffcf439ca3800..25ee06a80aad3cd116292227826f9a32fa4b3f7a 100644 (file)
--- a/arch/x86/kernel/vsyscall_64.c
+++ b/arch/x86/kernel/vsyscall_64.c
@@ -132,15 +132,7 @@ static __always_inline void do_vgettimeofday(struct timeval * tv)
                         return;
                 }
  
-               /*
-                * Surround the RDTSC by barriers, to make sure it's not
-                * speculated to outside the seqlock critical section and
-                * does not cause time warps:
-                */
-               rdtsc_barrier();
                 now = vread();
-               rdtsc_barrier();
-
                 base = __vsyscall_gtod_data.clock.cycle_last;
                 mask = __vsyscall_gtod_data.clock.mask;
                 mult = __vsyscall_gtod_data.clock.mult;
diff --git a/arch/x86/lguest/boot.c b/arch/x86/lguest/boot.c

index 33a93b41739612cafe1fe3d918c39a3915f86a78..4e0c265593958fcb70a8588d942a27a7d0ac3da6 100644 (file)
--- a/arch/x86/lguest/boot.c
+++ b/arch/x86/lguest/boot.c
@@ -167,10 +167,16 @@ static void lazy_hcall3(unsigned long call,
  
  /* When lazy mode is turned off reset the per-cpu lazy mode variable and then
   * issue the do-nothing hypercall to flush any stored calls. */
-static void lguest_leave_lazy_mode(void)
+static void lguest_leave_lazy_mmu_mode(void)
  {
-       paravirt_leave_lazy(paravirt_get_lazy_mode());
         kvm_hypercall0(LHCALL_FLUSH_ASYNC);
+       paravirt_leave_lazy_mmu();
+}
+
+static void lguest_end_context_switch(struct task_struct *next)
+{
+       kvm_hypercall0(LHCALL_FLUSH_ASYNC);
+       paravirt_end_context_switch(next);
  }
  
  /*G:033
@@ -637,7 +643,7 @@ static void __init lguest_init_IRQ(void)
  
  void lguest_setup_irq(unsigned int irq)
  {
-       irq_to_desc_alloc_cpu(irq, 0);
+       irq_to_desc_alloc_node(irq, 0);
         set_irq_chip_and_handler_name(irq, &lguest_irq_controller,
                                       handle_level_irq, "level");
  }
@@ -1054,8 +1060,8 @@ __init void lguest_init(void)
         pv_cpu_ops.write_gdt_entry = lguest_write_gdt_entry;
         pv_cpu_ops.write_idt_entry = lguest_write_idt_entry;
         pv_cpu_ops.wbinvd = lguest_wbinvd;
-       pv_cpu_ops.lazy_mode.enter = paravirt_enter_lazy_cpu;
-       pv_cpu_ops.lazy_mode.leave = lguest_leave_lazy_mode;
+       pv_cpu_ops.start_context_switch = paravirt_start_context_switch;
+       pv_cpu_ops.end_context_switch = lguest_end_context_switch;
  
         /* pagetable management */
         pv_mmu_ops.write_cr3 = lguest_write_cr3;
@@ -1068,7 +1074,7 @@ __init void lguest_init(void)
         pv_mmu_ops.read_cr2 = lguest_read_cr2;
         pv_mmu_ops.read_cr3 = lguest_read_cr3;
         pv_mmu_ops.lazy_mode.enter = paravirt_enter_lazy_mmu;
-       pv_mmu_ops.lazy_mode.leave = lguest_leave_lazy_mode;
+       pv_mmu_ops.lazy_mode.leave = lguest_leave_lazy_mmu_mode;
         pv_mmu_ops.pte_update = lguest_pte_update;
         pv_mmu_ops.pte_update_defer = lguest_pte_update;
  
diff --git a/arch/x86/mm/dump_pagetables.c b/arch/x86/mm/dump_pagetables.c

index e7277cbcfb40ee1ea455fb63c4c6665656724013..a725b7f760ae4ba860e93b9af99b40931621430b 100644 (file)
--- a/arch/x86/mm/dump_pagetables.c
+++ b/arch/x86/mm/dump_pagetables.c
@@ -161,13 +161,14 @@ static void note_page(struct seq_file *m, struct pg_state *st,
                    st->current_address >= st->marker[1].start_address) {
                 const char *unit = units;
                 unsigned long delta;
+               int width = sizeof(unsigned long) * 2;
  
                 /*
                  * Now print the actual finished series
                  */
-               seq_printf(m, "0x%p-0x%p   ",
-                          (void *)st->start_address,
-                          (void *)st->current_address);
+               seq_printf(m, "0x%0*lx-0x%0*lx   ",
+                          width, st->start_address,
+                          width, st->current_address);
  
                 delta = (st->current_address - st->start_address) >> 10;
                 while (!(delta & 1023) && unit[1]) {
diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c

index a03b7279efa018850def9042339fec4af4249042..c6acc632637417c193394da4881fa19112ace761 100644 (file)
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -3,40 +3,17 @@
   *  Copyright (C) 2001, 2002 Andi Kleen, SuSE Labs.
   *  Copyright (C) 2008-2009, Red Hat Inc., Ingo Molnar
   */
-#include <linux/interrupt.h>
-#include <linux/mmiotrace.h>
-#include <linux/bootmem.h>
-#include <linux/compiler.h>
-#include <linux/highmem.h>
-#include <linux/kprobes.h>
-#include <linux/uaccess.h>
-#include <linux/vmalloc.h>
-#include <linux/vt_kern.h>
-#include <linux/signal.h>
-#include <linux/kernel.h>
-#include <linux/ptrace.h>
-#include <linux/string.h>
-#include <linux/module.h>
-#include <linux/kdebug.h>
-#include <linux/errno.h>
-#include <linux/magic.h>
-#include <linux/sched.h>
-#include <linux/types.h>
-#include <linux/init.h>
-#include <linux/mman.h>
-#include <linux/tty.h>
-#include <linux/smp.h>
-#include <linux/mm.h>
-
-#include <asm-generic/sections.h>
-
-#include <asm/tlbflush.h>
-#include <asm/pgalloc.h>
-#include <asm/segment.h>
-#include <asm/system.h>
-#include <asm/proto.h>
-#include <asm/traps.h>
-#include <asm/desc.h>
+#include <linux/magic.h>               /* STACK_END_MAGIC              */
+#include <linux/sched.h>               /* test_thread_flag(), ...      */
+#include <linux/kdebug.h>              /* oops_begin/end, ...          */
+#include <linux/module.h>              /* search_exception_table       */
+#include <linux/bootmem.h>             /* max_low_pfn                  */
+#include <linux/kprobes.h>             /* __kprobes, ...               */
+#include <linux/mmiotrace.h>           /* kmmio_handler, ...           */
+#include <linux/perf_counter.h>                /* perf_swcounter_event         */
+
+#include <asm/traps.h>                 /* dotraplinkage, ...           */
+#include <asm/pgalloc.h>               /* pgd_*(), ...                 */
  
  /*
   * Page fault error code bits:
@@ -225,12 +202,10 @@ static inline pmd_t *vmalloc_sync_one(pgd_t *pgd, unsigned long address)
         if (!pmd_present(*pmd_k))
                 return NULL;
  
-       if (!pmd_present(*pmd)) {
+       if (!pmd_present(*pmd))
                 set_pmd(pmd, *pmd_k);
-               arch_flush_lazy_mmu_mode();
-       } else {
+       else
                 BUG_ON(pmd_page(*pmd) != pmd_page(*pmd_k));
-       }
  
         return pmd_k;
  }
@@ -538,8 +513,6 @@ bad:
  static int is_errata93(struct pt_regs *regs, unsigned long address)
  {
  #ifdef CONFIG_X86_64
-       static int once;
-
         if (address != regs->ip)
                 return 0;
  
@@ -549,10 +522,7 @@ static int is_errata93(struct pt_regs *regs, unsigned long address)
         address |= 0xffffffffUL << 32;
         if ((address >= (u64)_stext && address <= (u64)_etext) ||
             (address >= MODULES_VADDR && address <= MODULES_END)) {
-               if (!once) {
-                       printk(errata93_warning);
-                       once = 1;
-               }
+               printk_once(errata93_warning);
                 regs->ip = address;
                 return 1;
         }
@@ -1044,6 +1014,8 @@ do_page_fault(struct pt_regs *regs, unsigned long error_code)
         if (unlikely(error_code & PF_RSVD))
                 pgtable_bad(regs, error_code, address);
  
+       perf_swcounter_event(PERF_COUNT_SW_PAGE_FAULTS, 1, 0, regs, address);
+
         /*
          * If we're in an interrupt, have no user context or are running
          * in an atomic region then we must not take the fault:
@@ -1137,10 +1109,15 @@ good_area:
                 return;
         }
  
-       if (fault & VM_FAULT_MAJOR)
+       if (fault & VM_FAULT_MAJOR) {
                 tsk->maj_flt++;
-       else
+               perf_swcounter_event(PERF_COUNT_SW_PAGE_FAULTS_MAJ, 1, 0,
+                                    regs, address);
+       } else {
                 tsk->min_flt++;
+               perf_swcounter_event(PERF_COUNT_SW_PAGE_FAULTS_MIN, 1, 0,
+                                    regs, address);
+       }
  
         check_v8086_mode(regs, address, tsk);
  
diff --git a/arch/x86/mm/highmem_32.c b/arch/x86/mm/highmem_32.c

index 8126e8d1a2a4a789509cb49af563b6cbb76395ae..58f621e8191955c2e02016df1d8e99bec8e1ed8f 100644 (file)
--- a/arch/x86/mm/highmem_32.c
+++ b/arch/x86/mm/highmem_32.c
@@ -44,7 +44,6 @@ void *kmap_atomic_prot(struct page *page, enum km_type type, pgprot_t prot)
         vaddr = __fix_to_virt(FIX_KMAP_BEGIN + idx);
         BUG_ON(!pte_none(*(kmap_pte-idx)));
         set_pte(kmap_pte-idx, mk_pte(page, prot));
-       arch_flush_lazy_mmu_mode();
  
         return (void *)vaddr;
  }
@@ -74,7 +73,6 @@ void kunmap_atomic(void *kvaddr, enum km_type type)
  #endif
         }
  
-       arch_flush_lazy_mmu_mode();
         pagefault_enable();
  }
  
diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c

index ae4f7b5d71040566f7b30af16e6c00f20c82c098..34c1bfb64f1ca07d80838f363b514caf07acf4db 100644 (file)
--- a/arch/x86/mm/init.c
+++ b/arch/x86/mm/init.c
@@ -1,3 +1,4 @@
+#include <linux/initrd.h>
  #include <linux/ioport.h>
  #include <linux/swap.h>
  
@@ -10,6 +11,9 @@
  #include <asm/setup.h>
  #include <asm/system.h>
  #include <asm/tlbflush.h>
+#include <asm/tlb.h>
+
+DEFINE_PER_CPU(struct mmu_gather, mmu_gathers);
  
  unsigned long __initdata e820_table_start;
  unsigned long __meminitdata e820_table_end;
@@ -23,6 +27,69 @@ int direct_gbpages
  #endif
  ;
  
+int nx_enabled;
+
+#if defined(CONFIG_X86_64) || defined(CONFIG_X86_PAE)
+static int disable_nx __cpuinitdata;
+
+/*
+ * noexec = on|off
+ *
+ * Control non-executable mappings for processes.
+ *
+ * on      Enable
+ * off     Disable
+ */
+static int __init noexec_setup(char *str)
+{
+       if (!str)
+               return -EINVAL;
+       if (!strncmp(str, "on", 2)) {
+               __supported_pte_mask |= _PAGE_NX;
+               disable_nx = 0;
+       } else if (!strncmp(str, "off", 3)) {
+               disable_nx = 1;
+               __supported_pte_mask &= ~_PAGE_NX;
+       }
+       return 0;
+}
+early_param("noexec", noexec_setup);
+#endif
+
+#ifdef CONFIG_X86_PAE
+static void __init set_nx(void)
+{
+       unsigned int v[4], l, h;
+
+       if (cpu_has_pae && (cpuid_eax(0x80000000) > 0x80000001)) {
+               cpuid(0x80000001, &v[0], &v[1], &v[2], &v[3]);
+
+               if ((v[3] & (1 << 20)) && !disable_nx) {
+                       rdmsr(MSR_EFER, l, h);
+                       l |= EFER_NX;
+                       wrmsr(MSR_EFER, l, h);
+                       nx_enabled = 1;
+                       __supported_pte_mask |= _PAGE_NX;
+               }
+       }
+}
+#else
+static inline void set_nx(void)
+{
+}
+#endif
+
+#ifdef CONFIG_X86_64
+void __cpuinit check_efer(void)
+{
+       unsigned long efer;
+
+       rdmsrl(MSR_EFER, efer);
+       if (!(efer & EFER_NX) || disable_nx)
+               __supported_pte_mask &= ~_PAGE_NX;
+}
+#endif
+
  static void __init find_early_table_space(unsigned long end, int use_pse,
                                           int use_gbpages)
  {
@@ -66,12 +133,11 @@ static void __init find_early_table_space(unsigned long end, int use_pse,
          */
  #ifdef CONFIG_X86_32
         start = 0x7000;
-       e820_table_start = find_e820_area(start, max_pfn_mapped<<PAGE_SHIFT,
-                                       tables, PAGE_SIZE);
-#else /* CONFIG_X86_64 */
+#else
         start = 0x8000;
-       e820_table_start = find_e820_area(start, end, tables, PAGE_SIZE);
  #endif
+       e820_table_start = find_e820_area(start, max_pfn_mapped<<PAGE_SHIFT,
+                                       tables, PAGE_SIZE);
         if (e820_table_start == -1UL)
                 panic("Cannot find space for the kernel page tables");
  
@@ -159,12 +225,9 @@ unsigned long __init_refok init_memory_mapping(unsigned long start,
         use_gbpages = direct_gbpages;
  #endif
  
-#ifdef CONFIG_X86_32
-#ifdef CONFIG_X86_PAE
         set_nx();
         if (nx_enabled)
                 printk(KERN_INFO "NX (Execute Disable) protection: active\n");
-#endif
  
         /* Enable PSE if available */
         if (cpu_has_pse)
@@ -175,7 +238,6 @@ unsigned long __init_refok init_memory_mapping(unsigned long start,
                 set_in_cr4(X86_CR4_PGE);
                 __supported_pte_mask |= _PAGE_GLOBAL;
         }
-#endif
  
         if (use_gbpages)
                 page_size_mask |= 1 << PG_LEVEL_1G;
diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c

index 749559ed80f5d99e1771826155ac8b27e1f2a3f2..949708d7a481ec10e9591faba7466b3e1a75f57d 100644 (file)
--- a/arch/x86/mm/init_32.c
+++ b/arch/x86/mm/init_32.c
@@ -49,12 +49,9 @@
  #include <asm/paravirt.h>
  #include <asm/setup.h>
  #include <asm/cacheflush.h>
+#include <asm/page_types.h>
  #include <asm/init.h>
  
-unsigned long max_low_pfn_mapped;
-unsigned long max_pfn_mapped;
-
-DEFINE_PER_CPU(struct mmu_gather, mmu_gathers);
  unsigned long highstart_pfn, highend_pfn;
  
  static noinline int do_test_wp_bit(void);
@@ -587,61 +584,9 @@ void zap_low_mappings(void)
         flush_tlb_all();
  }
  
-int nx_enabled;
-
  pteval_t __supported_pte_mask __read_mostly = ~(_PAGE_NX | _PAGE_GLOBAL | _PAGE_IOMAP);
  EXPORT_SYMBOL_GPL(__supported_pte_mask);
  
-#ifdef CONFIG_X86_PAE
-
-static int disable_nx __initdata;
-
-/*
- * noexec = on|off
- *
- * Control non executable mappings.
- *
- * on      Enable
- * off     Disable
- */
-static int __init noexec_setup(char *str)
-{
-       if (!str || !strcmp(str, "on")) {
-               if (cpu_has_nx) {
-                       __supported_pte_mask |= _PAGE_NX;
-                       disable_nx = 0;
-               }
-       } else {
-               if (!strcmp(str, "off")) {
-                       disable_nx = 1;
-                       __supported_pte_mask &= ~_PAGE_NX;
-               } else {
-                       return -EINVAL;
-               }
-       }
-
-       return 0;
-}
-early_param("noexec", noexec_setup);
-
-void __init set_nx(void)
-{
-       unsigned int v[4], l, h;
-
-       if (cpu_has_pae && (cpuid_eax(0x80000000) > 0x80000001)) {
-               cpuid(0x80000001, &v[0], &v[1], &v[2], &v[3]);
-
-               if ((v[3] & (1 << 20)) && !disable_nx) {
-                       rdmsr(MSR_EFER, l, h);
-                       l |= EFER_NX;
-                       wrmsr(MSR_EFER, l, h);
-                       nx_enabled = 1;
-                       __supported_pte_mask |= _PAGE_NX;
-               }
-       }
-}
-#endif
-
  /* user-defined highmem size */
  static unsigned int highmem_pages = -1;
  
@@ -761,15 +706,15 @@ void __init initmem_init(unsigned long start_pfn,
         highstart_pfn = highend_pfn = max_pfn;
         if (max_pfn > max_low_pfn)
                 highstart_pfn = max_low_pfn;
-       memory_present(0, 0, highend_pfn);
         e820_register_active_regions(0, 0, highend_pfn);
+       sparse_memory_present_with_active_regions(0);
         printk(KERN_NOTICE "%ldMB HIGHMEM available.\n",
                 pages_to_mb(highend_pfn - highstart_pfn));
         num_physpages = highend_pfn;
         high_memory = (void *) __va(highstart_pfn * PAGE_SIZE - 1) + 1;
  #else
-       memory_present(0, 0, max_low_pfn);
         e820_register_active_regions(0, 0, max_low_pfn);
+       sparse_memory_present_with_active_regions(0);
         num_physpages = max_low_pfn;
         high_memory = (void *) __va(max_low_pfn * PAGE_SIZE - 1) + 1;
  #endif
diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c

index 1753e8020df6ec8aa3eefea7342386224e595f2c..52bb9519bb86b4ec778d613939ea658adb1052a6 100644 (file)
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -50,18 +50,8 @@
  #include <asm/cacheflush.h>
  #include <asm/init.h>
  
-/*
- * end_pfn only includes RAM, while max_pfn_mapped includes all e820 entries.
- * The direct mapping extends to max_pfn_mapped, so that we can directly access
- * apertures, ACPI and other tables without having to play with fixmaps.
- */
-unsigned long max_low_pfn_mapped;
-unsigned long max_pfn_mapped;
-
  static unsigned long dma_reserve __initdata;
  
-DEFINE_PER_CPU(struct mmu_gather, mmu_gathers);
-
  static int __init parse_direct_gbpages_off(char *arg)
  {
         direct_gbpages = 0;
@@ -85,39 +75,6 @@ early_param("gbpages", parse_direct_gbpages_on);
  pteval_t __supported_pte_mask __read_mostly = ~_PAGE_IOMAP;
  EXPORT_SYMBOL_GPL(__supported_pte_mask);
  
-static int disable_nx __cpuinitdata;
-
-/*
- * noexec=on|off
- * Control non-executable mappings for 64-bit processes.
- *
- * on  Enable (default)
- * off Disable
- */
-static int __init nonx_setup(char *str)
-{
-       if (!str)
-               return -EINVAL;
-       if (!strncmp(str, "on", 2)) {
-               __supported_pte_mask |= _PAGE_NX;
-               disable_nx = 0;
-       } else if (!strncmp(str, "off", 3)) {
-               disable_nx = 1;
-               __supported_pte_mask &= ~_PAGE_NX;
-       }
-       return 0;
-}
-early_param("noexec", nonx_setup);
-
-void __cpuinit check_efer(void)
-{
-       unsigned long efer;
-
-       rdmsrl(MSR_EFER, efer);
-       if (!(efer & EFER_NX) || disable_nx)
-               __supported_pte_mask &= ~_PAGE_NX;
-}
-
  int force_personality32;
  
  /*
@@ -628,6 +585,7 @@ void __init initmem_init(unsigned long start_pfn, unsigned long end_pfn)
         early_res_to_bootmem(0, end_pfn<<PAGE_SHIFT);
         reserve_bootmem(bootmap, bootmap_size, BOOTMEM_DEFAULT);
  }
+#endif
  
  void __init paging_init(void)
  {
@@ -638,11 +596,10 @@ void __init paging_init(void)
         max_zone_pfns[ZONE_DMA32] = MAX_DMA32_PFN;
         max_zone_pfns[ZONE_NORMAL] = max_pfn;
  
-       memory_present(0, 0, max_pfn);
+       sparse_memory_present_with_active_regions(MAX_NUMNODES);
         sparse_init();
         free_area_init_nodes(max_zone_pfns);
  }
-#endif
  
  /*
   * Memory hotplug specific functions
diff --git a/arch/x86/mm/iomap_32.c b/arch/x86/mm/iomap_32.c

index 8056545e2d39f9b53ba56349db4717809d6b9f04..fe6f84ca121ee072d2be8014e2b9b7e959e5abbd 100644 (file)
--- a/arch/x86/mm/iomap_32.c
+++ b/arch/x86/mm/iomap_32.c
@@ -82,7 +82,6 @@ iounmap_atomic(void *kvaddr, enum km_type type)
         if (vaddr == __fix_to_virt(FIX_KMAP_BEGIN+idx))
                 kpte_clear_flush(kmap_pte-idx, vaddr);
  
-       arch_flush_lazy_mmu_mode();
         pagefault_enable();
  }
  EXPORT_SYMBOL_GPL(iounmap_atomic);
diff --git a/arch/x86/mm/kmmio.c b/arch/x86/mm/kmmio.c

index 50dc802a1c469b904154be3901f5fc87ebd7da93..16ccbd77917f22c1693b9b41fcb8dc7485acee39 100644 (file)
--- a/arch/x86/mm/kmmio.c
+++ b/arch/x86/mm/kmmio.c
@@ -32,7 +32,7 @@ struct kmmio_fault_page {
         struct list_head list;
         struct kmmio_fault_page *release_next;
         unsigned long page; /* location of the fault page */
-       bool old_presence; /* page presence prior to arming */
+       pteval_t old_presence; /* page presence prior to arming */
         bool armed;
  
         /*
@@ -97,60 +97,62 @@ static struct kmmio_probe *get_kmmio_probe(unsigned long addr)
  static struct kmmio_fault_page *get_kmmio_fault_page(unsigned long page)
  {
         struct list_head *head;
-       struct kmmio_fault_page *p;
+       struct kmmio_fault_page *f;
  
         page &= PAGE_MASK;
         head = kmmio_page_list(page);
-       list_for_each_entry_rcu(p, head, list) {
-               if (p->page == page)
-                       return p;
+       list_for_each_entry_rcu(f, head, list) {
+               if (f->page == page)
+                       return f;
         }
         return NULL;
  }
  
-static void set_pmd_presence(pmd_t *pmd, bool present, bool *old)
+static void clear_pmd_presence(pmd_t *pmd, bool clear, pmdval_t *old)
  {
         pmdval_t v = pmd_val(*pmd);
-       *old = !!(v & _PAGE_PRESENT);
-       v &= ~_PAGE_PRESENT;
-       if (present)
-               v |= _PAGE_PRESENT;
+       if (clear) {
+               *old = v & _PAGE_PRESENT;
+               v &= ~_PAGE_PRESENT;
+       } else  /* presume this has been called with clear==true previously */
+               v |= *old;
         set_pmd(pmd, __pmd(v));
  }
  
-static void set_pte_presence(pte_t *pte, bool present, bool *old)
+static void clear_pte_presence(pte_t *pte, bool clear, pteval_t *old)
  {
         pteval_t v = pte_val(*pte);
-       *old = !!(v & _PAGE_PRESENT);
-       v &= ~_PAGE_PRESENT;
-       if (present)
-               v |= _PAGE_PRESENT;
+       if (clear) {
+               *old = v & _PAGE_PRESENT;
+               v &= ~_PAGE_PRESENT;
+       } else  /* presume this has been called with clear==true previously */
+               v |= *old;
         set_pte_atomic(pte, __pte(v));
  }
  
-static int set_page_presence(unsigned long addr, bool present, bool *old)
+static int clear_page_presence(struct kmmio_fault_page *f, bool clear)
  {
         unsigned int level;
-       pte_t *pte = lookup_address(addr, &level);
+       pte_t *pte = lookup_address(f->page, &level);
  
         if (!pte) {
-               pr_err("kmmio: no pte for page 0x%08lx\n", addr);
+               pr_err("kmmio: no pte for page 0x%08lx\n", f->page);
                 return -1;
         }
  
         switch (level) {
         case PG_LEVEL_2M:
-               set_pmd_presence((pmd_t *)pte, present, old);
+               clear_pmd_presence((pmd_t *)pte, clear, &f->old_presence);
                 break;
         case PG_LEVEL_4K:
-               set_pte_presence(pte, present, old);
+               clear_pte_presence(pte, clear, &f->old_presence);
                 break;
         default:
                 pr_err("kmmio: unexpected page level 0x%x.\n", level);
                 return -1;
         }
  
-       __flush_tlb_one(addr);
+       __flush_tlb_one(f->page);
         return 0;
  }
  
@@ -171,9 +173,9 @@ static int arm_kmmio_fault_page(struct kmmio_fault_page *f)
         WARN_ONCE(f->armed, KERN_ERR "kmmio page already armed.\n");
         if (f->armed) {
                 pr_warning("kmmio double-arm: page 0x%08lx, ref %d, old %d\n",
-                                       f->page, f->count, f->old_presence);
+                                       f->page, f->count, !!f->old_presence);
         }
-       ret = set_page_presence(f->page, false, &f->old_presence);
+       ret = clear_page_presence(f, true);
         WARN_ONCE(ret < 0, KERN_ERR "kmmio arming 0x%08lx failed.\n", f->page);
         f->armed = true;
         return ret;
@@ -182,8 +184,7 @@ static int arm_kmmio_fault_page(struct kmmio_fault_page *f)
  /** Restore the given page to saved presence state. */
  static void disarm_kmmio_fault_page(struct kmmio_fault_page *f)
  {
-       bool tmp;
-       int ret = set_page_presence(f->page, f->old_presence, &tmp);
+       int ret = clear_page_presence(f, false);
         WARN_ONCE(ret < 0,
                         KERN_ERR "kmmio disarming 0x%08lx failed.\n", f->page);
         f->armed = false;
@@ -310,7 +311,12 @@ static int post_kmmio_handler(unsigned long condition, struct pt_regs *regs)
         struct kmmio_context *ctx = &get_cpu_var(kmmio_ctx);
  
         if (!ctx->active) {
-               pr_debug("kmmio: spurious debug trap on CPU %d.\n",
+               /*
+                * debug traps without an active context are due to either
+                * something external causing them (f.e. using a debugger while
+                * mmio tracing enabled), or erroneous behaviour
+                */
+               pr_warning("kmmio: unexpected debug trap on CPU %d.\n",
                                                         smp_processor_id());
                 goto out;
         }
@@ -439,12 +445,12 @@ static void rcu_free_kmmio_fault_pages(struct rcu_head *head)
                                                 head,
                                                 struct kmmio_delayed_release,
                                                 rcu);
-       struct kmmio_fault_page *p = dr->release_list;
-       while (p) {
-               struct kmmio_fault_page *next = p->release_next;
-               BUG_ON(p->count);
-               kfree(p);
-               p = next;
+       struct kmmio_fault_page *f = dr->release_list;
+       while (f) {
+               struct kmmio_fault_page *next = f->release_next;
+               BUG_ON(f->count);
+               kfree(f);
+               f = next;
         }
         kfree(dr);
  }
@@ -453,19 +459,19 @@ static void remove_kmmio_fault_pages(struct rcu_head *head)
  {
         struct kmmio_delayed_release *dr =
                 container_of(head, struct kmmio_delayed_release, rcu);
-       struct kmmio_fault_page *p = dr->release_list;
+       struct kmmio_fault_page *f = dr->release_list;
         struct kmmio_fault_page **prevp = &dr->release_list;
         unsigned long flags;
  
         spin_lock_irqsave(&kmmio_lock, flags);
-       while (p) {
-               if (!p->count) {
-                       list_del_rcu(&p->list);
-                       prevp = &p->release_next;
+       while (f) {
+               if (!f->count) {
+                       list_del_rcu(&f->list);
+                       prevp = &f->release_next;
                 } else {
-                       *prevp = p->release_next;
+                       *prevp = f->release_next;
                 }
-               p = p->release_next;
+               f = f->release_next;
         }
         spin_unlock_irqrestore(&kmmio_lock, flags);
  
@@ -528,8 +534,8 @@ void unregister_kmmio_probe(struct kmmio_probe *p)
  }
  EXPORT_SYMBOL(unregister_kmmio_probe);
  
-static int kmmio_die_notifier(struct notifier_block *nb, unsigned long val,
-                                                               void *args)
+static int
+kmmio_die_notifier(struct notifier_block *nb, unsigned long val, void *args)
  {
         struct die_args *arg = args;
  
@@ -544,11 +550,23 @@ static struct notifier_block nb_die = {
         .notifier_call = kmmio_die_notifier
  };
  
-static int __init init_kmmio(void)
+int kmmio_init(void)
  {
         int i;
+
         for (i = 0; i < KMMIO_PAGE_TABLE_SIZE; i++)
                 INIT_LIST_HEAD(&kmmio_page_table[i]);
+
         return register_die_notifier(&nb_die);
  }
-fs_initcall(init_kmmio); /* should be before device_initcall() */
+
+void kmmio_cleanup(void)
+{
+       int i;
+
+       unregister_die_notifier(&nb_die);
+       for (i = 0; i < KMMIO_PAGE_TABLE_SIZE; i++) {
+               WARN_ONCE(!list_empty(&kmmio_page_table[i]),
+                       KERN_ERR "kmmio_page_table not empty at cleanup, any further tracing will leak memory.\n");
+       }
+}
diff --git a/arch/x86/mm/memtest.c b/arch/x86/mm/memtest.c

index 605c8be06217b0da36344abd5f23d06326bcd87a..c0bedcd10f9733a3a9a21873b9043ead3a87546e 100644 (file)
--- a/arch/x86/mm/memtest.c
+++ b/arch/x86/mm/memtest.c
@@ -40,23 +40,23 @@ static void __init reserve_bad_mem(u64 pattern, u64 start_bad, u64 end_bad)
  
  static void __init memtest(u64 pattern, u64 start_phys, u64 size)
  {
-       u64 i, count;
-       u64 *start;
+       u64 *p;
+       void *start, *end;
         u64 start_bad, last_bad;
         u64 start_phys_aligned;
         size_t incr;
  
         incr = sizeof(pattern);
         start_phys_aligned = ALIGN(start_phys, incr);
-       count = (size - (start_phys_aligned - start_phys))/incr;
         start = __va(start_phys_aligned);
+       end = start + size - (start_phys_aligned - start_phys);
         start_bad = 0;
         last_bad = 0;
  
-       for (i = 0; i < count; i++)
-               start[i] = pattern;
-       for (i = 0; i < count; i++, start++, start_phys_aligned += incr) {
-               if (*start == pattern)
+       for (p = start; p < end; p++)
+               *p = pattern;
+       for (p = start; p < end; p++, start_phys_aligned += incr) {
+               if (*p == pattern)
                         continue;
                 if (start_phys_aligned == last_bad + incr) {
                         last_bad += incr;
diff --git a/arch/x86/mm/mmio-mod.c b/arch/x86/mm/mmio-mod.c

index c9342ed8b402dd93cc2e991ac1f44f67a0261c8d..132772a8ec57a66c629ad5719431fa6c08d95d98 100644 (file)
--- a/arch/x86/mm/mmio-mod.c
+++ b/arch/x86/mm/mmio-mod.c
@@ -451,6 +451,7 @@ void enable_mmiotrace(void)
  
         if (nommiotrace)
                 pr_info(NAME "MMIO tracing disabled.\n");
+       kmmio_init();
         enter_uniprocessor();
         spin_lock_irq(&trace_lock);
         atomic_inc(&mmiotrace_enabled);
@@ -473,6 +474,7 @@ void disable_mmiotrace(void)
  
         clear_trace_list(); /* guarantees: no more kmmio callbacks */
         leave_uniprocessor();
+       kmmio_cleanup();
         pr_info(NAME "disabled.\n");
  out:
         mutex_unlock(&mmiotrace_mutex);
diff --git a/arch/x86/mm/numa_64.c b/arch/x86/mm/numa_64.c

index 2d05a12029dc3d216814eb2a9e914b389014407a..459913beac71dc0539a4f0bd11a299df869e5711 100644 (file)
--- a/arch/x86/mm/numa_64.c
+++ b/arch/x86/mm/numa_64.c
@@ -179,18 +179,25 @@ static void * __init early_node_mem(int nodeid, unsigned long start,
  }
  
  /* Initialize bootmem allocator for a node */
-void __init setup_node_bootmem(int nodeid, unsigned long start,
-                              unsigned long end)
+void __init
+setup_node_bootmem(int nodeid, unsigned long start, unsigned long end)
  {
         unsigned long start_pfn, last_pfn, bootmap_pages, bootmap_size;
+       const int pgdat_size = roundup(sizeof(pg_data_t), PAGE_SIZE);
         unsigned long bootmap_start, nodedata_phys;
         void *bootmap;
-       const int pgdat_size = roundup(sizeof(pg_data_t), PAGE_SIZE);
         int nid;
  
         if (!end)
                 return;
  
+       /*
+        * Don't confuse VM with a node that doesn't have the
+        * minimum amount of memory:
+        */
+       if (end && (end - start) < NODE_MIN_SIZE)
+               return;
+
         start = roundup(start, ZONE_ALIGN);
  
         printk(KERN_INFO "Bootmem setup node %d %016lx-%016lx\n", nodeid,
@@ -272,9 +279,6 @@ void __init setup_node_bootmem(int nodeid, unsigned long start,
                 reserve_bootmem_node(NODE_DATA(nodeid), bootmap_start,
                                  bootmap_pages<<PAGE_SHIFT, BOOTMEM_DEFAULT);
  
-#ifdef CONFIG_ACPI_NUMA
-       srat_reserve_add_area(nodeid);
-#endif
         node_set_online(nodeid);
  }
  
@@ -578,21 +582,6 @@ unsigned long __init numa_free_all_bootmem(void)
         return pages;
  }
  
-void __init paging_init(void)
-{
-       unsigned long max_zone_pfns[MAX_NR_ZONES];
-
-       memset(max_zone_pfns, 0, sizeof(max_zone_pfns));
-       max_zone_pfns[ZONE_DMA] = MAX_DMA_PFN;
-       max_zone_pfns[ZONE_DMA32] = MAX_DMA32_PFN;
-       max_zone_pfns[ZONE_NORMAL] = max_pfn;
-
-       sparse_memory_present_with_active_regions(MAX_NUMNODES);
-       sparse_init();
-
-       free_area_init_nodes(max_zone_pfns);
-}
-
  static __init int numa_setup(char *opt)
  {
         if (!opt)
@@ -606,8 +595,6 @@ static __init int numa_setup(char *opt)
  #ifdef CONFIG_ACPI_NUMA
         if (!strncmp(opt, "noacpi", 6))
                 acpi_numa = -1;
-       if (!strncmp(opt, "hotadd=", 7))
-               hotadd_percent = simple_strtoul(opt+7, NULL, 10);
  #endif
         return 0;
  }
diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c

index e17efed088c54a7b546b7b76c073ab55231d425e..6ce9518fe2acb6457db7d6c19c1739e6f48f2c9b 100644 (file)
--- a/arch/x86/mm/pageattr.c
+++ b/arch/x86/mm/pageattr.c
@@ -839,13 +839,6 @@ static int change_page_attr_set_clr(unsigned long *addr, int numpages,
  
         vm_unmap_aliases();
  
-       /*
-        * If we're called with lazy mmu updates enabled, the
-        * in-memory pte state may be stale.  Flush pending updates to
-        * bring them up to date.
-        */
-       arch_flush_lazy_mmu_mode();
-
         cpa.vaddr = addr;
         cpa.pages = pages;
         cpa.numpages = numpages;
@@ -890,13 +883,6 @@ static int change_page_attr_set_clr(unsigned long *addr, int numpages,
         } else
                 cpa_flush_all(cache);
  
-       /*
-        * If we've been called with lazy mmu updates enabled, then
-        * make sure that everything gets flushed out before we
-        * return.
-        */
-       arch_flush_lazy_mmu_mode();
-
  out:
         return ret;
  }
diff --git a/arch/x86/mm/srat_64.c b/arch/x86/mm/srat_64.c

index 01765955baaf66922ad70a959c9875f60fcad56e..2dfcbf9df2ae8410228e25fb4f559cd75ed1982e 100644 (file)
--- a/arch/x86/mm/srat_64.c
+++ b/arch/x86/mm/srat_64.c
@@ -31,17 +31,11 @@ static nodemask_t nodes_parsed __initdata;
  static nodemask_t cpu_nodes_parsed __initdata;
  static struct bootnode nodes[MAX_NUMNODES] __initdata;
  static struct bootnode nodes_add[MAX_NUMNODES];
-static int found_add_area __initdata;
-int hotadd_percent __initdata = 0;
  
  static int num_node_memblks __initdata;
  static struct bootnode node_memblk_range[NR_NODE_MEMBLKS] __initdata;
  static int memblk_nodeid[NR_NODE_MEMBLKS] __initdata;
  
-/* Too small nodes confuse the VM badly. Usually they result
-   from BIOS bugs. */
-#define NODE_MIN_SIZE (4*1024*1024)
-
  static __init int setup_node(int pxm)
  {
         return acpi_map_pxm_to_node(pxm);
@@ -66,9 +60,6 @@ static __init void cutoff_node(int i, unsigned long start, unsigned long end)
  {
         struct bootnode *nd = &nodes[i];
  
-       if (found_add_area)
-               return;
-
         if (nd->start < start) {
                 nd->start = start;
                 if (nd->end < nd->start)
@@ -86,7 +77,6 @@ static __init void bad_srat(void)
         int i;
         printk(KERN_ERR "SRAT: SRAT not used.\n");
         acpi_numa = -1;
-       found_add_area = 0;
         for (i = 0; i < MAX_LOCAL_APIC; i++)
                 apicid_to_node[i] = NUMA_NO_NODE;
         for (i = 0; i < MAX_NUMNODES; i++)
@@ -182,24 +172,21 @@ acpi_numa_processor_affinity_init(struct acpi_srat_cpu_affinity *pa)
                pxm, apic_id, node);
  }
  
-static int update_end_of_memory(unsigned long end) {return -1;}
-static int hotadd_enough_memory(struct bootnode *nd) {return 1;}
  #ifdef CONFIG_MEMORY_HOTPLUG_SPARSE
  static inline int save_add_info(void) {return 1;}
  #else
  static inline int save_add_info(void) {return 0;}
  #endif
  /*
- * Update nodes_add and decide if to include add are in the zone.
- * Both SPARSE and RESERVE need nodes_add information.
- * This code supports one contiguous hot add area per node.
+ * Update nodes_add[]
+ * This code supports one contiguous hot add area per node
   */
-static int __init
-reserve_hotadd(int node, unsigned long start, unsigned long end)
+static void __init
+update_nodes_add(int node, unsigned long start, unsigned long end)
  {
         unsigned long s_pfn = start >> PAGE_SHIFT;
         unsigned long e_pfn = end >> PAGE_SHIFT;
-       int ret = 0, changed = 0;
+       int changed = 0;
         struct bootnode *nd = &nodes_add[node];
  
         /* I had some trouble with strange memory hotadd regions breaking
@@ -210,7 +197,7 @@ reserve_hotadd(int node, unsigned long start, unsigned long end)
            mistakes */
         if ((signed long)(end - start) < NODE_MIN_SIZE) {
                 printk(KERN_ERR "SRAT: Hotplug area too small\n");
-               return -1;
+               return;
         }
  
         /* This check might be a bit too strict, but I'm keeping it for now. */
@@ -218,12 +205,7 @@ reserve_hotadd(int node, unsigned long start, unsigned long end)
                 printk(KERN_ERR
                         "SRAT: Hotplug area %lu -> %lu has existing memory\n",
                         s_pfn, e_pfn);
-               return -1;
-       }
-
-       if (!hotadd_enough_memory(&nodes_add[node]))  {
-               printk(KERN_ERR "SRAT: Hotplug area too large\n");
-               return -1;
+               return;
         }
  
         /* Looks good */
@@ -245,11 +227,9 @@ reserve_hotadd(int node, unsigned long start, unsigned long end)
                         printk(KERN_ERR "SRAT: Hotplug zone not continuous. Partly ignored\n");
         }
  
-       ret = update_end_of_memory(nd->end);
-
         if (changed)
-               printk(KERN_INFO "SRAT: hot plug zone found %Lx - %Lx\n", nd->start, nd->end);
-       return ret;
+               printk(KERN_INFO "SRAT: hot plug zone found %Lx - %Lx\n",
+                                nd->start, nd->end);
  }
  
  /* Callback for parsing of the Proximity Domain <-> Memory Area mappings */
@@ -310,13 +290,10 @@ acpi_numa_memory_affinity_init(struct acpi_srat_mem_affinity *ma)
                start, end);
         e820_register_active_regions(node, start >> PAGE_SHIFT,
                                      end >> PAGE_SHIFT);
-       push_node_boundaries(node, nd->start >> PAGE_SHIFT,
-                                               nd->end >> PAGE_SHIFT);
  
-       if ((ma->flags & ACPI_SRAT_MEM_HOT_PLUGGABLE) &&
-           (reserve_hotadd(node, start, end) < 0)) {
-               /* Ignore hotadd region. Undo damage */
-               printk(KERN_NOTICE "SRAT: Hotplug region ignored\n");
+       if (ma->flags & ACPI_SRAT_MEM_HOT_PLUGGABLE) {
+               update_nodes_add(node, start, end);
+               /* restore nodes[node] */
                 *nd = oldnode;
                 if ((nd->start | nd->end) == 0)
                         node_clear(node, nodes_parsed);
@@ -345,9 +322,9 @@ static int __init nodes_cover_memory(const struct bootnode *nodes)
                         pxmram = 0;
         }
  
-       e820ram = max_pfn - absent_pages_in_range(0, max_pfn);
-       /* We seem to lose 3 pages somewhere. Allow a bit of slack. */
-       if ((long)(e820ram - pxmram) >= 1*1024*1024) {
+       e820ram = max_pfn - (e820_hole_size(0, max_pfn<<PAGE_SHIFT)>>PAGE_SHIFT);
+       /* We seem to lose 3 pages somewhere. Allow 1M of slack. */
+       if ((long)(e820ram - pxmram) >= (1<<(20 - PAGE_SHIFT))) {
                 printk(KERN_ERR
         "SRAT: PXMs only cover %luMB of your %luMB e820 RAM. Not used.\n",
                         (pxmram << PAGE_SHIFT) >> 20,
@@ -357,17 +334,6 @@ static int __init nodes_cover_memory(const struct bootnode *nodes)
         return 1;
  }
  
-static void __init unparse_node(int node)
-{
-       int i;
-       node_clear(node, nodes_parsed);
-       node_clear(node, cpu_nodes_parsed);
-       for (i = 0; i < MAX_LOCAL_APIC; i++) {
-               if (apicid_to_node[i] == node)
-                       apicid_to_node[i] = NUMA_NO_NODE;
-       }
-}
-
  void __init acpi_numa_arch_fixup(void) {}
  
  /* Use the information discovered above to actually set up the nodes. */
@@ -379,18 +345,8 @@ int __init acpi_scan_nodes(unsigned long start, unsigned long end)
                 return -1;
  
         /* First clean up the node list */
-       for (i = 0; i < MAX_NUMNODES; i++) {
+       for (i = 0; i < MAX_NUMNODES; i++)
                 cutoff_node(i, start, end);
-               /*
-                * don't confuse VM with a node that doesn't have the
-                * minimum memory.
-                */
-               if (nodes[i].end &&
-                       (nodes[i].end - nodes[i].start) < NODE_MIN_SIZE) {
-                       unparse_node(i);
-                       node_set_offline(i);
-               }
-       }
  
         if (!nodes_cover_memory(nodes)) {
                 bad_srat();
@@ -423,7 +379,7 @@ int __init acpi_scan_nodes(unsigned long start, unsigned long end)
  
                 if (node == NUMA_NO_NODE)
                         continue;
-               if (!node_isset(node, node_possible_map))
+               if (!node_online(node))
                         numa_clear_node(i);
         }
         numa_init_array();
@@ -510,26 +466,6 @@ static int null_slit_node_compare(int a, int b)
  }
  #endif /* CONFIG_NUMA_EMU */
  
-void __init srat_reserve_add_area(int nodeid)
-{
-       if (found_add_area && nodes_add[nodeid].end) {
-               u64 total_mb;
-
-               printk(KERN_INFO "SRAT: Reserving hot-add memory space "
-                               "for node %d at %Lx-%Lx\n",
-                       nodeid, nodes_add[nodeid].start, nodes_add[nodeid].end);
-               total_mb = (nodes_add[nodeid].end - nodes_add[nodeid].start)
-                                       >> PAGE_SHIFT;
-               total_mb *= sizeof(struct page);
-               total_mb >>= 20;
-               printk(KERN_INFO "SRAT: This will cost you %Lu MB of "
-                               "pre-allocated memory.\n", (unsigned long long)total_mb);
-               reserve_bootmem_node(NODE_DATA(nodeid), nodes_add[nodeid].start,
-                              nodes_add[nodeid].end - nodes_add[nodeid].start,
-                              BOOTMEM_DEFAULT);
-       }
-}
-
  int __node_distance(int a, int b)
  {
         int index;
diff --git a/arch/x86/oprofile/nmi_int.c b/arch/x86/oprofile/nmi_int.c

index 80b63d5db50918b840677f96e952eeed63f7b8c2..7826dfcc842823a7a893b4e2831c34864597b1c7 100644 (file)
--- a/arch/x86/oprofile/nmi_int.c
+++ b/arch/x86/oprofile/nmi_int.c
@@ -60,8 +60,9 @@ static int profile_exceptions_notify(struct notifier_block *self,
  
         switch (val) {
         case DIE_NMI:
-               if (model->check_ctrs(args->regs, &per_cpu(cpu_msrs, cpu)))
-                       ret = NOTIFY_STOP;
+       case DIE_NMI_IPI:
+               model->check_ctrs(args->regs, &per_cpu(cpu_msrs, cpu));
+               ret = NOTIFY_STOP;
                 break;
         default:
                 break;
@@ -146,7 +147,7 @@ static void nmi_cpu_setup(void *dummy)
  static struct notifier_block profile_exceptions_nb = {
         .notifier_call = profile_exceptions_notify,
         .next = NULL,
-       .priority = 0
+       .priority = 2
  };
  
  static int nmi_setup(void)
diff --git a/arch/x86/oprofile/op_model_ppro.c b/arch/x86/oprofile/op_model_ppro.c

index 0a261a5c696e5bdad2b2c28c058916fffb666b67..cd72d5c73b490c3cc4341c5f5c7eee4b1f076832 100644 (file)
--- a/arch/x86/oprofile/op_model_ppro.c
+++ b/arch/x86/oprofile/op_model_ppro.c
@@ -118,6 +118,13 @@ static int ppro_check_ctrs(struct pt_regs * const regs,
         u64 val;
         int i;
  
+       /*
+        * This can happen if perf counters are in use when
+        * we steal the die notifier NMI.
+        */
+       if (unlikely(!reset_value))
+               goto out;
+
         for (i = 0 ; i < num_counters; ++i) {
                 if (!reset_value[i])
                         continue;
@@ -128,6 +135,7 @@ static int ppro_check_ctrs(struct pt_regs * const regs,
                 wrmsrl(msrs->counters[i].addr, -reset_value[i]);
         }
  
+out:
         /* Only P6 based Pentium M need to re-unmask the apic vector but it
          * doesn't hurt other P6 variant */
         apic_write(APIC_LVTPC, apic_read(APIC_LVTPC) & ~APIC_LVT_MASKED);
diff --git a/arch/x86/oprofile/op_x86_model.h b/arch/x86/oprofile/op_x86_model.h

index fda52b4c1b95bcc90d5aae1ad3bcfcbb22236d07..505489873b9d31f95a356cc3f478c7290a79a082 100644 (file)
--- a/arch/x86/oprofile/op_x86_model.h
+++ b/arch/x86/oprofile/op_x86_model.h
@@ -13,7 +13,7 @@
  #define OP_X86_MODEL_H
  
  #include <asm/types.h>
-#include <asm/intel_arch_perfmon.h>
+#include <asm/perf_counter.h>
  
  struct op_msr {
         unsigned long   addr;
diff --git a/arch/x86/pci/irq.c b/arch/x86/pci/irq.c

index fecbce6e7d7c20d1f0af750e2305ef4e9a37a92c..0696d506c4ade99b0d43232128a77cea99d65ec8 100644 (file)
--- a/arch/x86/pci/irq.c
+++ b/arch/x86/pci/irq.c
@@ -889,6 +889,9 @@ static int pcibios_lookup_irq(struct pci_dev *dev, int assign)
                 return 0;
         }
  
+       if (io_apic_assign_pci_irqs)
+               return 0;
+
         /* Find IRQ routing entry */
  
         if (!pirq_table)
@@ -1039,56 +1042,15 @@ static void __init pcibios_fixup_irqs(void)
                 pirq_penalty[dev->irq]++;
         }
  
+       if (io_apic_assign_pci_irqs)
+               return;
+
         dev = NULL;
         while ((dev = pci_get_device(PCI_ANY_ID, PCI_ANY_ID, dev)) != NULL) {
                 pci_read_config_byte(dev, PCI_INTERRUPT_PIN, &pin);
                 if (!pin)
                         continue;
  
-#ifdef CONFIG_X86_IO_APIC
-               /*
-                * Recalculate IRQ numbers if we use the I/O APIC.
-                */
-               if (io_apic_assign_pci_irqs) {
-                       int irq;
-
-                       /*
-                        * interrupt pins are numbered starting from 1
-                        */
-                       irq = IO_APIC_get_PCI_irq_vector(dev->bus->number,
-                               PCI_SLOT(dev->devfn), pin - 1);
-                       /*
-                        * Busses behind bridges are typically not listed in the
-                        * MP-table.  In this case we have to look up the IRQ
-                        * based on the parent bus, parent slot, and pin number.
-                        * The SMP code detects such bridged busses itself so we
-                        * should get into this branch reliably.
-                        */
-                       if (irq < 0 && dev->bus->parent) {
-                               /* go back to the bridge */
-                               struct pci_dev *bridge = dev->bus->self;
-                               int bus;
-
-                               pin = pci_swizzle_interrupt_pin(dev, pin);
-                               bus = bridge->bus->number;
-                               irq = IO_APIC_get_PCI_irq_vector(bus,
-                                               PCI_SLOT(bridge->devfn), pin - 1);
-                               if (irq >= 0)
-                                       dev_warn(&dev->dev,
-                                               "using bridge %s INT %c to "
-                                                       "get IRQ %d\n",
-                                                pci_name(bridge),
-                                                'A' + pin - 1, irq);
-                       }
-                       if (irq >= 0) {
-                               dev_info(&dev->dev,
-                                       "PCI->APIC IRQ transform: INT %c "
-                                               "-> IRQ %d\n",
-                                       'A' + pin - 1, irq);
-                               dev->irq = irq;
-                       }
-               }
-#endif
                 /*
                  * Still no IRQ? Try to lookup one...
                  */
@@ -1183,6 +1145,19 @@ int __init pcibios_irq_init(void)
         pcibios_enable_irq = pirq_enable_irq;
  
         pcibios_fixup_irqs();
+
+       if (io_apic_assign_pci_irqs && pci_routeirq) {
+               struct pci_dev *dev = NULL;
+               /*
+                * PCI IRQ routing is set up by pci_enable_device(), but we
+                * also do it here in case there are still broken drivers that
+                * don't use pci_enable_device().
+                */
+               printk(KERN_INFO "PCI: Routing PCI interrupts for all devices because \"pci=routeirq\" specified\n");
+               for_each_pci_dev(dev)
+                       pirq_enable_irq(dev);
+       }
+
         return 0;
  }
  
@@ -1213,16 +1188,23 @@ void pcibios_penalize_isa_irq(int irq, int active)
  static int pirq_enable_irq(struct pci_dev *dev)
  {
         u8 pin;
-       struct pci_dev *temp_dev;
  
         pci_read_config_byte(dev, PCI_INTERRUPT_PIN, &pin);
-       if (pin && !pcibios_lookup_irq(dev, 1) && !dev->irq) {
+       if (pin && !pcibios_lookup_irq(dev, 1)) {
                 char *msg = "";
  
+               if (!io_apic_assign_pci_irqs && dev->irq)
+                       return 0;
+
                 if (io_apic_assign_pci_irqs) {
+#ifdef CONFIG_X86_IO_APIC
+                       struct pci_dev *temp_dev;
                         int irq;
+                       struct io_apic_irq_attr irq_attr;
  
-                       irq = IO_APIC_get_PCI_irq_vector(dev->bus->number, PCI_SLOT(dev->devfn), pin - 1);
+                       irq = IO_APIC_get_PCI_irq_vector(dev->bus->number,
+                                               PCI_SLOT(dev->devfn),
+                                               pin - 1, &irq_attr);
                         /*
                          * Busses behind bridges are typically not listed in the MP-table.
                          * In this case we have to look up the IRQ based on the parent bus,
@@ -1235,7 +1217,8 @@ static int pirq_enable_irq(struct pci_dev *dev)
  
                                 pin = pci_swizzle_interrupt_pin(dev, pin);
                                 irq = IO_APIC_get_PCI_irq_vector(bridge->bus->number,
-                                               PCI_SLOT(bridge->devfn), pin - 1);
+                                               PCI_SLOT(bridge->devfn),
+                                               pin - 1, &irq_attr);
                                 if (irq >= 0)
                                         dev_warn(&dev->dev, "using bridge %s "
                                                  "INT %c to get IRQ %d\n",
@@ -1245,12 +1228,15 @@ static int pirq_enable_irq(struct pci_dev *dev)
                         }
                         dev = temp_dev;
                         if (irq >= 0) {
+                               io_apic_set_pci_routing(&dev->dev, irq,
+                                                        &irq_attr);
+                               dev->irq = irq;
                                 dev_info(&dev->dev, "PCI->APIC IRQ transform: "
                                          "INT %c -> IRQ %d\n", 'A' + pin - 1, irq);
-                               dev->irq = irq;
                                 return 0;
                         } else
                                 msg = "; probably buggy MP table";
+#endif
                 } else if (pci_probe & PCI_BIOS_IRQ_SCAN)
                         msg = "";
                 else
diff --git a/arch/x86/vdso/vdso32-setup.c b/arch/x86/vdso/vdso32-setup.c

index 1241f118ab561b181b5dd9634f5c80d7783de493..58bc00f68b12edb451d3f619db81003925f85f47 100644 (file)
--- a/arch/x86/vdso/vdso32-setup.c
+++ b/arch/x86/vdso/vdso32-setup.c
@@ -338,6 +338,8 @@ int arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp)
                 }
         }
  
+       current->mm->context.vdso = (void *)addr;
+
         if (compat_uses_vma || !compat) {
                 /*
                  * MAYWRITE to allow gdb to COW and set breakpoints
@@ -358,11 +360,13 @@ int arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp)
                         goto up_fail;
         }
  
-       current->mm->context.vdso = (void *)addr;
         current_thread_info()->sysenter_return =
                 VDSO32_SYMBOL(addr, SYSENTER_RETURN);
  
    up_fail:
+       if (ret)
+               current->mm->context.vdso = NULL;
+
         up_write(&mm->mmap_sem);
  
         return ret;
diff --git a/arch/x86/vdso/vma.c b/arch/x86/vdso/vma.c

index 7133cdf9098b7ef0366871a92c29ef8aa5445209..21e1aeb9f3ea1b1f839445dbae69ad461ddf188f 100644 (file)
--- a/arch/x86/vdso/vma.c
+++ b/arch/x86/vdso/vma.c
@@ -8,6 +8,7 @@
  #include <linux/sched.h>
  #include <linux/init.h>
  #include <linux/random.h>
+#include <linux/elf.h>
  #include <asm/vsyscall.h>
  #include <asm/vgtod.h>
  #include <asm/proto.h>
@@ -115,15 +116,18 @@ int arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp)
                 goto up_fail;
         }
  
+       current->mm->context.vdso = (void *)addr;
+
         ret = install_special_mapping(mm, addr, vdso_size,
                                       VM_READ|VM_EXEC|
                                       VM_MAYREAD|VM_MAYWRITE|VM_MAYEXEC|
                                       VM_ALWAYSDUMP,
                                       vdso_pages);
-       if (ret)
+       if (ret) {
+               current->mm->context.vdso = NULL;
                 goto up_fail;
+       }
  
-       current->mm->context.vdso = (void *)addr;
  up_fail:
         up_write(&mm->mmap_sem);
         return ret;
diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c

index f09e8c36ee805d58ba0d6580397305744446cc42..0a1700a2be9c8c9a548822ca4deef1bb475b4059 100644 (file)
--- a/arch/x86/xen/enlighten.c
+++ b/arch/x86/xen/enlighten.c
@@ -20,6 +20,7 @@
  #include <linux/delay.h>
  #include <linux/start_kernel.h>
  #include <linux/sched.h>
+#include <linux/kprobes.h>
  #include <linux/bootmem.h>
  #include <linux/module.h>
  #include <linux/mm.h>
@@ -44,6 +45,7 @@
  #include <asm/processor.h>
  #include <asm/proto.h>
  #include <asm/msr-index.h>
+#include <asm/traps.h>
  #include <asm/setup.h>
  #include <asm/desc.h>
  #include <asm/pgtable.h>
@@ -240,10 +242,10 @@ static unsigned long xen_get_debugreg(int reg)
         return HYPERVISOR_get_debugreg(reg);
  }
  
-void xen_leave_lazy(void)
+static void xen_end_context_switch(struct task_struct *next)
  {
-       paravirt_leave_lazy(paravirt_get_lazy_mode());
         xen_mc_flush();
+       paravirt_end_context_switch(next);
  }
  
  static unsigned long xen_store_tr(void)
@@ -428,11 +430,44 @@ static void xen_write_ldt_entry(struct desc_struct *dt, int entrynum,
  static int cvt_gate_to_trap(int vector, const gate_desc *val,
                             struct trap_info *info)
  {
+       unsigned long addr;
+
         if (val->type != GATE_TRAP && val->type != GATE_INTERRUPT)
                 return 0;
  
         info->vector = vector;
-       info->address = gate_offset(*val);
+
+       addr = gate_offset(*val);
+#ifdef CONFIG_X86_64
+       /*
+        * Look for known traps using IST, and substitute them
+        * appropriately.  The debugger ones are the only ones we care
+        * about.  Xen will handle faults like double_fault and
+        * machine_check, so we should never see them.  Warn if
+        * there's an unexpected IST-using fault handler.
+        */
+       if (addr == (unsigned long)debug)
+               addr = (unsigned long)xen_debug;
+       else if (addr == (unsigned long)int3)
+               addr = (unsigned long)xen_int3;
+       else if (addr == (unsigned long)stack_segment)
+               addr = (unsigned long)xen_stack_segment;
+       else if (addr == (unsigned long)double_fault ||
+                addr == (unsigned long)nmi) {
+               /* Don't need to handle these */
+               return 0;
+#ifdef CONFIG_X86_MCE
+       } else if (addr == (unsigned long)machine_check) {
+               return 0;
+#endif
+       } else {
+               /* Some other trap using IST? */
+               if (WARN_ON(val->ist != 0))
+                       return 0;
+       }
+#endif /* CONFIG_X86_64 */
+       info->address = addr;
+
         info->cs = gate_segment(*val);
         info->flags = val->dpl;
         /* interrupt gates clear IF */
@@ -623,10 +658,26 @@ static void xen_clts(void)
         xen_mc_issue(PARAVIRT_LAZY_CPU);
  }
  
+static DEFINE_PER_CPU(unsigned long, xen_cr0_value);
+
+static unsigned long xen_read_cr0(void)
+{
+       unsigned long cr0 = percpu_read(xen_cr0_value);
+
+       if (unlikely(cr0 == 0)) {
+               cr0 = native_read_cr0();
+               percpu_write(xen_cr0_value, cr0);
+       }
+
+       return cr0;
+}
+
  static void xen_write_cr0(unsigned long cr0)
  {
         struct multicall_space mcs;
  
+       percpu_write(xen_cr0_value, cr0);
+
         /* Only pay attention to cr0.TS; everything else is
            ignored. */
         mcs = xen_mc_entry(0);
@@ -812,7 +863,7 @@ static const struct pv_cpu_ops xen_cpu_ops __initdata = {
  
         .clts = xen_clts,
  
-       .read_cr0 = native_read_cr0,
+       .read_cr0 = xen_read_cr0,
         .write_cr0 = xen_write_cr0,
  
         .read_cr4 = native_read_cr4,
@@ -860,10 +911,8 @@ static const struct pv_cpu_ops xen_cpu_ops __initdata = {
         /* Xen takes care of %gs when switching to usermode for us */
         .swapgs = paravirt_nop,
  
-       .lazy_mode = {
-               .enter = paravirt_enter_lazy_cpu,
-               .leave = xen_leave_lazy,
-       },
+       .start_context_switch = paravirt_start_context_switch,
+       .end_context_switch = xen_end_context_switch,
  };
  
  static const struct pv_apic_ops xen_apic_ops __initdata = {
diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c

index fba55b1a40217f93dad87242d3d7584a5d9665b2..4ceb28581652ef0ab7ff7bcc5426dd914954d100 100644 (file)
--- a/arch/x86/xen/mmu.c
+++ b/arch/x86/xen/mmu.c
@@ -452,10 +452,6 @@ void set_pte_mfn(unsigned long vaddr, unsigned long mfn, pgprot_t flags)
  void xen_set_pte_at(struct mm_struct *mm, unsigned long addr,
                     pte_t *ptep, pte_t pteval)
  {
-       /* updates to init_mm may be done without lock */
-       if (mm == &init_mm)
-               preempt_disable();
-
         ADD_STATS(set_pte_at, 1);
  //     ADD_STATS(set_pte_at_pinned, xen_page_pinned(ptep));
         ADD_STATS(set_pte_at_current, mm == current->mm);
@@ -476,9 +472,7 @@ void xen_set_pte_at(struct mm_struct *mm, unsigned long addr,
         }
         xen_set_pte(ptep, pteval);
  
-out:
-       if (mm == &init_mm)
-               preempt_enable();
+out:   return;
  }
  
  pte_t xen_ptep_modify_prot_start(struct mm_struct *mm,
@@ -1152,10 +1146,8 @@ static void drop_other_mm_ref(void *info)
  
         /* If this cpu still has a stale cr3 reference, then make sure
            it has been flushed. */
-       if (percpu_read(xen_current_cr3) == __pa(mm->pgd)) {
+       if (percpu_read(xen_current_cr3) == __pa(mm->pgd))
                 load_cr3(swapper_pg_dir);
-               arch_flush_lazy_cpu_mode();
-       }
  }
  
  static void xen_drop_mm_ref(struct mm_struct *mm)
@@ -1168,7 +1160,6 @@ static void xen_drop_mm_ref(struct mm_struct *mm)
                         load_cr3(swapper_pg_dir);
                 else
                         leave_mm(smp_processor_id());
-               arch_flush_lazy_cpu_mode();
         }
  
         /* Get the "official" set of cpus referring to our pagetable. */
@@ -1876,6 +1867,14 @@ __init void xen_post_allocator_init(void)
         xen_mark_init_mm_pinned();
  }
  
+static void xen_leave_lazy_mmu(void)
+{
+       preempt_disable();
+       xen_mc_flush();
+       paravirt_leave_lazy_mmu();
+       preempt_enable();
+}
+
  const struct pv_mmu_ops xen_mmu_ops __initdata = {
         .pagetable_setup_start = xen_pagetable_setup_start,
         .pagetable_setup_done = xen_pagetable_setup_done,
@@ -1949,7 +1948,7 @@ const struct pv_mmu_ops xen_mmu_ops __initdata = {
  
         .lazy_mode = {
                 .enter = paravirt_enter_lazy_mmu,
-               .leave = xen_leave_lazy,
+               .leave = xen_leave_lazy_mmu,
         },
  
         .set_fixmap = xen_set_fixmap,
diff --git a/arch/x86/xen/setup.c b/arch/x86/xen/setup.c

index 15c6c68db6a25f4941d25e8fc35857a3d518dc5d..ad0047f47cd476004c99485877f6d62a7db20d86 100644 (file)
--- a/arch/x86/xen/setup.c
+++ b/arch/x86/xen/setup.c
@@ -61,9 +61,9 @@ char * __init xen_memory_setup(void)
          *  - xen_start_info
          * See comment above "struct start_info" in <xen/interface/xen.h>
          */
-       e820_add_region(__pa(xen_start_info->mfn_list),
-                       xen_start_info->pt_base - xen_start_info->mfn_list,
-                       E820_RESERVED);
+       reserve_early(__pa(xen_start_info->mfn_list),
+                     __pa(xen_start_info->pt_base),
+                       "XEN START INFO");
  
         sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &e820.nr_map);
  
diff --git a/arch/x86/xen/xen-ops.h b/arch/x86/xen/xen-ops.h

index ca6596b05d533c25f56e242409e88471a816ba9c..22494fd4c9b5cf49f8b3af7b2999626dc750b80c 100644 (file)
--- a/arch/x86/xen/xen-ops.h
+++ b/arch/x86/xen/xen-ops.h
@@ -30,7 +30,6 @@ pgd_t *xen_setup_kernel_pagetable(pgd_t *pgd, unsigned long max_pfn);
  void xen_ident_map_ISA(void);
  void xen_reserve_top(void);
  
-void xen_leave_lazy(void);
  void xen_post_allocator_init(void);
  
  char * __init xen_memory_setup(void);
diff --git a/block/blk-core.c b/block/blk-core.c

index c89883be87379d9454ab1af7cd68319e92795597..9475bf99b891c20fb080c7508ed01046dbc21501 100644 (file)
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -28,22 +28,14 @@
  #include <linux/task_io_accounting_ops.h>
  #include <linux/blktrace_api.h>
  #include <linux/fault-inject.h>
-#include <trace/block.h>
+
+#define CREATE_TRACE_POINTS
+#include <trace/events/block.h>
  
  #include "blk.h"
  
-DEFINE_TRACE(block_plug);
-DEFINE_TRACE(block_unplug_io);
-DEFINE_TRACE(block_unplug_timer);
-DEFINE_TRACE(block_getrq);
-DEFINE_TRACE(block_sleeprq);
-DEFINE_TRACE(block_rq_requeue);
-DEFINE_TRACE(block_bio_backmerge);
-DEFINE_TRACE(block_bio_frontmerge);
-DEFINE_TRACE(block_bio_queue);
-DEFINE_TRACE(block_rq_complete);
-DEFINE_TRACE(block_remap);     /* Also used in drivers/md/dm.c */
  EXPORT_TRACEPOINT_SYMBOL_GPL(block_remap);
+EXPORT_TRACEPOINT_SYMBOL_GPL(block_bio_complete);
  
  static int __make_request(struct request_queue *q, struct bio *bio);
  
@@ -1277,7 +1269,7 @@ static inline void blk_partition_remap(struct bio *bio)
                 bio->bi_bdev = bdev->bd_contains;
  
                 trace_block_remap(bdev_get_queue(bio->bi_bdev), bio,
-                                   bdev->bd_dev, bio->bi_sector,
+                                   bdev->bd_dev,
                                     bio->bi_sector - p->start_sect);
         }
  }
@@ -1446,8 +1438,7 @@ static inline void __generic_make_request(struct bio *bio)
                         goto end_io;
  
                 if (old_sector != -1)
-                       trace_block_remap(q, bio, old_dev, bio->bi_sector,
-                                           old_sector);
+                       trace_block_remap(q, bio, old_dev, old_sector);
  
                 trace_block_bio_queue(q, bio);
  
diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c

index 3ff9bba3379a84891ddbc97450fcdbddf6e42a1a..26f9ec28f56c7f3b6d87a3ed207bf0722d9af572 100644 (file)
--- a/block/blk-sysfs.c
+++ b/block/blk-sysfs.c
@@ -383,16 +383,21 @@ struct kobj_type blk_queue_ktype = {
  int blk_register_queue(struct gendisk *disk)
  {
         int ret;
+       struct device *dev = disk_to_dev(disk);
  
         struct request_queue *q = disk->queue;
  
         if (WARN_ON(!q))
                 return -ENXIO;
  
+       ret = blk_trace_init_sysfs(dev);
+       if (ret)
+               return ret;
+
         if (!q->request_fn)
                 return 0;
  
-       ret = kobject_add(&q->kobj, kobject_get(&disk_to_dev(disk)->kobj),
+       ret = kobject_add(&q->kobj, kobject_get(&dev->kobj),
                           "%s", "queue");
         if (ret < 0)
                 return ret;
diff --git a/block/compat_ioctl.c b/block/compat_ioctl.c

index f87615dea46bbd9dfcdf0789f170c2b9b8512f6a..f8c218cd08e193d206c8b40dc18750ac1bdaa804 100644 (file)
--- a/block/compat_ioctl.c
+++ b/block/compat_ioctl.c
@@ -568,7 +568,7 @@ static int compat_blk_trace_setup(struct block_device *bdev, char __user *arg)
         memcpy(&buts.name, &cbuts.name, 32);
  
         mutex_lock(&bdev->bd_mutex);
-       ret = do_blk_trace_setup(q, b, bdev->bd_dev, &buts);
+       ret = do_blk_trace_setup(q, b, bdev->bd_dev, bdev, &buts);
         mutex_unlock(&bdev->bd_mutex);
         if (ret)
                 return ret;
diff --git a/block/elevator.c b/block/elevator.c

index 7073a9072577cdf3a0ae6e63c5ca247c2f493a5d..e220f0c543e3c94bcd8907df4e9ac676ce57c41b 100644 (file)
--- a/block/elevator.c
+++ b/block/elevator.c
@@ -33,17 +33,16 @@
  #include <linux/compiler.h>
  #include <linux/delay.h>
  #include <linux/blktrace_api.h>
-#include <trace/block.h>
  #include <linux/hash.h>
  #include <linux/uaccess.h>
  
+#include <trace/events/block.h>
+
  #include "blk.h"
  
  static DEFINE_SPINLOCK(elv_list_lock);
  static LIST_HEAD(elv_list);
  
-DEFINE_TRACE(block_rq_abort);
-
  /*
   * Merge hash stuff.
   */
@@ -55,9 +54,6 @@ static const int elv_hash_shift = 6;
  #define rq_hash_key(rq)                ((rq)->sector + (rq)->nr_sectors)
  #define ELV_ON_HASH(rq)                (!hlist_unhashed(&(rq)->hash))
  
-DEFINE_TRACE(block_rq_insert);
-DEFINE_TRACE(block_rq_issue);
-
  /*
   * Query io scheduler to see if the current process issuing bio may be
   * merged with rq.
diff --git a/drivers/acpi/pci_irq.c b/drivers/acpi/pci_irq.c

index 51b9f8280f887cfd6649a39446e7e081d52fe3b6..2faa9e2ac89331b9c46c1de0de560a0225ce88e2 100644 (file)
--- a/drivers/acpi/pci_irq.c
+++ b/drivers/acpi/pci_irq.c
@@ -401,7 +401,8 @@ int acpi_pci_irq_enable(struct pci_dev *dev)
                 /* Interrupt Line values above 0xF are forbidden */
                 if (dev->irq > 0 && (dev->irq <= 0xF)) {
                         printk(" - using IRQ %d\n", dev->irq);
-                       acpi_register_gsi(dev->irq, ACPI_LEVEL_SENSITIVE,
+                       acpi_register_gsi(&dev->dev, dev->irq,
+                                         ACPI_LEVEL_SENSITIVE,
                                           ACPI_ACTIVE_LOW);
                         return 0;
                 } else {
@@ -410,7 +411,7 @@ int acpi_pci_irq_enable(struct pci_dev *dev)
                 }
         }
  
-       rc = acpi_register_gsi(gsi, triggering, polarity);
+       rc = acpi_register_gsi(&dev->dev, gsi, triggering, polarity);
         if (rc < 0) {
                 dev_warn(&dev->dev, "PCI INT %c: failed to register GSI\n",
                          pin_name(pin));
diff --git a/drivers/char/hpet.c b/drivers/char/hpet.c

index 340ba4f9dc54880bd6a3f7661ebba93d9c81733b..4a9f3492b9216142333f3fa2013542b4638b9c1e 100644 (file)
--- a/drivers/char/hpet.c
+++ b/drivers/char/hpet.c
@@ -224,7 +224,7 @@ static void hpet_timer_set_irq(struct hpet_dev *devp)
                         break;
                 }
  
-               gsi = acpi_register_gsi(irq, ACPI_LEVEL_SENSITIVE,
+               gsi = acpi_register_gsi(NULL, irq, ACPI_LEVEL_SENSITIVE,
                                         ACPI_ACTIVE_LOW);
                 if (gsi > 0)
                         break;
@@ -939,7 +939,7 @@ static acpi_status hpet_resources(struct acpi_resource *res, void *data)
                 irqp = &res->data.extended_irq;
  
                 for (i = 0; i < irqp->interrupt_count; i++) {
-                       irq = acpi_register_gsi(irqp->interrupts[i],
+                       irq = acpi_register_gsi(NULL, irqp->interrupts[i],
                                       irqp->triggering, irqp->polarity);
                         if (irq < 0)
                                 return AE_ERROR;
diff --git a/drivers/char/mem.c b/drivers/char/mem.c

index 65e12bca657cfa0ba14f842ce6b442411f68bca6..f96d0bef855e3cda2507c698eb5cfdef85567c10 100644 (file)
--- a/drivers/char/mem.c
+++ b/drivers/char/mem.c
@@ -694,9 +694,8 @@ static ssize_t read_zero(struct file * file, char __user * buf,
                 written += chunk - unwritten;
                 if (unwritten)
                         break;
-               /* Consider changing this to just 'signal_pending()' with lots of testing */
-               if (fatal_signal_pending(current))
-                       return written ? written : -EINTR;
+               if (signal_pending(current))
+                       return written ? written : -ERESTARTSYS;
                 buf += chunk;
                 count -= chunk;
                 cond_resched();
diff --git a/drivers/char/sysrq.c b/drivers/char/sysrq.c

index d6a807f4077d0453c3825d01fb1b3403f513aece..39a05b5fa9cb72215da7fc1dbf85678d2b91ee1c 100644 (file)
--- a/drivers/char/sysrq.c
+++ b/drivers/char/sysrq.c
@@ -25,6 +25,7 @@
  #include <linux/kbd_kern.h>
  #include <linux/proc_fs.h>
  #include <linux/quotaops.h>
+#include <linux/perf_counter.h>
  #include <linux/kernel.h>
  #include <linux/module.h>
  #include <linux/suspend.h>
@@ -243,6 +244,7 @@ static void sysrq_handle_showregs(int key, struct tty_struct *tty)
         struct pt_regs *regs = get_irq_regs();
         if (regs)
                 show_regs(regs);
+       perf_counter_print_debug();
  }
  static struct sysrq_key_op sysrq_showregs_op = {
         .handler        = sysrq_handle_showregs,
diff --git a/drivers/md/dm.c b/drivers/md/dm.c

index 424f7b048c304e8f2a4c7da7e325abbbd1859b53..3fd8b1e65483da53070f0fe633e82843ac1a5ba0 100644 (file)
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -20,7 +20,8 @@
  #include <linux/idr.h>
  #include <linux/hdreg.h>
  #include <linux/blktrace_api.h>
-#include <trace/block.h>
+
+#include <trace/events/block.h>
  
  #define DM_MSG_PREFIX "core"
  
@@ -53,8 +54,6 @@ struct dm_target_io {
         union map_info info;
  };
  
-DEFINE_TRACE(block_bio_complete);
-
  /*
   * For request-based dm.
   * One of these is allocated per request.
@@ -656,8 +655,7 @@ static void __map_bio(struct dm_target *ti, struct bio *clone,
                 /* the bio has been remapped so dispatch it */
  
                 trace_block_remap(bdev_get_queue(clone->bi_bdev), clone,
-                                   tio->io->bio->bi_bdev->bd_dev,
-                                   clone->bi_sector, sector);
+                                   tio->io->bio->bi_bdev->bd_dev, sector);
  
                 generic_make_request(clone);
         } else if (r < 0 || r == DM_MAPIO_REQUEUE) {
diff --git a/drivers/parisc/iosapic.c b/drivers/parisc/iosapic.c

index 73348c4047e98d249a9a576b87f2e8d44edaf623..4a9cc92d4d1867469dcae3efed4c468b251ff7f8 100644 (file)
--- a/drivers/parisc/iosapic.c
+++ b/drivers/parisc/iosapic.c
@@ -702,7 +702,7 @@ static unsigned int iosapic_startup_irq(unsigned int irq)
  }
  
  #ifdef CONFIG_SMP
-static void iosapic_set_affinity_irq(unsigned int irq,
+static int iosapic_set_affinity_irq(unsigned int irq,
                                      const struct cpumask *dest)
  {
         struct vector_info *vi = iosapic_get_vector(irq);
@@ -712,7 +712,7 @@ static void iosapic_set_affinity_irq(unsigned int irq,
  
         dest_cpu = cpu_check_affinity(irq, dest);
         if (dest_cpu < 0)
-               return;
+               return -1;
  
         cpumask_copy(irq_desc[irq].affinity, cpumask_of(dest_cpu));
         vi->txn_addr = txn_affinity_addr(irq, dest_cpu);
@@ -724,6 +724,8 @@ static void iosapic_set_affinity_irq(unsigned int irq,
         iosapic_set_irt_data(vi, &dummy_d0, &d1);
         iosapic_wr_irt_entry(vi, d0, d1);
         spin_unlock_irqrestore(&iosapic_lock, flags);
+
+       return 0;
  }
  #endif
  
diff --git a/drivers/pci/hotplug/ibmphp_core.c b/drivers/pci/hotplug/ibmphp_core.c

index dd18f857dfb042d8d5802bfbbab57c617054b3f4..42e4260c3b12311e3793102b76ec136f5f94352e 100644 (file)
--- a/drivers/pci/hotplug/ibmphp_core.c
+++ b/drivers/pci/hotplug/ibmphp_core.c
@@ -153,45 +153,47 @@ int ibmphp_init_devno(struct slot **cur_slot)
                 return -1;
         }
         for (loop = 0; loop < len; loop++) {
-               if ((*cur_slot)->number == rtable->slots[loop].slot) {
-               if ((*cur_slot)->bus == rtable->slots[loop].bus) {
+               if ((*cur_slot)->number == rtable->slots[loop].slot &&
+                   (*cur_slot)->bus == rtable->slots[loop].bus) {
+                       struct io_apic_irq_attr irq_attr;
+
                         (*cur_slot)->device = PCI_SLOT(rtable->slots[loop].devfn);
                         for (i = 0; i < 4; i++)
                                 (*cur_slot)->irq[i] = IO_APIC_get_PCI_irq_vector((int) (*cur_slot)->bus,
-                                               (int) (*cur_slot)->device, i);
-
-                               debug("(*cur_slot)->irq[0] = %x\n",
-                                               (*cur_slot)->irq[0]);
-                               debug("(*cur_slot)->irq[1] = %x\n",
-                                               (*cur_slot)->irq[1]);
-                               debug("(*cur_slot)->irq[2] = %x\n",
-                                               (*cur_slot)->irq[2]);
-                               debug("(*cur_slot)->irq[3] = %x\n",
-                                               (*cur_slot)->irq[3]);
-
-                               debug("rtable->exlusive_irqs = %x\n",
+                                               (int) (*cur_slot)->device, i,
+                                               &irq_attr);
+
+                       debug("(*cur_slot)->irq[0] = %x\n",
+                                       (*cur_slot)->irq[0]);
+                       debug("(*cur_slot)->irq[1] = %x\n",
+                                       (*cur_slot)->irq[1]);
+                       debug("(*cur_slot)->irq[2] = %x\n",
+                                       (*cur_slot)->irq[2]);
+                       debug("(*cur_slot)->irq[3] = %x\n",
+                                       (*cur_slot)->irq[3]);
+
+                       debug("rtable->exlusive_irqs = %x\n",
                                         rtable->exclusive_irqs);
-                               debug("rtable->slots[loop].irq[0].bitmap = %x\n",
+                       debug("rtable->slots[loop].irq[0].bitmap = %x\n",
                                         rtable->slots[loop].irq[0].bitmap);
-                               debug("rtable->slots[loop].irq[1].bitmap = %x\n",
+                       debug("rtable->slots[loop].irq[1].bitmap = %x\n",
                                         rtable->slots[loop].irq[1].bitmap);
-                               debug("rtable->slots[loop].irq[2].bitmap = %x\n",
+                       debug("rtable->slots[loop].irq[2].bitmap = %x\n",
                                         rtable->slots[loop].irq[2].bitmap);
-                               debug("rtable->slots[loop].irq[3].bitmap = %x\n",
+                       debug("rtable->slots[loop].irq[3].bitmap = %x\n",
                                         rtable->slots[loop].irq[3].bitmap);
  
-                               debug("rtable->slots[loop].irq[0].link = %x\n",
+                       debug("rtable->slots[loop].irq[0].link = %x\n",
                                         rtable->slots[loop].irq[0].link);
-                               debug("rtable->slots[loop].irq[1].link = %x\n",
+                       debug("rtable->slots[loop].irq[1].link = %x\n",
                                         rtable->slots[loop].irq[1].link);
-                               debug("rtable->slots[loop].irq[2].link = %x\n",
+                       debug("rtable->slots[loop].irq[2].link = %x\n",
                                         rtable->slots[loop].irq[2].link);
-                               debug("rtable->slots[loop].irq[3].link = %x\n",
+                       debug("rtable->slots[loop].irq[3].link = %x\n",
                                         rtable->slots[loop].irq[3].link);
-                               debug("end of init_devno\n");
-                               kfree(rtable);
-                               return 0;
-                       }
+                       debug("end of init_devno\n");
+                       kfree(rtable);
+                       return 0;
                 }
         }
  
diff --git a/drivers/pci/htirq.c b/drivers/pci/htirq.c

index 6808d8333ecc414390590f83122f4c6602b01cef..737a1c44b07af9a8ed81ed6e2191fce7934603fd 100644 (file)
--- a/drivers/pci/htirq.c
+++ b/drivers/pci/htirq.c
@@ -98,6 +98,7 @@ int __ht_create_irq(struct pci_dev *dev, int idx, ht_irq_update_t *update)
         int max_irq;
         int pos;
         int irq;
+       int node;
  
         pos = pci_find_ht_capability(dev, HT_CAPTYPE_IRQ);
         if (!pos)
@@ -125,7 +126,8 @@ int __ht_create_irq(struct pci_dev *dev, int idx, ht_irq_update_t *update)
         cfg->msg.address_lo = 0xffffffff;
         cfg->msg.address_hi = 0xffffffff;
  
-       irq = create_irq();
+       node = dev_to_node(&dev->dev);
+       irq = create_irq_nr(0, node);
  
         if (irq <= 0) {
                 kfree(cfg);
diff --git a/drivers/pci/intel-iommu.c b/drivers/pci/intel-iommu.c

index a563fbe559d0505b90d1f736beb3a8e8bf197717..cd389162735f3f9ff03dd596d549134ae052de5a 100644 (file)
--- a/drivers/pci/intel-iommu.c
+++ b/drivers/pci/intel-iommu.c
@@ -1972,15 +1972,6 @@ static int __init init_dmars(void)
                 }
         }
  
-#ifdef CONFIG_INTR_REMAP
-       if (!intr_remapping_enabled) {
-               ret = enable_intr_remapping(0);
-               if (ret)
-                       printk(KERN_ERR
-                              "IOMMU: enable interrupt remapping failed\n");
-       }
-#endif
-
         /*
          * For each rmrr
          *   for each dev attached to rmrr
diff --git a/drivers/pci/intr_remapping.c b/drivers/pci/intr_remapping.c

index f5e0ea724a6f53a12d26443fba84b01ab898f35d..3a0cb0bb05933bd72277df67af0cfe2254a7b6e4 100644 (file)
--- a/drivers/pci/intr_remapping.c
+++ b/drivers/pci/intr_remapping.c
@@ -15,6 +15,14 @@ static struct ioapic_scope ir_ioapic[MAX_IO_APICS];
  static int ir_ioapic_num;
  int intr_remapping_enabled;
  
+static int disable_intremap;
+static __init int setup_nointremap(char *str)
+{
+       disable_intremap = 1;
+       return 0;
+}
+early_param("nointremap", setup_nointremap);
+
  struct irq_2_iommu {
         struct intel_iommu *iommu;
         u16 irte_index;
@@ -23,15 +31,12 @@ struct irq_2_iommu {
  };
  
  #ifdef CONFIG_GENERIC_HARDIRQS
-static struct irq_2_iommu *get_one_free_irq_2_iommu(int cpu)
+static struct irq_2_iommu *get_one_free_irq_2_iommu(int node)
  {
         struct irq_2_iommu *iommu;
-       int node;
-
-       node = cpu_to_node(cpu);
  
         iommu = kzalloc_node(sizeof(*iommu), GFP_ATOMIC, node);
-       printk(KERN_DEBUG "alloc irq_2_iommu on cpu %d node %d\n", cpu, node);
+       printk(KERN_DEBUG "alloc irq_2_iommu on node %d\n", node);
  
         return iommu;
  }
@@ -48,7 +53,7 @@ static struct irq_2_iommu *irq_2_iommu(unsigned int irq)
         return desc->irq_2_iommu;
  }
  
-static struct irq_2_iommu *irq_2_iommu_alloc_cpu(unsigned int irq, int cpu)
+static struct irq_2_iommu *irq_2_iommu_alloc_node(unsigned int irq, int node)
  {
         struct irq_desc *desc;
         struct irq_2_iommu *irq_iommu;
@@ -56,7 +61,7 @@ static struct irq_2_iommu *irq_2_iommu_alloc_cpu(unsigned int irq, int cpu)
         /*
          * alloc irq desc if not allocated already.
          */
-       desc = irq_to_desc_alloc_cpu(irq, cpu);
+       desc = irq_to_desc_alloc_node(irq, node);
         if (!desc) {
                 printk(KERN_INFO "can not get irq_desc for %d\n", irq);
                 return NULL;
@@ -65,14 +70,14 @@ static struct irq_2_iommu *irq_2_iommu_alloc_cpu(unsigned int irq, int cpu)
         irq_iommu = desc->irq_2_iommu;
  
         if (!irq_iommu)
-               desc->irq_2_iommu = get_one_free_irq_2_iommu(cpu);
+               desc->irq_2_iommu = get_one_free_irq_2_iommu(node);
  
         return desc->irq_2_iommu;
  }
  
  static struct irq_2_iommu *irq_2_iommu_alloc(unsigned int irq)
  {
-       return irq_2_iommu_alloc_cpu(irq, boot_cpu_id);
+       return irq_2_iommu_alloc_node(irq, cpu_to_node(boot_cpu_id));
  }
  
  #else /* !CONFIG_SPARSE_IRQ */
@@ -423,20 +428,6 @@ static void iommu_set_intr_remapping(struct intel_iommu *iommu, int mode)
                       readl, (sts & DMA_GSTS_IRTPS), sts);
         spin_unlock_irqrestore(&iommu->register_lock, flags);
  
-       if (mode == 0) {
-               spin_lock_irqsave(&iommu->register_lock, flags);
-
-               /* enable comaptiblity format interrupt pass through */
-               cmd = iommu->gcmd | DMA_GCMD_CFI;
-               iommu->gcmd |= DMA_GCMD_CFI;
-               writel(cmd, iommu->reg + DMAR_GCMD_REG);
-
-               IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
-                             readl, (sts & DMA_GSTS_CFIS), sts);
-
-               spin_unlock_irqrestore(&iommu->register_lock, flags);
-       }
-
         /*
          * global invalidation of interrupt entry cache before enabling
          * interrupt-remapping.
@@ -516,6 +507,23 @@ end:
         spin_unlock_irqrestore(&iommu->register_lock, flags);
  }
  
+int __init intr_remapping_supported(void)
+{
+       struct dmar_drhd_unit *drhd;
+
+       if (disable_intremap)
+               return 0;
+
+       for_each_drhd_unit(drhd) {
+               struct intel_iommu *iommu = drhd->iommu;
+
+               if (!ecap_ir_support(iommu->ecap))
+                       return 0;
+       }
+
+       return 1;
+}
+
  int __init enable_intr_remapping(int eim)
  {
         struct dmar_drhd_unit *drhd;
diff --git a/drivers/pnp/pnpacpi/rsparser.c b/drivers/pnp/pnpacpi/rsparser.c

index adf17856bacc187a8216af9ded5ec3cf2ed2662b..7f207f335beca2c53d8120c3a4681ff139cb7282 100644 (file)
--- a/drivers/pnp/pnpacpi/rsparser.c
+++ b/drivers/pnp/pnpacpi/rsparser.c
@@ -123,7 +123,7 @@ static void pnpacpi_parse_allocated_irqresource(struct pnp_dev *dev,
         }
  
         flags = irq_flags(triggering, polarity, shareable);
-       irq = acpi_register_gsi(gsi, triggering, polarity);
+       irq = acpi_register_gsi(&dev->dev, gsi, triggering, polarity);
         if (irq >= 0)
                 pcibios_penalize_isa_irq(irq, 1);
         else
diff --git a/drivers/scsi/sg.c b/drivers/scsi/sg.c

index e1716f14cd4710cca0363eab985cdf370d9b9879..91e316fe6522f00a837a580808b1c9164f067748 100644 (file)
--- a/drivers/scsi/sg.c
+++ b/drivers/scsi/sg.c
@@ -1065,6 +1065,7 @@ sg_ioctl(struct inode *inode, struct file *filp,
                 return blk_trace_setup(sdp->device->request_queue,
                                        sdp->disk->disk_name,
                                        MKDEV(SCSI_GENERIC_MAJOR, sdp->index),
+                                      NULL,
                                        (char *)arg);
         case BLKTRACESTART:
                 return blk_trace_startstop(sdp->device->request_queue, 1);
diff --git a/drivers/xen/Kconfig b/drivers/xen/Kconfig

index 8ac9cddac5754fdbf8ef9df33550a47bfba318f4..cab100acf983905b53d5c85b6ae76496d7b9b7c8 100644 (file)
--- a/drivers/xen/Kconfig
+++ b/drivers/xen/Kconfig
@@ -18,6 +18,16 @@ config XEN_SCRUB_PAGES
           secure, but slightly less efficient.
           If in doubt, say yes.
  
+config XEN_DEV_EVTCHN
+       tristate "Xen /dev/xen/evtchn device"
+       depends on XEN
+       default y
+       help
+         The evtchn driver allows a userspace process to triger event
+         channels and to receive notification of an event channel
+         firing.
+         If in doubt, say yes.
+
  config XENFS
         tristate "Xen filesystem"
         depends on XEN
@@ -41,3 +51,13 @@ config XEN_COMPAT_XENFS
           a xen platform.
           If in doubt, say yes.
  
+config XEN_SYS_HYPERVISOR
+       bool "Create xen entries under /sys/hypervisor"
+       depends on XEN && SYSFS
+       select SYS_HYPERVISOR
+       default y
+       help
+         Create entries under /sys/hypervisor describing the Xen
+        hypervisor environment.  When running native or in another
+        virtual environment, /sys/hypervisor will still be present,
+        but will have no xen contents.
+\ No newline at end of file
diff --git a/drivers/xen/Makefile b/drivers/xen/Makefile

index ff8accc9e103f9d2a4fc21dcbcc2e88d479a0ade..ec2a39b1e26f0438f6a2266cd77ff6173fd7bbc7 100644 (file)
--- a/drivers/xen/Makefile
+++ b/drivers/xen/Makefile
@@ -4,4 +4,6 @@ obj-y   += xenbus/
  obj-$(CONFIG_HOTPLUG_CPU)      += cpu_hotplug.o
  obj-$(CONFIG_XEN_XENCOMM)      += xencomm.o
  obj-$(CONFIG_XEN_BALLOON)      += balloon.o
-obj-$(CONFIG_XENFS)            += xenfs/
-\ No newline at end of file
+obj-$(CONFIG_XEN_DEV_EVTCHN)   += evtchn.o
+obj-$(CONFIG_XENFS)            += xenfs/
+obj-$(CONFIG_XEN_SYS_HYPERVISOR)       += sys-hypervisor.o
+\ No newline at end of file
diff --git a/drivers/xen/events.c b/drivers/xen/events.c

index 30963af5dba02960143f9ebf37b6f85d44de0dec..891d2e90753ab6e9dd312b3120ce6904a96058c2 100644 (file)
--- a/drivers/xen/events.c
+++ b/drivers/xen/events.c
@@ -151,6 +151,12 @@ static unsigned int evtchn_from_irq(unsigned irq)
         return info_for_irq(irq)->evtchn;
  }
  
+unsigned irq_from_evtchn(unsigned int evtchn)
+{
+       return evtchn_to_irq[evtchn];
+}
+EXPORT_SYMBOL_GPL(irq_from_evtchn);
+
  static enum ipi_vector ipi_from_irq(unsigned irq)
  {
         struct irq_info *info = info_for_irq(irq);
@@ -335,7 +341,7 @@ static int find_unbound_irq(void)
         if (irq == nr_irqs)
                 panic("No available IRQ to bind to: increase nr_irqs!\n");
  
-       desc = irq_to_desc_alloc_cpu(irq, 0);
+       desc = irq_to_desc_alloc_node(irq, 0);
         if (WARN_ON(desc == NULL))
                 return -1;
  
@@ -688,13 +694,13 @@ void rebind_evtchn_irq(int evtchn, int irq)
  }
  
  /* Rebind an evtchn so that it gets delivered to a specific cpu */
-static void rebind_irq_to_cpu(unsigned irq, unsigned tcpu)
+static int rebind_irq_to_cpu(unsigned irq, unsigned tcpu)
  {
         struct evtchn_bind_vcpu bind_vcpu;
         int evtchn = evtchn_from_irq(irq);
  
         if (!VALID_EVTCHN(evtchn))
-               return;
+               return -1;
  
         /* Send future instances of this interrupt to other vcpu. */
         bind_vcpu.port = evtchn;
@@ -707,13 +713,15 @@ static void rebind_irq_to_cpu(unsigned irq, unsigned tcpu)
          */
         if (HYPERVISOR_event_channel_op(EVTCHNOP_bind_vcpu, &bind_vcpu) >= 0)
                 bind_evtchn_to_cpu(evtchn, tcpu);
-}
  
+       return 0;
+}
  
-static void set_affinity_irq(unsigned irq, const struct cpumask *dest)
+static int set_affinity_irq(unsigned irq, const struct cpumask *dest)
  {
         unsigned tcpu = cpumask_first(dest);
-       rebind_irq_to_cpu(irq, tcpu);
+
+       return rebind_irq_to_cpu(irq, tcpu);
  }
  
  int resend_irq_on_evtchn(unsigned int irq)
diff --git a/drivers/xen/evtchn.c b/drivers/xen/evtchn.c

new file mode 100644 (file)

index 0000000..af03195
--- /dev/null
+++ b/drivers/xen/evtchn.c
@@ -0,0 +1,507 @@
+/******************************************************************************
+ * evtchn.c
+ *
+ * Driver for receiving and demuxing event-channel signals.
+ *
+ * Copyright (c) 2004-2005, K A Fraser
+ * Multi-process extensions Copyright (c) 2004, Steven Smith
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License version 2
+ * as published by the Free Software Foundation; or, when distributed
+ * separately from the Linux kernel or incorporated into other
+ * software packages, subject to the following license:
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this source file (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy, modify,
+ * merge, publish, distribute, sublicense, and/or sell copies of the Software,
+ * and to permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/string.h>
+#include <linux/errno.h>
+#include <linux/fs.h>
+#include <linux/errno.h>
+#include <linux/miscdevice.h>
+#include <linux/major.h>
+#include <linux/proc_fs.h>
+#include <linux/stat.h>
+#include <linux/poll.h>
+#include <linux/irq.h>
+#include <linux/init.h>
+#include <linux/gfp.h>
+#include <linux/mutex.h>
+#include <linux/cpu.h>
+#include <xen/events.h>
+#include <xen/evtchn.h>
+#include <asm/xen/hypervisor.h>
+
+struct per_user_data {
+       struct mutex bind_mutex; /* serialize bind/unbind operations */
+
+       /* Notification ring, accessed via /dev/xen/evtchn. */
+#define EVTCHN_RING_SIZE     (PAGE_SIZE / sizeof(evtchn_port_t))
+#define EVTCHN_RING_MASK(_i) ((_i)&(EVTCHN_RING_SIZE-1))
+       evtchn_port_t *ring;
+       unsigned int ring_cons, ring_prod, ring_overflow;
+       struct mutex ring_cons_mutex; /* protect against concurrent readers */
+
+       /* Processes wait on this queue when ring is empty. */
+       wait_queue_head_t evtchn_wait;
+       struct fasync_struct *evtchn_async_queue;
+       const char *name;
+};
+
+/* Who's bound to each port? */
+static struct per_user_data *port_user[NR_EVENT_CHANNELS];
+static DEFINE_SPINLOCK(port_user_lock); /* protects port_user[] and ring_prod */
+
+irqreturn_t evtchn_interrupt(int irq, void *data)
+{
+       unsigned int port = (unsigned long)data;
+       struct per_user_data *u;
+
+       spin_lock(&port_user_lock);
+
+       u = port_user[port];
+
+       disable_irq_nosync(irq);
+
+       if ((u->ring_prod - u->ring_cons) < EVTCHN_RING_SIZE) {
+               u->ring[EVTCHN_RING_MASK(u->ring_prod)] = port;
+               wmb(); /* Ensure ring contents visible */
+               if (u->ring_cons == u->ring_prod++) {
+                       wake_up_interruptible(&u->evtchn_wait);
+                       kill_fasync(&u->evtchn_async_queue,
+                                   SIGIO, POLL_IN);
+               }
+       } else {
+               u->ring_overflow = 1;
+       }
+
+       spin_unlock(&port_user_lock);
+
+       return IRQ_HANDLED;
+}
+
+static ssize_t evtchn_read(struct file *file, char __user *buf,
+                          size_t count, loff_t *ppos)
+{
+       int rc;
+       unsigned int c, p, bytes1 = 0, bytes2 = 0;
+       struct per_user_data *u = file->private_data;
+
+       /* Whole number of ports. */
+       count &= ~(sizeof(evtchn_port_t)-1);
+
+       if (count == 0)
+               return 0;
+
+       if (count > PAGE_SIZE)
+               count = PAGE_SIZE;
+
+       for (;;) {
+               mutex_lock(&u->ring_cons_mutex);
+
+               rc = -EFBIG;
+               if (u->ring_overflow)
+                       goto unlock_out;
+
+               c = u->ring_cons;
+               p = u->ring_prod;
+               if (c != p)
+                       break;
+
+               mutex_unlock(&u->ring_cons_mutex);
+
+               if (file->f_flags & O_NONBLOCK)
+                       return -EAGAIN;
+
+               rc = wait_event_interruptible(u->evtchn_wait,
+                                             u->ring_cons != u->ring_prod);
+               if (rc)
+                       return rc;
+       }
+
+       /* Byte lengths of two chunks. Chunk split (if any) is at ring wrap. */
+       if (((c ^ p) & EVTCHN_RING_SIZE) != 0) {
+               bytes1 = (EVTCHN_RING_SIZE - EVTCHN_RING_MASK(c)) *
+                       sizeof(evtchn_port_t);
+               bytes2 = EVTCHN_RING_MASK(p) * sizeof(evtchn_port_t);
+       } else {
+               bytes1 = (p - c) * sizeof(evtchn_port_t);
+               bytes2 = 0;
+       }
+
+       /* Truncate chunks according to caller's maximum byte count. */
+       if (bytes1 > count) {
+               bytes1 = count;
+               bytes2 = 0;
+       } else if ((bytes1 + bytes2) > count) {
+               bytes2 = count - bytes1;
+       }
+
+       rc = -EFAULT;
+       rmb(); /* Ensure that we see the port before we copy it. */
+       if (copy_to_user(buf, &u->ring[EVTCHN_RING_MASK(c)], bytes1) ||
+           ((bytes2 != 0) &&
+            copy_to_user(&buf[bytes1], &u->ring[0], bytes2)))
+               goto unlock_out;
+
+       u->ring_cons += (bytes1 + bytes2) / sizeof(evtchn_port_t);
+       rc = bytes1 + bytes2;
+
+ unlock_out:
+       mutex_unlock(&u->ring_cons_mutex);
+       return rc;
+}
+
+static ssize_t evtchn_write(struct file *file, const char __user *buf,
+                           size_t count, loff_t *ppos)
+{
+       int rc, i;
+       evtchn_port_t *kbuf = (evtchn_port_t *)__get_free_page(GFP_KERNEL);
+       struct per_user_data *u = file->private_data;
+
+       if (kbuf == NULL)
+               return -ENOMEM;
+
+       /* Whole number of ports. */
+       count &= ~(sizeof(evtchn_port_t)-1);
+
+       rc = 0;
+       if (count == 0)
+               goto out;
+
+       if (count > PAGE_SIZE)
+               count = PAGE_SIZE;
+
+       rc = -EFAULT;
+       if (copy_from_user(kbuf, buf, count) != 0)
+               goto out;
+
+       spin_lock_irq(&port_user_lock);
+       for (i = 0; i < (count/sizeof(evtchn_port_t)); i++)
+               if ((kbuf[i] < NR_EVENT_CHANNELS) && (port_user[kbuf[i]] == u))
+                       enable_irq(irq_from_evtchn(kbuf[i]));
+       spin_unlock_irq(&port_user_lock);
+
+       rc = count;
+
+ out:
+       free_page((unsigned long)kbuf);
+       return rc;
+}
+
+static int evtchn_bind_to_user(struct per_user_data *u, int port)
+{
+       int rc = 0;
+
+       /*
+        * Ports are never reused, so every caller should pass in a
+        * unique port.
+        *
+        * (Locking not necessary because we haven't registered the
+        * interrupt handler yet, and our caller has already
+        * serialized bind operations.)
+        */
+       BUG_ON(port_user[port] != NULL);
+       port_user[port] = u;
+
+       rc = bind_evtchn_to_irqhandler(port, evtchn_interrupt, IRQF_DISABLED,
+                                      u->name, (void *)(unsigned long)port);
+       if (rc >= 0)
+               rc = 0;
+
+       return rc;
+}
+
+static void evtchn_unbind_from_user(struct per_user_data *u, int port)
+{
+       int irq = irq_from_evtchn(port);
+
+       unbind_from_irqhandler(irq, (void *)(unsigned long)port);
+
+       /* make sure we unbind the irq handler before clearing the port */
+       barrier();
+
+       port_user[port] = NULL;
+}
+
+static long evtchn_ioctl(struct file *file,
+                        unsigned int cmd, unsigned long arg)
+{
+       int rc;
+       struct per_user_data *u = file->private_data;
+       void __user *uarg = (void __user *) arg;
+
+       /* Prevent bind from racing with unbind */
+       mutex_lock(&u->bind_mutex);
+
+       switch (cmd) {
+       case IOCTL_EVTCHN_BIND_VIRQ: {
+               struct ioctl_evtchn_bind_virq bind;
+               struct evtchn_bind_virq bind_virq;
+
+               rc = -EFAULT;
+               if (copy_from_user(&bind, uarg, sizeof(bind)))
+                       break;
+
+               bind_virq.virq = bind.virq;
+               bind_virq.vcpu = 0;
+               rc = HYPERVISOR_event_channel_op(EVTCHNOP_bind_virq,
+                                                &bind_virq);
+               if (rc != 0)
+                       break;
+
+               rc = evtchn_bind_to_user(u, bind_virq.port);
+               if (rc == 0)
+                       rc = bind_virq.port;
+               break;
+       }
+
+       case IOCTL_EVTCHN_BIND_INTERDOMAIN: {
+               struct ioctl_evtchn_bind_interdomain bind;
+               struct evtchn_bind_interdomain bind_interdomain;
+
+               rc = -EFAULT;
+               if (copy_from_user(&bind, uarg, sizeof(bind)))
+                       break;
+
+               bind_interdomain.remote_dom  = bind.remote_domain;
+               bind_interdomain.remote_port = bind.remote_port;
+               rc = HYPERVISOR_event_channel_op(EVTCHNOP_bind_interdomain,
+                                                &bind_interdomain);
+               if (rc != 0)
+                       break;
+
+               rc = evtchn_bind_to_user(u, bind_interdomain.local_port);
+               if (rc == 0)
+                       rc = bind_interdomain.local_port;
+               break;
+       }
+
+       case IOCTL_EVTCHN_BIND_UNBOUND_PORT: {
+               struct ioctl_evtchn_bind_unbound_port bind;
+               struct evtchn_alloc_unbound alloc_unbound;
+
+               rc = -EFAULT;
+               if (copy_from_user(&bind, uarg, sizeof(bind)))
+                       break;
+
+               alloc_unbound.dom        = DOMID_SELF;
+               alloc_unbound.remote_dom = bind.remote_domain;
+               rc = HYPERVISOR_event_channel_op(EVTCHNOP_alloc_unbound,
+                                                &alloc_unbound);
+               if (rc != 0)
+                       break;
+
+               rc = evtchn_bind_to_user(u, alloc_unbound.port);
+               if (rc == 0)
+                       rc = alloc_unbound.port;
+               break;
+       }
+
+       case IOCTL_EVTCHN_UNBIND: {
+               struct ioctl_evtchn_unbind unbind;
+
+               rc = -EFAULT;
+               if (copy_from_user(&unbind, uarg, sizeof(unbind)))
+                       break;
+
+               rc = -EINVAL;
+               if (unbind.port >= NR_EVENT_CHANNELS)
+                       break;
+
+               spin_lock_irq(&port_user_lock);
+
+               rc = -ENOTCONN;
+               if (port_user[unbind.port] != u) {
+                       spin_unlock_irq(&port_user_lock);
+                       break;
+               }
+
+               evtchn_unbind_from_user(u, unbind.port);
+
+               spin_unlock_irq(&port_user_lock);
+
+               rc = 0;
+               break;
+       }
+
+       case IOCTL_EVTCHN_NOTIFY: {
+               struct ioctl_evtchn_notify notify;
+
+               rc = -EFAULT;
+               if (copy_from_user(&notify, uarg, sizeof(notify)))
+                       break;
+
+               if (notify.port >= NR_EVENT_CHANNELS) {
+                       rc = -EINVAL;
+               } else if (port_user[notify.port] != u) {
+                       rc = -ENOTCONN;
+               } else {
+                       notify_remote_via_evtchn(notify.port);
+                       rc = 0;
+               }
+               break;
+       }
+
+       case IOCTL_EVTCHN_RESET: {
+               /* Initialise the ring to empty. Clear errors. */
+               mutex_lock(&u->ring_cons_mutex);
+               spin_lock_irq(&port_user_lock);
+               u->ring_cons = u->ring_prod = u->ring_overflow = 0;
+               spin_unlock_irq(&port_user_lock);
+               mutex_unlock(&u->ring_cons_mutex);
+               rc = 0;
+               break;
+       }
+
+       default:
+               rc = -ENOSYS;
+               break;
+       }
+       mutex_unlock(&u->bind_mutex);
+
+       return rc;
+}
+
+static unsigned int evtchn_poll(struct file *file, poll_table *wait)
+{
+       unsigned int mask = POLLOUT | POLLWRNORM;
+       struct per_user_data *u = file->private_data;
+
+       poll_wait(file, &u->evtchn_wait, wait);
+       if (u->ring_cons != u->ring_prod)
+               mask |= POLLIN | POLLRDNORM;
+       if (u->ring_overflow)
+               mask = POLLERR;
+       return mask;
+}
+
+static int evtchn_fasync(int fd, struct file *filp, int on)
+{
+       struct per_user_data *u = filp->private_data;
+       return fasync_helper(fd, filp, on, &u->evtchn_async_queue);
+}
+
+static int evtchn_open(struct inode *inode, struct file *filp)
+{
+       struct per_user_data *u;
+
+       u = kzalloc(sizeof(*u), GFP_KERNEL);
+       if (u == NULL)
+               return -ENOMEM;
+
+       u->name = kasprintf(GFP_KERNEL, "evtchn:%s", current->comm);
+       if (u->name == NULL) {
+               kfree(u);
+               return -ENOMEM;
+       }
+
+       init_waitqueue_head(&u->evtchn_wait);
+
+       u->ring = (evtchn_port_t *)__get_free_page(GFP_KERNEL);
+       if (u->ring == NULL) {
+               kfree(u->name);
+               kfree(u);
+               return -ENOMEM;
+       }
+
+       mutex_init(&u->bind_mutex);
+       mutex_init(&u->ring_cons_mutex);
+
+       filp->private_data = u;
+
+       return 0;
+}
+
+static int evtchn_release(struct inode *inode, struct file *filp)
+{
+       int i;
+       struct per_user_data *u = filp->private_data;
+
+       spin_lock_irq(&port_user_lock);
+
+       free_page((unsigned long)u->ring);
+
+       for (i = 0; i < NR_EVENT_CHANNELS; i++) {
+               if (port_user[i] != u)
+                       continue;
+
+               evtchn_unbind_from_user(port_user[i], i);
+       }
+
+       spin_unlock_irq(&port_user_lock);
+
+       kfree(u->name);
+       kfree(u);
+
+       return 0;
+}
+
+static const struct file_operations evtchn_fops = {
+       .owner   = THIS_MODULE,
+       .read    = evtchn_read,
+       .write   = evtchn_write,
+       .unlocked_ioctl = evtchn_ioctl,
+       .poll    = evtchn_poll,
+       .fasync  = evtchn_fasync,
+       .open    = evtchn_open,
+       .release = evtchn_release,
+};
+
+static struct miscdevice evtchn_miscdev = {
+       .minor        = MISC_DYNAMIC_MINOR,
+       .name         = "evtchn",
+       .fops         = &evtchn_fops,
+};
+static int __init evtchn_init(void)
+{
+       int err;
+
+       if (!xen_domain())
+               return -ENODEV;
+
+       spin_lock_init(&port_user_lock);
+       memset(port_user, 0, sizeof(port_user));
+
+       /* Create '/dev/misc/evtchn'. */
+       err = misc_register(&evtchn_miscdev);
+       if (err != 0) {
+               printk(KERN_ALERT "Could not register /dev/misc/evtchn\n");
+               return err;
+       }
+
+       printk(KERN_INFO "Event-channel device installed.\n");
+
+       return 0;
+}
+
+static void __exit evtchn_cleanup(void)
+{
+       misc_deregister(&evtchn_miscdev);
+}
+
+module_init(evtchn_init);
+module_exit(evtchn_cleanup);
+
+MODULE_LICENSE("GPL");
diff --git a/drivers/xen/manage.c b/drivers/xen/manage.c

index 4b5b84837ee13c85fab14a73c7ab213f74af90e2..fddc2025dece777c513a7a7a72f8537751bf1834 100644 (file)
--- a/drivers/xen/manage.c
+++ b/drivers/xen/manage.c
@@ -98,9 +98,8 @@ static void do_suspend(void)
                 goto out;
         }
  
-       printk("suspending xenbus...\n");
-       /* XXX use normal device tree? */
-       xenbus_suspend();
+       printk(KERN_DEBUG "suspending xenstore...\n");
+       xs_suspend();
  
         err = device_power_down(PMSG_SUSPEND);
         if (err) {
@@ -116,9 +115,9 @@ static void do_suspend(void)
  
         if (!cancelled) {
                 xen_arch_resume();
-               xenbus_resume();
+               xs_resume();
         } else
-               xenbus_suspend_cancel();
+               xs_suspend_cancel();
  
         device_power_up(PMSG_RESUME);
  
diff --git a/drivers/xen/sys-hypervisor.c b/drivers/xen/sys-hypervisor.c

new file mode 100644 (file)

index 0000000..88a60e0
--- /dev/null
+++ b/drivers/xen/sys-hypervisor.c
@@ -0,0 +1,445 @@
+/*
+ *  copyright (c) 2006 IBM Corporation
+ *  Authored by: Mike D. Day <ncmike@us.ibm.com>
+ *
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License version 2 as
+ *  published by the Free Software Foundation.
+ */
+
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/kobject.h>
+
+#include <asm/xen/hypervisor.h>
+#include <asm/xen/hypercall.h>
+
+#include <xen/xenbus.h>
+#include <xen/interface/xen.h>
+#include <xen/interface/version.h>
+
+#define HYPERVISOR_ATTR_RO(_name) \
+static struct hyp_sysfs_attr  _name##_attr = __ATTR_RO(_name)
+
+#define HYPERVISOR_ATTR_RW(_name) \
+static struct hyp_sysfs_attr _name##_attr = \
+       __ATTR(_name, 0644, _name##_show, _name##_store)
+
+struct hyp_sysfs_attr {
+       struct attribute attr;
+       ssize_t (*show)(struct hyp_sysfs_attr *, char *);
+       ssize_t (*store)(struct hyp_sysfs_attr *, const char *, size_t);
+       void *hyp_attr_data;
+};
+
+static ssize_t type_show(struct hyp_sysfs_attr *attr, char *buffer)
+{
+       return sprintf(buffer, "xen\n");
+}
+
+HYPERVISOR_ATTR_RO(type);
+
+static int __init xen_sysfs_type_init(void)
+{
+       return sysfs_create_file(hypervisor_kobj, &type_attr.attr);
+}
+
+static void xen_sysfs_type_destroy(void)
+{
+       sysfs_remove_file(hypervisor_kobj, &type_attr.attr);
+}
+
+/* xen version attributes */
+static ssize_t major_show(struct hyp_sysfs_attr *attr, char *buffer)
+{
+       int version = HYPERVISOR_xen_version(XENVER_version, NULL);
+       if (version)
+               return sprintf(buffer, "%d\n", version >> 16);
+       return -ENODEV;
+}
+
+HYPERVISOR_ATTR_RO(major);
+
+static ssize_t minor_show(struct hyp_sysfs_attr *attr, char *buffer)
+{
+       int version = HYPERVISOR_xen_version(XENVER_version, NULL);
+       if (version)
+               return sprintf(buffer, "%d\n", version & 0xff);
+       return -ENODEV;
+}
+
+HYPERVISOR_ATTR_RO(minor);
+
+static ssize_t extra_show(struct hyp_sysfs_attr *attr, char *buffer)
+{
+       int ret = -ENOMEM;
+       char *extra;
+
+       extra = kmalloc(XEN_EXTRAVERSION_LEN, GFP_KERNEL);
+       if (extra) {
+               ret = HYPERVISOR_xen_version(XENVER_extraversion, extra);
+               if (!ret)
+                       ret = sprintf(buffer, "%s\n", extra);
+               kfree(extra);
+       }
+
+       return ret;
+}
+
+HYPERVISOR_ATTR_RO(extra);
+
+static struct attribute *version_attrs[] = {
+       &major_attr.attr,
+       &minor_attr.attr,
+       &extra_attr.attr,
+       NULL
+};
+
+static struct attribute_group version_group = {
+       .name = "version",
+       .attrs = version_attrs,
+};
+
+static int __init xen_sysfs_version_init(void)
+{
+       return sysfs_create_group(hypervisor_kobj, &version_group);
+}
+
+static void xen_sysfs_version_destroy(void)
+{
+       sysfs_remove_group(hypervisor_kobj, &version_group);
+}
+
+/* UUID */
+
+static ssize_t uuid_show(struct hyp_sysfs_attr *attr, char *buffer)
+{
+       char *vm, *val;
+       int ret;
+       extern int xenstored_ready;
+
+       if (!xenstored_ready)
+               return -EBUSY;
+
+       vm = xenbus_read(XBT_NIL, "vm", "", NULL);
+       if (IS_ERR(vm))
+               return PTR_ERR(vm);
+       val = xenbus_read(XBT_NIL, vm, "uuid", NULL);
+       kfree(vm);
+       if (IS_ERR(val))
+               return PTR_ERR(val);
+       ret = sprintf(buffer, "%s\n", val);
+       kfree(val);
+       return ret;
+}
+
+HYPERVISOR_ATTR_RO(uuid);
+
+static int __init xen_sysfs_uuid_init(void)
+{
+       return sysfs_create_file(hypervisor_kobj, &uuid_attr.attr);
+}
+
+static void xen_sysfs_uuid_destroy(void)
+{
+       sysfs_remove_file(hypervisor_kobj, &uuid_attr.attr);
+}
+
+/* xen compilation attributes */
+
+static ssize_t compiler_show(struct hyp_sysfs_attr *attr, char *buffer)
+{
+       int ret = -ENOMEM;
+       struct xen_compile_info *info;
+
+       info = kmalloc(sizeof(struct xen_compile_info), GFP_KERNEL);
+       if (info) {
+               ret = HYPERVISOR_xen_version(XENVER_compile_info, info);
+               if (!ret)
+                       ret = sprintf(buffer, "%s\n", info->compiler);
+               kfree(info);
+       }
+
+       return ret;
+}
+
+HYPERVISOR_ATTR_RO(compiler);
+
+static ssize_t compiled_by_show(struct hyp_sysfs_attr *attr, char *buffer)
+{
+       int ret = -ENOMEM;
+       struct xen_compile_info *info;
+
+       info = kmalloc(sizeof(struct xen_compile_info), GFP_KERNEL);
+       if (info) {
+               ret = HYPERVISOR_xen_version(XENVER_compile_info, info);
+               if (!ret)
+                       ret = sprintf(buffer, "%s\n", info->compile_by);
+               kfree(info);
+       }
+
+       return ret;
+}
+
+HYPERVISOR_ATTR_RO(compiled_by);
+
+static ssize_t compile_date_show(struct hyp_sysfs_attr *attr, char *buffer)
+{
+       int ret = -ENOMEM;
+       struct xen_compile_info *info;
+
+       info = kmalloc(sizeof(struct xen_compile_info), GFP_KERNEL);
+       if (info) {
+               ret = HYPERVISOR_xen_version(XENVER_compile_info, info);
+               if (!ret)
+                       ret = sprintf(buffer, "%s\n", info->compile_date);
+               kfree(info);
+       }
+
+       return ret;
+}
+
+HYPERVISOR_ATTR_RO(compile_date);
+
+static struct attribute *xen_compile_attrs[] = {
+       &compiler_attr.attr,
+       &compiled_by_attr.attr,
+       &compile_date_attr.attr,
+       NULL
+};
+
+static struct attribute_group xen_compilation_group = {
+       .name = "compilation",
+       .attrs = xen_compile_attrs,
+};
+
+int __init static xen_compilation_init(void)
+{
+       return sysfs_create_group(hypervisor_kobj, &xen_compilation_group);
+}
+
+static void xen_compilation_destroy(void)
+{
+       sysfs_remove_group(hypervisor_kobj, &xen_compilation_group);
+}
+
+/* xen properties info */
+
+static ssize_t capabilities_show(struct hyp_sysfs_attr *attr, char *buffer)
+{
+       int ret = -ENOMEM;
+       char *caps;
+
+       caps = kmalloc(XEN_CAPABILITIES_INFO_LEN, GFP_KERNEL);
+       if (caps) {
+               ret = HYPERVISOR_xen_version(XENVER_capabilities, caps);
+               if (!ret)
+                       ret = sprintf(buffer, "%s\n", caps);
+               kfree(caps);
+       }
+
+       return ret;
+}
+
+HYPERVISOR_ATTR_RO(capabilities);
+
+static ssize_t changeset_show(struct hyp_sysfs_attr *attr, char *buffer)
+{
+       int ret = -ENOMEM;
+       char *cset;
+
+       cset = kmalloc(XEN_CHANGESET_INFO_LEN, GFP_KERNEL);
+       if (cset) {
+               ret = HYPERVISOR_xen_version(XENVER_changeset, cset);
+               if (!ret)
+                       ret = sprintf(buffer, "%s\n", cset);
+               kfree(cset);
+       }
+
+       return ret;
+}
+
+HYPERVISOR_ATTR_RO(changeset);
+
+static ssize_t virtual_start_show(struct hyp_sysfs_attr *attr, char *buffer)
+{
+       int ret = -ENOMEM;
+       struct xen_platform_parameters *parms;
+
+       parms = kmalloc(sizeof(struct xen_platform_parameters), GFP_KERNEL);
+       if (parms) {
+               ret = HYPERVISOR_xen_version(XENVER_platform_parameters,
+                                            parms);
+               if (!ret)
+                       ret = sprintf(buffer, "%lx\n", parms->virt_start);
+               kfree(parms);
+       }
+
+       return ret;
+}
+
+HYPERVISOR_ATTR_RO(virtual_start);
+
+static ssize_t pagesize_show(struct hyp_sysfs_attr *attr, char *buffer)
+{
+       int ret;
+
+       ret = HYPERVISOR_xen_version(XENVER_pagesize, NULL);
+       if (ret > 0)
+               ret = sprintf(buffer, "%x\n", ret);
+
+       return ret;
+}
+
+HYPERVISOR_ATTR_RO(pagesize);
+
+static ssize_t xen_feature_show(int index, char *buffer)
+{
+       ssize_t ret;
+       struct xen_feature_info info;
+
+       info.submap_idx = index;
+       ret = HYPERVISOR_xen_version(XENVER_get_features, &info);
+       if (!ret)
+               ret = sprintf(buffer, "%08x", info.submap);
+
+       return ret;
+}
+
+static ssize_t features_show(struct hyp_sysfs_attr *attr, char *buffer)
+{
+       ssize_t len;
+       int i;
+
+       len = 0;
+       for (i = XENFEAT_NR_SUBMAPS-1; i >= 0; i--) {
+               int ret = xen_feature_show(i, buffer + len);
+               if (ret < 0) {
+                       if (len == 0)
+                               len = ret;
+                       break;
+               }
+               len += ret;
+       }
+       if (len > 0)
+               buffer[len++] = '\n';
+
+       return len;
+}
+
+HYPERVISOR_ATTR_RO(features);
+
+static struct attribute *xen_properties_attrs[] = {
+       &capabilities_attr.attr,
+       &changeset_attr.attr,
+       &virtual_start_attr.attr,
+       &pagesize_attr.attr,
+       &features_attr.attr,
+       NULL
+};
+
+static struct attribute_group xen_properties_group = {
+       .name = "properties",
+       .attrs = xen_properties_attrs,
+};
+
+static int __init xen_properties_init(void)
+{
+       return sysfs_create_group(hypervisor_kobj, &xen_properties_group);
+}
+
+static void xen_properties_destroy(void)
+{
+       sysfs_remove_group(hypervisor_kobj, &xen_properties_group);
+}
+
+static int __init hyper_sysfs_init(void)
+{
+       int ret;
+
+       if (!xen_domain())
+               return -ENODEV;
+
+       ret = xen_sysfs_type_init();
+       if (ret)
+               goto out;
+       ret = xen_sysfs_version_init();
+       if (ret)
+               goto version_out;
+       ret = xen_compilation_init();
+       if (ret)
+               goto comp_out;
+       ret = xen_sysfs_uuid_init();
+       if (ret)
+               goto uuid_out;
+       ret = xen_properties_init();
+       if (ret)
+               goto prop_out;
+
+       goto out;
+
+prop_out:
+       xen_sysfs_uuid_destroy();
+uuid_out:
+       xen_compilation_destroy();
+comp_out:
+       xen_sysfs_version_destroy();
+version_out:
+       xen_sysfs_type_destroy();
+out:
+       return ret;
+}
+
+static void __exit hyper_sysfs_exit(void)
+{
+       xen_properties_destroy();
+       xen_compilation_destroy();
+       xen_sysfs_uuid_destroy();
+       xen_sysfs_version_destroy();
+       xen_sysfs_type_destroy();
+
+}
+module_init(hyper_sysfs_init);
+module_exit(hyper_sysfs_exit);
+
+static ssize_t hyp_sysfs_show(struct kobject *kobj,
+                             struct attribute *attr,
+                             char *buffer)
+{
+       struct hyp_sysfs_attr *hyp_attr;
+       hyp_attr = container_of(attr, struct hyp_sysfs_attr, attr);
+       if (hyp_attr->show)
+               return hyp_attr->show(hyp_attr, buffer);
+       return 0;
+}
+
+static ssize_t hyp_sysfs_store(struct kobject *kobj,
+                              struct attribute *attr,
+                              const char *buffer,
+                              size_t len)
+{
+       struct hyp_sysfs_attr *hyp_attr;
+       hyp_attr = container_of(attr, struct hyp_sysfs_attr, attr);
+       if (hyp_attr->store)
+               return hyp_attr->store(hyp_attr, buffer, len);
+       return 0;
+}
+
+static struct sysfs_ops hyp_sysfs_ops = {
+       .show = hyp_sysfs_show,
+       .store = hyp_sysfs_store,
+};
+
+static struct kobj_type hyp_sysfs_kobj_type = {
+       .sysfs_ops = &hyp_sysfs_ops,
+};
+
+static int __init hypervisor_subsys_init(void)
+{
+       if (!xen_domain())
+               return -ENODEV;
+
+       hypervisor_kobj->ktype = &hyp_sysfs_kobj_type;
+       return 0;
+}
+device_initcall(hypervisor_subsys_init);
diff --git a/drivers/xen/xenbus/xenbus_probe.c b/drivers/xen/xenbus/xenbus_probe.c

index 773d1cf2328334b9c62cc52fdf90c7107df02778..d42e25d5968dcf48acd5448a7a6af56d21abd5fb 100644 (file)
--- a/drivers/xen/xenbus/xenbus_probe.c
+++ b/drivers/xen/xenbus/xenbus_probe.c
@@ -71,6 +71,9 @@ static int xenbus_probe_frontend(const char *type, const char *name);
  
  static void xenbus_dev_shutdown(struct device *_dev);
  
+static int xenbus_dev_suspend(struct device *dev, pm_message_t state);
+static int xenbus_dev_resume(struct device *dev);
+
  /* If something in array of ids matches this device, return it. */
  static const struct xenbus_device_id *
  match_device(const struct xenbus_device_id *arr, struct xenbus_device *dev)
@@ -188,6 +191,9 @@ static struct xen_bus_type xenbus_frontend = {
                 .remove    = xenbus_dev_remove,
                 .shutdown  = xenbus_dev_shutdown,
                 .dev_attrs = xenbus_dev_attrs,
+
+               .suspend   = xenbus_dev_suspend,
+               .resume    = xenbus_dev_resume,
         },
  };
  
@@ -654,6 +660,7 @@ void xenbus_dev_changed(const char *node, struct xen_bus_type *bus)
  
         kfree(root);
  }
+EXPORT_SYMBOL_GPL(xenbus_dev_changed);
  
  static void frontend_changed(struct xenbus_watch *watch,
                              const char **vec, unsigned int len)
@@ -669,7 +676,7 @@ static struct xenbus_watch fe_watch = {
         .callback = frontend_changed,
  };
  
-static int suspend_dev(struct device *dev, void *data)
+static int xenbus_dev_suspend(struct device *dev, pm_message_t state)
  {
         int err = 0;
         struct xenbus_driver *drv;
@@ -682,35 +689,14 @@ static int suspend_dev(struct device *dev, void *data)
         drv = to_xenbus_driver(dev->driver);
         xdev = container_of(dev, struct xenbus_device, dev);
         if (drv->suspend)
-               err = drv->suspend(xdev);
+               err = drv->suspend(xdev, state);
         if (err)
                 printk(KERN_WARNING
                        "xenbus: suspend %s failed: %i\n", dev_name(dev), err);
         return 0;
  }
  
-static int suspend_cancel_dev(struct device *dev, void *data)
-{
-       int err = 0;
-       struct xenbus_driver *drv;
-       struct xenbus_device *xdev;
-
-       DPRINTK("");
-
-       if (dev->driver == NULL)
-               return 0;
-       drv = to_xenbus_driver(dev->driver);
-       xdev = container_of(dev, struct xenbus_device, dev);
-       if (drv->suspend_cancel)
-               err = drv->suspend_cancel(xdev);
-       if (err)
-               printk(KERN_WARNING
-                      "xenbus: suspend_cancel %s failed: %i\n",
-                      dev_name(dev), err);
-       return 0;
-}
-
-static int resume_dev(struct device *dev, void *data)
+static int xenbus_dev_resume(struct device *dev)
  {
         int err;
         struct xenbus_driver *drv;
@@ -755,33 +741,6 @@ static int resume_dev(struct device *dev, void *data)
         return 0;
  }
  
-void xenbus_suspend(void)
-{
-       DPRINTK("");
-
-       bus_for_each_dev(&xenbus_frontend.bus, NULL, NULL, suspend_dev);
-       xenbus_backend_suspend(suspend_dev);
-       xs_suspend();
-}
-EXPORT_SYMBOL_GPL(xenbus_suspend);
-
-void xenbus_resume(void)
-{
-       xb_init_comms();
-       xs_resume();
-       bus_for_each_dev(&xenbus_frontend.bus, NULL, NULL, resume_dev);
-       xenbus_backend_resume(resume_dev);
-}
-EXPORT_SYMBOL_GPL(xenbus_resume);
-
-void xenbus_suspend_cancel(void)
-{
-       xs_suspend_cancel();
-       bus_for_each_dev(&xenbus_frontend.bus, NULL, NULL, suspend_cancel_dev);
-       xenbus_backend_resume(suspend_cancel_dev);
-}
-EXPORT_SYMBOL_GPL(xenbus_suspend_cancel);
-
  /* A flag to determine if xenstored is 'ready' (i.e. has started) */
  int xenstored_ready = 0;
  
diff --git a/drivers/xen/xenbus/xenbus_xs.c b/drivers/xen/xenbus/xenbus_xs.c

index e325eab4724d8cb0fab667b26f8c5b104c965c05..eab33f1dbdf7013f451af491b882973ea017f718 100644 (file)
--- a/drivers/xen/xenbus/xenbus_xs.c
+++ b/drivers/xen/xenbus/xenbus_xs.c
@@ -673,6 +673,8 @@ void xs_resume(void)
         struct xenbus_watch *watch;
         char token[sizeof(watch) * 2 + 1];
  
+       xb_init_comms();
+
         mutex_unlock(&xs_state.response_mutex);
         mutex_unlock(&xs_state.request_mutex);
         up_write(&xs_state.transaction_mutex);
diff --git a/drivers/xen/xenfs/super.c b/drivers/xen/xenfs/super.c

index 515741a8e6b8ecaae5bcd2d36af6538a2ee77345..6559e0c752ce1494baa127921834e388832cbd75 100644 (file)
--- a/drivers/xen/xenfs/super.c
+++ b/drivers/xen/xenfs/super.c
@@ -20,10 +20,27 @@
  MODULE_DESCRIPTION("Xen filesystem");
  MODULE_LICENSE("GPL");
  
+static ssize_t capabilities_read(struct file *file, char __user *buf,
+                                size_t size, loff_t *off)
+{
+       char *tmp = "";
+
+       if (xen_initial_domain())
+               tmp = "control_d\n";
+
+       return simple_read_from_buffer(buf, size, off, tmp, strlen(tmp));
+}
+
+static const struct file_operations capabilities_file_ops = {
+       .read = capabilities_read,
+};
+
  static int xenfs_fill_super(struct super_block *sb, void *data, int silent)
  {
         static struct tree_descr xenfs_files[] = {
-               [2] = {"xenbus", &xenbus_file_ops, S_IRUSR|S_IWUSR},
+               [1] = {},
+               { "xenbus", &xenbus_file_ops, S_IRUSR|S_IWUSR },
+               { "capabilities", &capabilities_file_ops, S_IRUGO },
                 {""},
         };
  
diff --git a/fs/bio.c b/fs/bio.c

index 98711647ece49548a94409a99ce4b680ee085ca1..740699c4f90c5cf6775873a2ee9b61a5fde4d0c0 100644 (file)
--- a/fs/bio.c
+++ b/fs/bio.c
@@ -26,10 +26,9 @@
  #include <linux/mempool.h>
  #include <linux/workqueue.h>
  #include <linux/blktrace_api.h>
-#include <trace/block.h>
  #include <scsi/sg.h>           /* for struct sg_iovec */
  
-DEFINE_TRACE(block_split);
+#include <trace/events/block.h>
  
  /*
   * Test patch to inline a certain number of bi_io_vec's inside the bio
diff --git a/fs/exec.c b/fs/exec.c

index 895823d0149d9cfbd75bb34b670662dafb7f9ef6..ad4f28c2327ad097e900cf071fba2ac10bf20b04 100644 (file)
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -33,6 +33,7 @@
  #include <linux/string.h>
  #include <linux/init.h>
  #include <linux/pagemap.h>
+#include <linux/perf_counter.h>
  #include <linux/highmem.h>
  #include <linux/spinlock.h>
  #include <linux/key.h>
@@ -922,6 +923,7 @@ void set_task_comm(struct task_struct *tsk, char *buf)
         task_lock(tsk);
         strlcpy(tsk->comm, buf, sizeof(tsk->comm));
         task_unlock(tsk);
+       perf_counter_comm(tsk);
  }
  
  int flush_old_exec(struct linux_binprm * bprm)
@@ -990,6 +992,13 @@ int flush_old_exec(struct linux_binprm * bprm)
  
         current->personality &= ~bprm->per_clear;
  
+       /*
+        * Flush performance counters when crossing a
+        * security domain:
+        */
+       if (!get_dumpable(current->mm))
+               perf_counter_exit_task(current);
+
         /* An exec changes our domain. We are no longer part of the thread
            group */
  
diff --git a/fs/proc/loadavg.c b/fs/proc/loadavg.c

index 9bca39cf99eeb56d6c32c4f0f9f4c49073683f19..1afa4dd4cae24167a0c4e3f47137a13cf8a6d199 100644 (file)
--- a/fs/proc/loadavg.c
+++ b/fs/proc/loadavg.c
@@ -12,20 +12,14 @@
  
  static int loadavg_proc_show(struct seq_file *m, void *v)
  {
-       int a, b, c;
-       unsigned long seq;
+       unsigned long avnrun[3];
  
-       do {
-               seq = read_seqbegin(&xtime_lock);
-               a = avenrun[0] + (FIXED_1/200);
-               b = avenrun[1] + (FIXED_1/200);
-               c = avenrun[2] + (FIXED_1/200);
-       } while (read_seqretry(&xtime_lock, seq));
+       get_avenrun(avnrun, FIXED_1/200, 0);
  
-       seq_printf(m, "%d.%02d %d.%02d %d.%02d %ld/%d %d\n",
-               LOAD_INT(a), LOAD_FRAC(a),
-               LOAD_INT(b), LOAD_FRAC(b),
-               LOAD_INT(c), LOAD_FRAC(c),
+       seq_printf(m, "%lu.%02lu %lu.%02lu %lu.%02lu %ld/%d %d\n",
+               LOAD_INT(avnrun[0]), LOAD_FRAC(avnrun[0]),
+               LOAD_INT(avnrun[1]), LOAD_FRAC(avnrun[1]),
+               LOAD_INT(avnrun[2]), LOAD_FRAC(avnrun[2]),
                 nr_running(), nr_threads,
                 task_active_pid_ns(current)->last_pid);
         return 0;
diff --git a/include/Kbuild b/include/Kbuild

index d8c3e3cbf41603eb983e5922da3ee796c7fece84..fe36accd43283b77f372adcc2c9f815a7910a233 100644 (file)
--- a/include/Kbuild
+++ b/include/Kbuild
@@ -8,3 +8,4 @@ header-y += mtd/
  header-y += rdma/
  header-y += video/
  header-y += drm/
+header-y += xen/
diff --git a/include/asm-generic/atomic.h b/include/asm-generic/atomic.h

index 3673a13b6703fd22fea48db3443fb0a92a5fea52..81d3be459efb55e931662ad492c072749538d5e8 100644 (file)
--- a/include/asm-generic/atomic.h
+++ b/include/asm-generic/atomic.h
@@ -134,7 +134,7 @@ static inline long atomic_long_add_unless(atomic_long_t *l, long a, long u)
  #define atomic_long_cmpxchg(l, old, new) \
         (atomic64_cmpxchg((atomic64_t *)(l), (old), (new)))
  #define atomic_long_xchg(v, new) \
-       (atomic64_xchg((atomic64_t *)(l), (new)))
+       (atomic64_xchg((atomic64_t *)(v), (new)))
  
  #else  /*  BITS_PER_LONG == 64  */
  
diff --git a/include/asm-generic/pgtable.h b/include/asm-generic/pgtable.h

index 8e6d0ca70aba987b4663db6e512d79236400f183..e410f602cab1b31fdd71681708d17049181026d5 100644 (file)
--- a/include/asm-generic/pgtable.h
+++ b/include/asm-generic/pgtable.h
@@ -280,17 +280,18 @@ static inline void ptep_modify_prot_commit(struct mm_struct *mm,
  #endif
  
  /*
- * A facility to provide batching of the reload of page tables with the
- * actual context switch code for paravirtualized guests.  By convention,
- * only one of the lazy modes (CPU, MMU) should be active at any given
- * time, entry should never be nested, and entry and exits should always
- * be paired.  This is for sanity of maintaining and reasoning about the
- * kernel code.
+ * A facility to provide batching of the reload of page tables and
+ * other process state with the actual context switch code for
+ * paravirtualized guests.  By convention, only one of the batched
+ * update (lazy) modes (CPU, MMU) should be active at any given time,
+ * entry should never be nested, and entry and exits should always be
+ * paired.  This is for sanity of maintaining and reasoning about the
+ * kernel code.  In this case, the exit (end of the context switch) is
+ * in architecture-specific code, and so doesn't need a generic
+ * definition.
   */
-#ifndef __HAVE_ARCH_ENTER_LAZY_CPU_MODE
-#define arch_enter_lazy_cpu_mode()     do {} while (0)
-#define arch_leave_lazy_cpu_mode()     do {} while (0)
-#define arch_flush_lazy_cpu_mode()     do {} while (0)
+#ifndef __HAVE_ARCH_START_CONTEXT_SWITCH
+#define arch_start_context_switch(prev)        do {} while (0)
  #endif
  
  #ifndef __HAVE_PFNMAP_TRACKING
diff --git a/include/asm-generic/vmlinux.lds.h b/include/asm-generic/vmlinux.lds.h

index 89853bcd27a658ac0aef934c61ba723057933457..f1736ca7922cb0022244087119f73a70e85f8b23 100644 (file)
--- a/include/asm-generic/vmlinux.lds.h
+++ b/include/asm-generic/vmlinux.lds.h
@@ -63,7 +63,7 @@
  #define BRANCH_PROFILE()
  #endif
  
-#ifdef CONFIG_EVENT_TRACER
+#ifdef CONFIG_EVENT_TRACING
  #define FTRACE_EVENTS()        VMLINUX_SYMBOL(__start_ftrace_events) = .;      \
                         *(_ftrace_events)                               \
                         VMLINUX_SYMBOL(__stop_ftrace_events) = .;
diff --git a/include/linux/acpi.h b/include/linux/acpi.h

index 88be890ee3c7e402cfd398e5113d2f78bb29338b..51b4b0a5ce8cf00b00fd849ee26e0e7bc504a795 100644 (file)
--- a/include/linux/acpi.h
+++ b/include/linux/acpi.h
@@ -119,7 +119,7 @@ extern int pci_mmcfg_config_num;
  extern int sbf_port;
  extern unsigned long acpi_realmode_flags;
  
-int acpi_register_gsi (u32 gsi, int triggering, int polarity);
+int acpi_register_gsi (struct device *dev, u32 gsi, int triggering, int polarity);
  int acpi_gsi_to_irq (u32 gsi, unsigned int *irq);
  
  #ifdef CONFIG_X86_IO_APIC
diff --git a/include/linux/blktrace_api.h b/include/linux/blktrace_api.h

index d960889e92efa17a73ee8038603a622be8dcf8a0..7e4350ece0f8dd495398ca4cbf41c95e995b6525 100644 (file)
--- a/include/linux/blktrace_api.h
+++ b/include/linux/blktrace_api.h
@@ -116,9 +116,9 @@ struct blk_io_trace {
   * The remap event
   */
  struct blk_io_trace_remap {
-       __be32 device;
         __be32 device_from;
-       __be64 sector;
+       __be32 device_to;
+       __be64 sector_from;
  };
  
  enum {
@@ -165,8 +165,9 @@ struct blk_trace {
  
  extern int blk_trace_ioctl(struct block_device *, unsigned, char __user *);
  extern void blk_trace_shutdown(struct request_queue *);
-extern int do_blk_trace_setup(struct request_queue *q,
-       char *name, dev_t dev, struct blk_user_trace_setup *buts);
+extern int do_blk_trace_setup(struct request_queue *q, char *name,
+                             dev_t dev, struct block_device *bdev,
+                             struct blk_user_trace_setup *buts);
  extern void __trace_note_message(struct blk_trace *, const char *fmt, ...);
  
  /**
@@ -193,22 +194,42 @@ extern void __trace_note_message(struct blk_trace *, const char *fmt, ...);
  extern void blk_add_driver_data(struct request_queue *q, struct request *rq,
                                 void *data, size_t len);
  extern int blk_trace_setup(struct request_queue *q, char *name, dev_t dev,
+                          struct block_device *bdev,
                            char __user *arg);
  extern int blk_trace_startstop(struct request_queue *q, int start);
  extern int blk_trace_remove(struct request_queue *q);
+extern int blk_trace_init_sysfs(struct device *dev);
  
  extern struct attribute_group blk_trace_attr_group;
  
  #else /* !CONFIG_BLK_DEV_IO_TRACE */
-#define blk_trace_ioctl(bdev, cmd, arg)                (-ENOTTY)
-#define blk_trace_shutdown(q)                  do { } while (0)
-#define do_blk_trace_setup(q, name, dev, buts) (-ENOTTY)
-#define blk_add_driver_data(q, rq, data, len)  do {} while (0)
-#define blk_trace_setup(q, name, dev, arg)     (-ENOTTY)
-#define blk_trace_startstop(q, start)          (-ENOTTY)
-#define blk_trace_remove(q)                    (-ENOTTY)
-#define blk_add_trace_msg(q, fmt, ...)         do { } while (0)
+# define blk_trace_ioctl(bdev, cmd, arg)               (-ENOTTY)
+# define blk_trace_shutdown(q)                         do { } while (0)
+# define do_blk_trace_setup(q, name, dev, bdev, buts)  (-ENOTTY)
+# define blk_add_driver_data(q, rq, data, len)         do {} while (0)
+# define blk_trace_setup(q, name, dev, bdev, arg)      (-ENOTTY)
+# define blk_trace_startstop(q, start)                 (-ENOTTY)
+# define blk_trace_remove(q)                           (-ENOTTY)
+# define blk_add_trace_msg(q, fmt, ...)                        do { } while (0)
+static inline int blk_trace_init_sysfs(struct device *dev)
+{
+       return 0;
+}
  
  #endif /* CONFIG_BLK_DEV_IO_TRACE */
+
+#if defined(CONFIG_EVENT_TRACING) && defined(CONFIG_BLOCK)
+
+static inline int blk_cmd_buf_len(struct request *rq)
+{
+       return blk_pc_request(rq) ? rq->cmd_len * 3 : 1;
+}
+
+extern void blk_dump_cmd(char *buf, struct request *rq);
+extern void blk_fill_rwbs(char *rwbs, u32 rw, int bytes);
+extern void blk_fill_rwbs_rq(char *rwbs, struct request *rq);
+
+#endif /* CONFIG_EVENT_TRACING && CONFIG_BLOCK */
+
  #endif /* __KERNEL__ */
  #endif
diff --git a/include/linux/compat.h b/include/linux/compat.h

index f2ded21f9a3c37cdef070315659c57d5a066f4f9..af931ee43dd8e43454b9cd9d389ec0bd5b1a7b21 100644 (file)
--- a/include/linux/compat.h
+++ b/include/linux/compat.h
@@ -222,6 +222,8 @@ int copy_siginfo_from_user32(siginfo_t *to, struct compat_siginfo __user *from);
  int copy_siginfo_to_user32(struct compat_siginfo __user *to, siginfo_t *from);
  int get_compat_sigevent(struct sigevent *event,
                 const struct compat_sigevent __user *u_event);
+long compat_sys_rt_tgsigqueueinfo(compat_pid_t tgid, compat_pid_t pid, int sig,
+                                 struct compat_siginfo __user *uinfo);
  
  static inline int compat_timeval_compare(struct compat_timeval *lhs,
                                         struct compat_timeval *rhs)
diff --git a/include/linux/dma-debug.h b/include/linux/dma-debug.h

index 28d53cb7b5a22c099feb9a2aa19b2e3010e8d124..171ad8aedc835258e152b94b2bc92242045d9c3e 100644 (file)
--- a/include/linux/dma-debug.h
+++ b/include/linux/dma-debug.h
@@ -32,6 +32,8 @@ extern void dma_debug_add_bus(struct bus_type *bus);
  
  extern void dma_debug_init(u32 num_entries);
  
+extern int dma_debug_resize_entries(u32 num_entries);
+
  extern void debug_dma_map_page(struct device *dev, struct page *page,
                                size_t offset, size_t size,
                                int direction, dma_addr_t dma_addr,
@@ -91,6 +93,11 @@ static inline void dma_debug_init(u32 num_entries)
  {
  }
  
+static inline int dma_debug_resize_entries(u32 num_entries)
+{
+       return 0;
+}
+
  static inline void debug_dma_map_page(struct device *dev, struct page *page,
                                       size_t offset, size_t size,
                                       int direction, dma_addr_t dma_addr,
diff --git a/include/linux/dmar.h b/include/linux/dmar.h

index e397dc342cdaf1eaf99b022b4523f858d9f3e886..10ff5c498824b5b8dc5e963b07563878a13c2667 100644 (file)
--- a/include/linux/dmar.h
+++ b/include/linux/dmar.h
@@ -108,6 +108,7 @@ struct irte {
  };
  #ifdef CONFIG_INTR_REMAP
  extern int intr_remapping_enabled;
+extern int intr_remapping_supported(void);
  extern int enable_intr_remapping(int);
  extern void disable_intr_remapping(void);
  extern int reenable_intr_remapping(int);
@@ -157,6 +158,8 @@ static inline struct intel_iommu *map_ioapic_to_ir(int apic)
  }
  #define irq_remapped(irq)              (0)
  #define enable_intr_remapping(mode)    (-1)
+#define disable_intr_remapping()       (0)
+#define reenable_intr_remapping(mode)  (0)
  #define intr_remapping_enabled         (0)
  #endif
  
diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h

index 8a0c2f221e6b95b448991b1e291610c9e52ea91e..39b95c56587e8f1f8852aed9318148fcc16aac56 100644 (file)
--- a/include/linux/ftrace.h
+++ b/include/linux/ftrace.h
@@ -233,8 +233,6 @@ extern int ftrace_arch_read_dyn_info(char *buf, int size);
  
  extern int skip_trace(unsigned long ip);
  
-extern void ftrace_release(void *start, unsigned long size);
-
  extern void ftrace_disable_daemon(void);
  extern void ftrace_enable_daemon(void);
  #else
@@ -325,13 +323,8 @@ static inline void __ftrace_enabled_restore(int enabled)
  
  #ifdef CONFIG_FTRACE_MCOUNT_RECORD
  extern void ftrace_init(void);
-extern void ftrace_init_module(struct module *mod,
-                              unsigned long *start, unsigned long *end);
  #else
  static inline void ftrace_init(void) { }
-static inline void
-ftrace_init_module(struct module *mod,
-                  unsigned long *start, unsigned long *end) { }
  #endif
  
  /*
@@ -368,6 +361,7 @@ struct ftrace_ret_stack {
         unsigned long ret;
         unsigned long func;
         unsigned long long calltime;
+       unsigned long long subtime;
  };
  
  /*
@@ -379,8 +373,6 @@ extern void return_to_handler(void);
  
  extern int
  ftrace_push_return_trace(unsigned long ret, unsigned long func, int *depth);
-extern void
-ftrace_pop_return_trace(struct ftrace_graph_ret *trace, unsigned long *ret);
  
  /*
   * Sometimes we don't want to trace a function with the function
@@ -496,8 +488,15 @@ static inline int test_tsk_trace_graph(struct task_struct *tsk)
  
  extern int ftrace_dump_on_oops;
  
+#ifdef CONFIG_PREEMPT
+#define INIT_TRACE_RECURSION           .trace_recursion = 0,
+#endif
+
  #endif /* CONFIG_TRACING */
  
+#ifndef INIT_TRACE_RECURSION
+#define INIT_TRACE_RECURSION
+#endif
  
  #ifdef CONFIG_HW_BRANCH_TRACER
  
diff --git a/include/linux/ftrace_event.h b/include/linux/ftrace_event.h

new file mode 100644 (file)

index 0000000..5c093ff
--- /dev/null
+++ b/include/linux/ftrace_event.h
@@ -0,0 +1,172 @@
+#ifndef _LINUX_FTRACE_EVENT_H
+#define _LINUX_FTRACE_EVENT_H
+
+#include <linux/trace_seq.h>
+#include <linux/ring_buffer.h>
+#include <linux/percpu.h>
+
+struct trace_array;
+struct tracer;
+struct dentry;
+
+DECLARE_PER_CPU(struct trace_seq, ftrace_event_seq);
+
+struct trace_print_flags {
+       unsigned long           mask;
+       const char              *name;
+};
+
+const char *ftrace_print_flags_seq(struct trace_seq *p, const char *delim,
+                                  unsigned long flags,
+                                  const struct trace_print_flags *flag_array);
+
+const char *ftrace_print_symbols_seq(struct trace_seq *p, unsigned long val,
+                                    const struct trace_print_flags *symbol_array);
+
+/*
+ * The trace entry - the most basic unit of tracing. This is what
+ * is printed in the end as a single line in the trace output, such as:
+ *
+ *     bash-15816 [01]   235.197585: idle_cpu <- irq_enter
+ */
+struct trace_entry {
+       unsigned short          type;
+       unsigned char           flags;
+       unsigned char           preempt_count;
+       int                     pid;
+       int                     tgid;
+};
+
+#define FTRACE_MAX_EVENT                                               \
+       ((1 << (sizeof(((struct trace_entry *)0)->type) * 8)) - 1)
+
+/*
+ * Trace iterator - used by printout routines who present trace
+ * results to users and which routines might sleep, etc:
+ */
+struct trace_iterator {
+       struct trace_array      *tr;
+       struct tracer           *trace;
+       void                    *private;
+       int                     cpu_file;
+       struct mutex            mutex;
+       struct ring_buffer_iter *buffer_iter[NR_CPUS];
+       unsigned long           iter_flags;
+
+       /* The below is zeroed out in pipe_read */
+       struct trace_seq        seq;
+       struct trace_entry      *ent;
+       int                     cpu;
+       u64                     ts;
+
+       loff_t                  pos;
+       long                    idx;
+
+       cpumask_var_t           started;
+};
+
+
+typedef enum print_line_t (*trace_print_func)(struct trace_iterator *iter,
+                                             int flags);
+struct trace_event {
+       struct hlist_node       node;
+       struct list_head        list;
+       int                     type;
+       trace_print_func        trace;
+       trace_print_func        raw;
+       trace_print_func        hex;
+       trace_print_func        binary;
+};
+
+extern int register_ftrace_event(struct trace_event *event);
+extern int unregister_ftrace_event(struct trace_event *event);
+
+/* Return values for print_line callback */
+enum print_line_t {
+       TRACE_TYPE_PARTIAL_LINE = 0,    /* Retry after flushing the seq */
+       TRACE_TYPE_HANDLED      = 1,
+       TRACE_TYPE_UNHANDLED    = 2,    /* Relay to other output functions */
+       TRACE_TYPE_NO_CONSUME   = 3     /* Handled but ask to not consume */
+};
+
+
+struct ring_buffer_event *
+trace_current_buffer_lock_reserve(int type, unsigned long len,
+                                 unsigned long flags, int pc);
+void trace_current_buffer_unlock_commit(struct ring_buffer_event *event,
+                                       unsigned long flags, int pc);
+void trace_nowake_buffer_unlock_commit(struct ring_buffer_event *event,
+                                       unsigned long flags, int pc);
+void trace_current_buffer_discard_commit(struct ring_buffer_event *event);
+
+void tracing_record_cmdline(struct task_struct *tsk);
+
+struct ftrace_event_call {
+       struct list_head        list;
+       char                    *name;
+       char                    *system;
+       struct dentry           *dir;
+       struct trace_event      *event;
+       int                     enabled;
+       int                     (*regfunc)(void);
+       void                    (*unregfunc)(void);
+       int                     id;
+       int                     (*raw_init)(void);
+       int                     (*show_format)(struct trace_seq *s);
+       int                     (*define_fields)(void);
+       struct list_head        fields;
+       int                     filter_active;
+       void                    *filter;
+       void                    *mod;
+
+#ifdef CONFIG_EVENT_PROFILE
+       atomic_t        profile_count;
+       int             (*profile_enable)(struct ftrace_event_call *);
+       void            (*profile_disable)(struct ftrace_event_call *);
+#endif
+};
+
+#define MAX_FILTER_PRED                32
+#define MAX_FILTER_STR_VAL     128
+
+extern int init_preds(struct ftrace_event_call *call);
+extern void destroy_preds(struct ftrace_event_call *call);
+extern int filter_match_preds(struct ftrace_event_call *call, void *rec);
+extern int filter_current_check_discard(struct ftrace_event_call *call,
+                                       void *rec,
+                                       struct ring_buffer_event *event);
+
+extern int trace_define_field(struct ftrace_event_call *call, char *type,
+                             char *name, int offset, int size, int is_signed);
+
+#define is_signed_type(type)   (((type)(-1)) < 0)
+
+int trace_set_clr_event(const char *system, const char *event, int set);
+
+/*
+ * The double __builtin_constant_p is because gcc will give us an error
+ * if we try to allocate the static variable to fmt if it is not a
+ * constant. Even with the outer if statement optimizing out.
+ */
+#define event_trace_printk(ip, fmt, args...)                           \
+do {                                                                   \
+       __trace_printk_check_format(fmt, ##args);                       \
+       tracing_record_cmdline(current);                                \
+       if (__builtin_constant_p(fmt)) {                                \
+               static const char *trace_printk_fmt                     \
+                 __attribute__((section("__trace_printk_fmt"))) =      \
+                       __builtin_constant_p(fmt) ? fmt : NULL;         \
+                                                                       \
+               __trace_bprintk(ip, trace_printk_fmt, ##args);          \
+       } else                                                          \
+               __trace_printk(ip, fmt, ##args);                        \
+} while (0)
+
+#define __common_field(type, item, is_signed)                          \
+       ret = trace_define_field(event_call, #type, "common_" #item,    \
+                                offsetof(typeof(field.ent), item),     \
+                                sizeof(field.ent.item), is_signed);    \
+       if (ret)                                                        \
+               return ret;
+
+#endif /* _LINUX_FTRACE_EVENT_H */
diff --git a/include/linux/futex.h b/include/linux/futex.h

index 3bf5bb5a34f9fba43b9248caf434eb6059e6329e..34956c8fdebf8df63ab44c1f84b136406b3d0f34 100644 (file)
--- a/include/linux/futex.h
+++ b/include/linux/futex.h
@@ -23,6 +23,8 @@ union ktime;
  #define FUTEX_TRYLOCK_PI       8
  #define FUTEX_WAIT_BITSET      9
  #define FUTEX_WAKE_BITSET      10
+#define FUTEX_WAIT_REQUEUE_PI  11
+#define FUTEX_CMP_REQUEUE_PI   12
  
  #define FUTEX_PRIVATE_FLAG     128
  #define FUTEX_CLOCK_REALTIME   256
@@ -38,6 +40,10 @@ union ktime;
  #define FUTEX_TRYLOCK_PI_PRIVATE (FUTEX_TRYLOCK_PI | FUTEX_PRIVATE_FLAG)
  #define FUTEX_WAIT_BITSET_PRIVATE      (FUTEX_WAIT_BITS | FUTEX_PRIVATE_FLAG)
  #define FUTEX_WAKE_BITSET_PRIVATE      (FUTEX_WAKE_BITS | FUTEX_PRIVATE_FLAG)
+#define FUTEX_WAIT_REQUEUE_PI_PRIVATE  (FUTEX_WAIT_REQUEUE_PI | \
+                                        FUTEX_PRIVATE_FLAG)
+#define FUTEX_CMP_REQUEUE_PI_PRIVATE   (FUTEX_CMP_REQUEUE_PI | \
+                                        FUTEX_PRIVATE_FLAG)
  
  /*
   * Support for robust futexes: the kernel cleans up held futexes at
diff --git a/include/linux/init_task.h b/include/linux/init_task.h

index d87247d2641f5cbf4eb69a90fea392b95684e588..b6b7cf23c2a0eab30d378b551fa0a95cd9b0841b 100644 (file)
--- a/include/linux/init_task.h
+++ b/include/linux/init_task.h
@@ -108,6 +108,15 @@ extern struct group_info init_groups;
  
  extern struct cred init_cred;
  
+#ifdef CONFIG_PERF_COUNTERS
+# define INIT_PERF_COUNTERS(tsk)                                       \
+       .perf_counter_mutex =                                           \
+                __MUTEX_INITIALIZER(tsk.perf_counter_mutex),           \
+       .perf_counter_list = LIST_HEAD_INIT(tsk.perf_counter_list),
+#else
+# define INIT_PERF_COUNTERS(tsk)
+#endif
+
  /*
   *  INIT_TASK is used to set up the first task table, touch at
   * your own risk!. Base=0, limit=0x1fffff (=2MB)
@@ -171,9 +180,11 @@ extern struct cred init_cred;
         },                                                              \
         .dirties = INIT_PROP_LOCAL_SINGLE(dirties),                     \
         INIT_IDS                                                        \
+       INIT_PERF_COUNTERS(tsk)                                         \
         INIT_TRACE_IRQFLAGS                                             \
         INIT_LOCKDEP                                                    \
         INIT_FTRACE_GRAPH                                               \
+       INIT_TRACE_RECURSION                                            \
  }
  
  
diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h

index 91bb76f44f14e6488e909669ba3e3e94fba1104a..ff374ceface08d55c889c5698cdf6bcfda753655 100644 (file)
--- a/include/linux/interrupt.h
+++ b/include/linux/interrupt.h
@@ -566,6 +566,6 @@ struct irq_desc;
  extern int early_irq_init(void);
  extern int arch_probe_nr_irqs(void);
  extern int arch_early_irq_init(void);
-extern int arch_init_chip_data(struct irq_desc *desc, int cpu);
+extern int arch_init_chip_data(struct irq_desc *desc, int node);
  
  #endif
diff --git a/include/linux/irq.h b/include/linux/irq.h

index b7cbeed972e425b694f1666343f6fc0558ac439e..eedbb8e5e0ccff99c79594b6485fd9259049d088 100644 (file)
--- a/include/linux/irq.h
+++ b/include/linux/irq.h
@@ -117,7 +117,7 @@ struct irq_chip {
         void            (*eoi)(unsigned int irq);
  
         void            (*end)(unsigned int irq);
-       void            (*set_affinity)(unsigned int irq,
+       int             (*set_affinity)(unsigned int irq,
                                         const struct cpumask *dest);
         int             (*retrigger)(unsigned int irq);
         int             (*set_type)(unsigned int irq, unsigned int flow_type);
@@ -187,7 +187,7 @@ struct irq_desc {
         spinlock_t              lock;
  #ifdef CONFIG_SMP
         cpumask_var_t           affinity;
-       unsigned int            cpu;
+       unsigned int            node;
  #ifdef CONFIG_GENERIC_PENDING_IRQ
         cpumask_var_t           pending_mask;
  #endif
@@ -201,26 +201,23 @@ struct irq_desc {
  } ____cacheline_internodealigned_in_smp;
  
  extern void arch_init_copy_chip_data(struct irq_desc *old_desc,
-                                       struct irq_desc *desc, int cpu);
+                                       struct irq_desc *desc, int node);
  extern void arch_free_chip_data(struct irq_desc *old_desc, struct irq_desc *desc);
  
  #ifndef CONFIG_SPARSE_IRQ
  extern struct irq_desc irq_desc[NR_IRQS];
-#else /* CONFIG_SPARSE_IRQ */
-extern struct irq_desc *move_irq_desc(struct irq_desc *old_desc, int cpu);
-#endif /* CONFIG_SPARSE_IRQ */
-
-extern struct irq_desc *irq_to_desc_alloc_cpu(unsigned int irq, int cpu);
+#endif
  
-static inline struct irq_desc *
-irq_remap_to_desc(unsigned int irq, struct irq_desc *desc)
-{
-#ifdef CONFIG_NUMA_MIGRATE_IRQ_DESC
-       return irq_to_desc(irq);
+#ifdef CONFIG_NUMA_IRQ_DESC
+extern struct irq_desc *move_irq_desc(struct irq_desc *old_desc, int node);
  #else
+static inline struct irq_desc *move_irq_desc(struct irq_desc *desc, int node)
+{
         return desc;
-#endif
  }
+#endif
+
+extern struct irq_desc *irq_to_desc_alloc_node(unsigned int irq, int node);
  
  /*
   * Migration helpers for obsolete names, they will go away:
@@ -386,7 +383,7 @@ extern void set_irq_noprobe(unsigned int irq);
  extern void set_irq_probe(unsigned int irq);
  
  /* Handle dynamic irq creation and destruction */
-extern unsigned int create_irq_nr(unsigned int irq_want);
+extern unsigned int create_irq_nr(unsigned int irq_want, int node);
  extern int create_irq(void);
  extern void destroy_irq(unsigned int irq);
  
@@ -424,47 +421,48 @@ extern int set_irq_msi(unsigned int irq, struct msi_desc *entry);
  
  #ifdef CONFIG_SMP
  /**
- * init_alloc_desc_masks - allocate cpumasks for irq_desc
+ * alloc_desc_masks - allocate cpumasks for irq_desc
   * @desc:      pointer to irq_desc struct
   * @cpu:       cpu which will be handling the cpumasks
   * @boot:      true if need bootmem
   *
   * Allocates affinity and pending_mask cpumask if required.
   * Returns true if successful (or not required).
- * Side effect: affinity has all bits set, pending_mask has all bits clear.
   */
-static inline bool init_alloc_desc_masks(struct irq_desc *desc, int cpu,
+static inline bool alloc_desc_masks(struct irq_desc *desc, int node,
                                                                 bool boot)
  {
-       int node;
-
+#ifdef CONFIG_CPUMASK_OFFSTACK
         if (boot) {
                 alloc_bootmem_cpumask_var(&desc->affinity);
-               cpumask_setall(desc->affinity);
  
  #ifdef CONFIG_GENERIC_PENDING_IRQ
                 alloc_bootmem_cpumask_var(&desc->pending_mask);
-               cpumask_clear(desc->pending_mask);
  #endif
                 return true;
         }
  
-       node = cpu_to_node(cpu);
-
         if (!alloc_cpumask_var_node(&desc->affinity, GFP_ATOMIC, node))
                 return false;
-       cpumask_setall(desc->affinity);
  
  #ifdef CONFIG_GENERIC_PENDING_IRQ
         if (!alloc_cpumask_var_node(&desc->pending_mask, GFP_ATOMIC, node)) {
                 free_cpumask_var(desc->affinity);
                 return false;
         }
-       cpumask_clear(desc->pending_mask);
+#endif
  #endif
         return true;
  }
  
+static inline void init_desc_masks(struct irq_desc *desc)
+{
+       cpumask_setall(desc->affinity);
+#ifdef CONFIG_GENERIC_PENDING_IRQ
+       cpumask_clear(desc->pending_mask);
+#endif
+}
+
  /**
   * init_copy_desc_masks - copy cpumasks for irq_desc
   * @old_desc:  pointer to old irq_desc struct
@@ -478,7 +476,7 @@ static inline bool init_alloc_desc_masks(struct irq_desc *desc, int cpu,
  static inline void init_copy_desc_masks(struct irq_desc *old_desc,
                                         struct irq_desc *new_desc)
  {
-#ifdef CONFIG_CPUMASKS_OFFSTACK
+#ifdef CONFIG_CPUMASK_OFFSTACK
         cpumask_copy(new_desc->affinity, old_desc->affinity);
  
  #ifdef CONFIG_GENERIC_PENDING_IRQ
@@ -499,12 +497,16 @@ static inline void free_desc_masks(struct irq_desc *old_desc,
  
  #else /* !CONFIG_SMP */
  
-static inline bool init_alloc_desc_masks(struct irq_desc *desc, int cpu,
+static inline bool alloc_desc_masks(struct irq_desc *desc, int node,
                                                                 bool boot)
  {
         return true;
  }
  
+static inline void init_desc_masks(struct irq_desc *desc)
+{
+}
+
  static inline void init_copy_desc_masks(struct irq_desc *old_desc,
                                         struct irq_desc *new_desc)
  {
diff --git a/include/linux/kernel_stat.h b/include/linux/kernel_stat.h

index 0c8b89f28a95eeccc1956c132777621bacf4d4c5..a77c6007dc99a92a636b927c54816ca8d1e8b954 100644 (file)
--- a/include/linux/kernel_stat.h
+++ b/include/linux/kernel_stat.h
@@ -81,7 +81,12 @@ static inline unsigned int kstat_irqs(unsigned int irq)
         return sum;
  }
  
+
+/*
+ * Lock/unlock the current runqueue - to extract task statistics:
+ */
  extern unsigned long long task_delta_exec(struct task_struct *);
+
  extern void account_user_time(struct task_struct *, cputime_t, cputime_t);
  extern void account_system_time(struct task_struct *, int, cputime_t, cputime_t);
  extern void account_steal_time(cputime_t);
diff --git a/include/linux/kmemtrace.h b/include/linux/kmemtrace.h

new file mode 100644 (file)

index 0000000..b616d39
--- /dev/null
+++ b/include/linux/kmemtrace.h
@@ -0,0 +1,25 @@
+/*
+ * Copyright (C) 2008 Eduard - Gabriel Munteanu
+ *
+ * This file is released under GPL version 2.
+ */
+
+#ifndef _LINUX_KMEMTRACE_H
+#define _LINUX_KMEMTRACE_H
+
+#ifdef __KERNEL__
+
+#include <trace/events/kmem.h>
+
+#ifdef CONFIG_KMEMTRACE
+extern void kmemtrace_init(void);
+#else
+static inline void kmemtrace_init(void)
+{
+}
+#endif
+
+#endif /* __KERNEL__ */
+
+#endif /* _LINUX_KMEMTRACE_H */
+
diff --git a/include/linux/mm.h b/include/linux/mm.h

index bff1f0d475c7593240d5a464ae8ecc7c03e07299..9772d6cbfc82287eddf35c9c49aba4932c774ac1 100644 (file)
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -19,6 +19,7 @@ struct anon_vma;
  struct file_ra_state;
  struct user_struct;
  struct writeback_control;
+struct rlimit;
  
  #ifndef CONFIG_DISCONTIGMEM          /* Don't use mapnrs, do it properly */
  extern unsigned long max_mapnr;
@@ -1031,8 +1032,6 @@ extern void add_active_range(unsigned int nid, unsigned long start_pfn,
                                         unsigned long end_pfn);
  extern void remove_active_range(unsigned int nid, unsigned long start_pfn,
                                         unsigned long end_pfn);
-extern void push_node_boundaries(unsigned int nid, unsigned long start_pfn,
-                                       unsigned long end_pfn);
  extern void remove_all_active_ranges(void);
  extern unsigned long absent_pages_in_range(unsigned long start_pfn,
                                                 unsigned long end_pfn);
@@ -1319,8 +1318,8 @@ int vmemmap_populate_basepages(struct page *start_page,
  int vmemmap_populate(struct page *start_page, unsigned long pages, int node);
  void vmemmap_populate_print_last(void);
  
-extern void *alloc_locked_buffer(size_t size);
-extern void free_locked_buffer(void *buffer, size_t size);
-extern void release_locked_buffer(void *buffer, size_t size);
+extern int account_locked_memory(struct mm_struct *mm, struct rlimit *rlim,
+                                size_t size);
+extern void refund_locked_memory(struct mm_struct *mm, size_t size);
  #endif /* __KERNEL__ */
  #endif /* _LINUX_MM_H */
diff --git a/include/linux/mmiotrace.h b/include/linux/mmiotrace.h

index 3d1b7bde128361d0905bfbb6b28375c91d6465c3..97491f78b08cd72138ed25750145275ca1dabb90 100644 (file)
--- a/include/linux/mmiotrace.h
+++ b/include/linux/mmiotrace.h
@@ -30,6 +30,8 @@ extern unsigned int kmmio_count;
  
  extern int register_kmmio_probe(struct kmmio_probe *p);
  extern void unregister_kmmio_probe(struct kmmio_probe *p);
+extern int kmmio_init(void);
+extern void kmmio_cleanup(void);
  
  #ifdef CONFIG_MMIOTRACE
  /* kmmio is active by some kmmio_probes? */
diff --git a/include/linux/module.h b/include/linux/module.h

index 627ac082e2a64ad6734a665b270a3d11705e86d2..a8f2c0aa4c328af87718285f715bc72b19d7f55f 100644 (file)
--- a/include/linux/module.h
+++ b/include/linux/module.h
@@ -337,6 +337,14 @@ struct module
         const char **trace_bprintk_fmt_start;
         unsigned int num_trace_bprintk_fmt;
  #endif
+#ifdef CONFIG_EVENT_TRACING
+       struct ftrace_event_call *trace_events;
+       unsigned int num_trace_events;
+#endif
+#ifdef CONFIG_FTRACE_MCOUNT_RECORD
+       unsigned long *ftrace_callsites;
+       unsigned int num_ftrace_callsites;
+#endif
  
  #ifdef CONFIG_MODULE_UNLOAD
         /* What modules depend on me? */
diff --git a/include/linux/mutex.h b/include/linux/mutex.h

index 3069ec7e0ab84ca54a282c72d73385e44e69d63a..878cab4f5fcc5db95585184c22aea5d905a34295 100644 (file)
--- a/include/linux/mutex.h
+++ b/include/linux/mutex.h
@@ -150,5 +150,6 @@ extern int __must_check mutex_lock_killable(struct mutex *lock);
   */
  extern int mutex_trylock(struct mutex *lock);
  extern void mutex_unlock(struct mutex *lock);
+extern int atomic_dec_and_mutex_lock(atomic_t *cnt, struct mutex *lock);
  
  #endif
diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h

new file mode 100644 (file)

index 0000000..6e13395
--- /dev/null
+++ b/include/linux/perf_counter.h
@@ -0,0 +1,697 @@
+/*
+ *  Performance counters:
+ *
+ *    Copyright (C) 2008-2009, Thomas Gleixner <tglx@linutronix.de>
+ *    Copyright (C) 2008-2009, Red Hat, Inc., Ingo Molnar
+ *    Copyright (C) 2008-2009, Red Hat, Inc., Peter Zijlstra
+ *
+ *  Data type definitions, declarations, prototypes.
+ *
+ *    Started by: Thomas Gleixner and Ingo Molnar
+ *
+ *  For licencing details see kernel-base/COPYING
+ */
+#ifndef _LINUX_PERF_COUNTER_H
+#define _LINUX_PERF_COUNTER_H
+
+#include <linux/types.h>
+#include <linux/ioctl.h>
+#include <asm/byteorder.h>
+
+/*
+ * User-space ABI bits:
+ */
+
+/*
+ * attr.type
+ */
+enum perf_type_id {
+       PERF_TYPE_HARDWARE                      = 0,
+       PERF_TYPE_SOFTWARE                      = 1,
+       PERF_TYPE_TRACEPOINT                    = 2,
+       PERF_TYPE_HW_CACHE                      = 3,
+       PERF_TYPE_RAW                           = 4,
+
+       PERF_TYPE_MAX,                          /* non-ABI */
+};
+
+/*
+ * Generalized performance counter event types, used by the
+ * attr.event_id parameter of the sys_perf_counter_open()
+ * syscall:
+ */
+enum perf_hw_id {
+       /*
+        * Common hardware events, generalized by the kernel:
+        */
+       PERF_COUNT_HW_CPU_CYCLES                = 0,
+       PERF_COUNT_HW_INSTRUCTIONS              = 1,
+       PERF_COUNT_HW_CACHE_REFERENCES          = 2,
+       PERF_COUNT_HW_CACHE_MISSES              = 3,
+       PERF_COUNT_HW_BRANCH_INSTRUCTIONS       = 4,
+       PERF_COUNT_HW_BRANCH_MISSES             = 5,
+       PERF_COUNT_HW_BUS_CYCLES                = 6,
+
+       PERF_COUNT_HW_MAX,                      /* non-ABI */
+};
+
+/*
+ * Generalized hardware cache counters:
+ *
+ *       { L1-D, L1-I, LLC, ITLB, DTLB, BPU } x
+ *       { read, write, prefetch } x
+ *       { accesses, misses }
+ */
+enum perf_hw_cache_id {
+       PERF_COUNT_HW_CACHE_L1D                 = 0,
+       PERF_COUNT_HW_CACHE_L1I                 = 1,
+       PERF_COUNT_HW_CACHE_LL                  = 2,
+       PERF_COUNT_HW_CACHE_DTLB                = 3,
+       PERF_COUNT_HW_CACHE_ITLB                = 4,
+       PERF_COUNT_HW_CACHE_BPU                 = 5,
+
+       PERF_COUNT_HW_CACHE_MAX,                /* non-ABI */
+};
+
+enum perf_hw_cache_op_id {
+       PERF_COUNT_HW_CACHE_OP_READ             = 0,
+       PERF_COUNT_HW_CACHE_OP_WRITE            = 1,
+       PERF_COUNT_HW_CACHE_OP_PREFETCH         = 2,
+
+       PERF_COUNT_HW_CACHE_OP_MAX,             /* non-ABI */
+};
+
+enum perf_hw_cache_op_result_id {
+       PERF_COUNT_HW_CACHE_RESULT_ACCESS       = 0,
+       PERF_COUNT_HW_CACHE_RESULT_MISS         = 1,
+
+       PERF_COUNT_HW_CACHE_RESULT_MAX,         /* non-ABI */
+};
+
+/*
+ * Special "software" counters provided by the kernel, even if the hardware
+ * does not support performance counters. These counters measure various
+ * physical and sw events of the kernel (and allow the profiling of them as
+ * well):
+ */
+enum perf_sw_ids {
+       PERF_COUNT_SW_CPU_CLOCK                 = 0,
+       PERF_COUNT_SW_TASK_CLOCK                = 1,
+       PERF_COUNT_SW_PAGE_FAULTS               = 2,
+       PERF_COUNT_SW_CONTEXT_SWITCHES          = 3,
+       PERF_COUNT_SW_CPU_MIGRATIONS            = 4,
+       PERF_COUNT_SW_PAGE_FAULTS_MIN           = 5,
+       PERF_COUNT_SW_PAGE_FAULTS_MAJ           = 6,
+
+       PERF_COUNT_SW_MAX,                      /* non-ABI */
+};
+
+/*
+ * Bits that can be set in attr.sample_type to request information
+ * in the overflow packets.
+ */
+enum perf_counter_sample_format {
+       PERF_SAMPLE_IP                          = 1U << 0,
+       PERF_SAMPLE_TID                         = 1U << 1,
+       PERF_SAMPLE_TIME                        = 1U << 2,
+       PERF_SAMPLE_ADDR                        = 1U << 3,
+       PERF_SAMPLE_GROUP                       = 1U << 4,
+       PERF_SAMPLE_CALLCHAIN                   = 1U << 5,
+       PERF_SAMPLE_ID                          = 1U << 6,
+       PERF_SAMPLE_CPU                         = 1U << 7,
+       PERF_SAMPLE_PERIOD                      = 1U << 8,
+};
+
+/*
+ * Bits that can be set in attr.read_format to request that
+ * reads on the counter should return the indicated quantities,
+ * in increasing order of bit value, after the counter value.
+ */
+enum perf_counter_read_format {
+       PERF_FORMAT_TOTAL_TIME_ENABLED          = 1U << 0,
+       PERF_FORMAT_TOTAL_TIME_RUNNING          = 1U << 1,
+       PERF_FORMAT_ID                          = 1U << 2,
+};
+
+/*
+ * Hardware event to monitor via a performance monitoring counter:
+ */
+struct perf_counter_attr {
+       /*
+        * Major type: hardware/software/tracepoint/etc.
+        */
+       __u32                   type;
+       __u32                   __reserved_1;
+
+       /*
+        * Type specific configuration information.
+        */
+       __u64                   config;
+
+       union {
+               __u64           sample_period;
+               __u64           sample_freq;
+       };
+
+       __u64                   sample_type;
+       __u64                   read_format;
+
+       __u64                   disabled       :  1, /* off by default        */
+                               inherit        :  1, /* children inherit it   */
+                               pinned         :  1, /* must always be on PMU */
+                               exclusive      :  1, /* only group on PMU     */
+                               exclude_user   :  1, /* don't count user      */
+                               exclude_kernel :  1, /* ditto kernel          */
+                               exclude_hv     :  1, /* ditto hypervisor      */
+                               exclude_idle   :  1, /* don't count when idle */
+                               mmap           :  1, /* include mmap data     */
+                               comm           :  1, /* include comm data     */
+                               freq           :  1, /* use freq, not period  */
+
+                               __reserved_2   : 53;
+
+       __u32                   wakeup_events;  /* wakeup every n events */
+       __u32                   __reserved_3;
+
+       __u64                   __reserved_4;
+};
+
+/*
+ * Ioctls that can be done on a perf counter fd:
+ */
+#define PERF_COUNTER_IOC_ENABLE                _IO ('$', 0)
+#define PERF_COUNTER_IOC_DISABLE       _IO ('$', 1)
+#define PERF_COUNTER_IOC_REFRESH       _IO ('$', 2)
+#define PERF_COUNTER_IOC_RESET         _IO ('$', 3)
+#define PERF_COUNTER_IOC_PERIOD                _IOW('$', 4, u64)
+
+enum perf_counter_ioc_flags {
+       PERF_IOC_FLAG_GROUP             = 1U << 0,
+};
+
+/*
+ * Structure of the page that can be mapped via mmap
+ */
+struct perf_counter_mmap_page {
+       __u32   version;                /* version number of this structure */
+       __u32   compat_version;         /* lowest version this is compat with */
+
+       /*
+        * Bits needed to read the hw counters in user-space.
+        *
+        *   u32 seq;
+        *   s64 count;
+        *
+        *   do {
+        *     seq = pc->lock;
+        *
+        *     barrier()
+        *     if (pc->index) {
+        *       count = pmc_read(pc->index - 1);
+        *       count += pc->offset;
+        *     } else
+        *       goto regular_read;
+        *
+        *     barrier();
+        *   } while (pc->lock != seq);
+        *
+        * NOTE: for obvious reason this only works on self-monitoring
+        *       processes.
+        */
+       __u32   lock;                   /* seqlock for synchronization */
+       __u32   index;                  /* hardware counter identifier */
+       __s64   offset;                 /* add to hardware counter value */
+
+       /*
+        * Control data for the mmap() data buffer.
+        *
+        * User-space reading this value should issue an rmb(), on SMP capable
+        * platforms, after reading this value -- see perf_counter_wakeup().
+        */
+       __u64   data_head;              /* head in the data section */
+};
+
+#define PERF_EVENT_MISC_CPUMODE_MASK           (3 << 0)
+#define PERF_EVENT_MISC_CPUMODE_UNKNOWN                (0 << 0)
+#define PERF_EVENT_MISC_KERNEL                 (1 << 0)
+#define PERF_EVENT_MISC_USER                   (2 << 0)
+#define PERF_EVENT_MISC_HYPERVISOR             (3 << 0)
+#define PERF_EVENT_MISC_OVERFLOW               (1 << 2)
+
+struct perf_event_header {
+       __u32   type;
+       __u16   misc;
+       __u16   size;
+};
+
+enum perf_event_type {
+
+       /*
+        * The MMAP events record the PROT_EXEC mappings so that we can
+        * correlate userspace IPs to code. They have the following structure:
+        *
+        * struct {
+        *      struct perf_event_header        header;
+        *
+        *      u32                             pid, tid;
+        *      u64                             addr;
+        *      u64                             len;
+        *      u64                             pgoff;
+        *      char                            filename[];
+        * };
+        */
+       PERF_EVENT_MMAP                 = 1,
+
+       /*
+        * struct {
+        *      struct perf_event_header        header;
+        *
+        *      u32                             pid, tid;
+        *      char                            comm[];
+        * };
+        */
+       PERF_EVENT_COMM                 = 3,
+
+       /*
+        * struct {
+        *      struct perf_event_header        header;
+        *      u64                             time;
+        *      u64                             id;
+        *      u64                             sample_period;
+        * };
+        */
+       PERF_EVENT_PERIOD               = 4,
+
+       /*
+        * struct {
+        *      struct perf_event_header        header;
+        *      u64                             time;
+        *      u64                             id;
+        * };
+        */
+       PERF_EVENT_THROTTLE             = 5,
+       PERF_EVENT_UNTHROTTLE           = 6,
+
+       /*
+        * struct {
+        *      struct perf_event_header        header;
+        *      u32                             pid, ppid;
+        * };
+        */
+       PERF_EVENT_FORK                 = 7,
+
+       /*
+        * When header.misc & PERF_EVENT_MISC_OVERFLOW the event_type field
+        * will be PERF_RECORD_*
+        *
+        * struct {
+        *      struct perf_event_header        header;
+        *
+        *      { u64                   ip;       } && PERF_RECORD_IP
+        *      { u32                   pid, tid; } && PERF_RECORD_TID
+        *      { u64                   time;     } && PERF_RECORD_TIME
+        *      { u64                   addr;     } && PERF_RECORD_ADDR
+        *      { u64                   config;   } && PERF_RECORD_CONFIG
+        *      { u32                   cpu, res; } && PERF_RECORD_CPU
+        *
+        *      { u64                   nr;
+        *        { u64 id, val; }      cnt[nr];  } && PERF_RECORD_GROUP
+        *
+        *      { u16                   nr,
+        *                              hv,
+        *                              kernel,
+        *                              user;
+        *        u64                   ips[nr];  } && PERF_RECORD_CALLCHAIN
+        * };
+        */
+};
+
+#ifdef __KERNEL__
+/*
+ * Kernel-internal data types and definitions:
+ */
+
+#ifdef CONFIG_PERF_COUNTERS
+# include <asm/perf_counter.h>
+#endif
+
+#include <linux/list.h>
+#include <linux/mutex.h>
+#include <linux/rculist.h>
+#include <linux/rcupdate.h>
+#include <linux/spinlock.h>
+#include <linux/hrtimer.h>
+#include <linux/fs.h>
+#include <linux/pid_namespace.h>
+#include <asm/atomic.h>
+
+struct task_struct;
+
+/**
+ * struct hw_perf_counter - performance counter hardware details:
+ */
+struct hw_perf_counter {
+#ifdef CONFIG_PERF_COUNTERS
+       union {
+               struct { /* hardware */
+                       u64             config;
+                       unsigned long   config_base;
+                       unsigned long   counter_base;
+                       int             idx;
+               };
+               union { /* software */
+                       atomic64_t      count;
+                       struct hrtimer  hrtimer;
+               };
+       };
+       atomic64_t                      prev_count;
+       u64                             sample_period;
+       u64                             last_period;
+       atomic64_t                      period_left;
+       u64                             interrupts;
+
+       u64                             freq_count;
+       u64                             freq_interrupts;
+       u64                             freq_stamp;
+#endif
+};
+
+struct perf_counter;
+
+/**
+ * struct pmu - generic performance monitoring unit
+ */
+struct pmu {
+       int (*enable)                   (struct perf_counter *counter);
+       void (*disable)                 (struct perf_counter *counter);
+       void (*read)                    (struct perf_counter *counter);
+       void (*unthrottle)              (struct perf_counter *counter);
+};
+
+/**
+ * enum perf_counter_active_state - the states of a counter
+ */
+enum perf_counter_active_state {
+       PERF_COUNTER_STATE_ERROR        = -2,
+       PERF_COUNTER_STATE_OFF          = -1,
+       PERF_COUNTER_STATE_INACTIVE     =  0,
+       PERF_COUNTER_STATE_ACTIVE       =  1,
+};
+
+struct file;
+
+struct perf_mmap_data {
+       struct rcu_head                 rcu_head;
+       int                             nr_pages;       /* nr of data pages  */
+       int                             nr_locked;      /* nr pages mlocked  */
+
+       atomic_t                        poll;           /* POLL_ for wakeups */
+       atomic_t                        events;         /* event limit       */
+
+       atomic_long_t                   head;           /* write position    */
+       atomic_long_t                   done_head;      /* completed head    */
+
+       atomic_t                        lock;           /* concurrent writes */
+
+       atomic_t                        wakeup;         /* needs a wakeup    */
+
+       struct perf_counter_mmap_page   *user_page;
+       void                            *data_pages[0];
+};
+
+struct perf_pending_entry {
+       struct perf_pending_entry *next;
+       void (*func)(struct perf_pending_entry *);
+};
+
+/**
+ * struct perf_counter - performance counter kernel representation:
+ */
+struct perf_counter {
+#ifdef CONFIG_PERF_COUNTERS
+       struct list_head                list_entry;
+       struct list_head                event_entry;
+       struct list_head                sibling_list;
+       int                             nr_siblings;
+       struct perf_counter             *group_leader;
+       const struct pmu                *pmu;
+
+       enum perf_counter_active_state  state;
+       atomic64_t                      count;
+
+       /*
+        * These are the total time in nanoseconds that the counter
+        * has been enabled (i.e. eligible to run, and the task has
+        * been scheduled in, if this is a per-task counter)
+        * and running (scheduled onto the CPU), respectively.
+        *
+        * They are computed from tstamp_enabled, tstamp_running and
+        * tstamp_stopped when the counter is in INACTIVE or ACTIVE state.
+        */
+       u64                             total_time_enabled;
+       u64                             total_time_running;
+
+       /*
+        * These are timestamps used for computing total_time_enabled
+        * and total_time_running when the counter is in INACTIVE or
+        * ACTIVE state, measured in nanoseconds from an arbitrary point
+        * in time.
+        * tstamp_enabled: the notional time when the counter was enabled
+        * tstamp_running: the notional time when the counter was scheduled on
+        * tstamp_stopped: in INACTIVE state, the notional time when the
+        *      counter was scheduled off.
+        */
+       u64                             tstamp_enabled;
+       u64                             tstamp_running;
+       u64                             tstamp_stopped;
+
+       struct perf_counter_attr        attr;
+       struct hw_perf_counter          hw;
+
+       struct perf_counter_context     *ctx;
+       struct file                     *filp;
+
+       /*
+        * These accumulate total time (in nanoseconds) that children
+        * counters have been enabled and running, respectively.
+        */
+       atomic64_t                      child_total_time_enabled;
+       atomic64_t                      child_total_time_running;
+
+       /*
+        * Protect attach/detach and child_list:
+        */
+       struct mutex                    child_mutex;
+       struct list_head                child_list;
+       struct perf_counter             *parent;
+
+       int                             oncpu;
+       int                             cpu;
+
+       struct list_head                owner_entry;
+       struct task_struct              *owner;
+
+       /* mmap bits */
+       struct mutex                    mmap_mutex;
+       atomic_t                        mmap_count;
+       struct perf_mmap_data           *data;
+
+       /* poll related */
+       wait_queue_head_t               waitq;
+       struct fasync_struct            *fasync;
+
+       /* delayed work for NMIs and such */
+       int                             pending_wakeup;
+       int                             pending_kill;
+       int                             pending_disable;
+       struct perf_pending_entry       pending;
+
+       atomic_t                        event_limit;
+
+       void (*destroy)(struct perf_counter *);
+       struct rcu_head                 rcu_head;
+
+       struct pid_namespace            *ns;
+       u64                             id;
+#endif
+};
+
+/**
+ * struct perf_counter_context - counter context structure
+ *
+ * Used as a container for task counters and CPU counters as well:
+ */
+struct perf_counter_context {
+       /*
+        * Protect the states of the counters in the list,
+        * nr_active, and the list:
+        */
+       spinlock_t                      lock;
+       /*
+        * Protect the list of counters.  Locking either mutex or lock
+        * is sufficient to ensure the list doesn't change; to change
+        * the list you need to lock both the mutex and the spinlock.
+        */
+       struct mutex                    mutex;
+
+       struct list_head                counter_list;
+       struct list_head                event_list;
+       int                             nr_counters;
+       int                             nr_active;
+       int                             is_active;
+       atomic_t                        refcount;
+       struct task_struct              *task;
+
+       /*
+        * Context clock, runs when context enabled.
+        */
+       u64                             time;
+       u64                             timestamp;
+
+       /*
+        * These fields let us detect when two contexts have both
+        * been cloned (inherited) from a common ancestor.
+        */
+       struct perf_counter_context     *parent_ctx;
+       u64                             parent_gen;
+       u64                             generation;
+       int                             pin_count;
+       struct rcu_head                 rcu_head;
+};
+
+/**
+ * struct perf_counter_cpu_context - per cpu counter context structure
+ */
+struct perf_cpu_context {
+       struct perf_counter_context     ctx;
+       struct perf_counter_context     *task_ctx;
+       int                             active_oncpu;
+       int                             max_pertask;
+       int                             exclusive;
+
+       /*
+        * Recursion avoidance:
+        *
+        * task, softirq, irq, nmi context
+        */
+       int                             recursion[4];
+};
+
+#ifdef CONFIG_PERF_COUNTERS
+
+/*
+ * Set by architecture code:
+ */
+extern int perf_max_counters;
+
+extern const struct pmu *hw_perf_counter_init(struct perf_counter *counter);
+
+extern void perf_counter_task_sched_in(struct task_struct *task, int cpu);
+extern void perf_counter_task_sched_out(struct task_struct *task,
+                                       struct task_struct *next, int cpu);
+extern void perf_counter_task_tick(struct task_struct *task, int cpu);
+extern int perf_counter_init_task(struct task_struct *child);
+extern void perf_counter_exit_task(struct task_struct *child);
+extern void perf_counter_free_task(struct task_struct *task);
+extern void perf_counter_do_pending(void);
+extern void perf_counter_print_debug(void);
+extern void __perf_disable(void);
+extern bool __perf_enable(void);
+extern void perf_disable(void);
+extern void perf_enable(void);
+extern int perf_counter_task_disable(void);
+extern int perf_counter_task_enable(void);
+extern int hw_perf_group_sched_in(struct perf_counter *group_leader,
+              struct perf_cpu_context *cpuctx,
+              struct perf_counter_context *ctx, int cpu);
+extern void perf_counter_update_userpage(struct perf_counter *counter);
+
+struct perf_sample_data {
+       struct pt_regs                  *regs;
+       u64                             addr;
+       u64                             period;
+};
+
+extern int perf_counter_overflow(struct perf_counter *counter, int nmi,
+                                struct perf_sample_data *data);
+
+/*
+ * Return 1 for a software counter, 0 for a hardware counter
+ */
+static inline int is_software_counter(struct perf_counter *counter)
+{
+       return (counter->attr.type != PERF_TYPE_RAW) &&
+               (counter->attr.type != PERF_TYPE_HARDWARE);
+}
+
+extern void perf_swcounter_event(u32, u64, int, struct pt_regs *, u64);
+
+extern void __perf_counter_mmap(struct vm_area_struct *vma);
+
+static inline void perf_counter_mmap(struct vm_area_struct *vma)
+{
+       if (vma->vm_flags & VM_EXEC)
+               __perf_counter_mmap(vma);
+}
+
+extern void perf_counter_comm(struct task_struct *tsk);
+extern void perf_counter_fork(struct task_struct *tsk);
+
+extern void perf_counter_task_migration(struct task_struct *task, int cpu);
+
+#define MAX_STACK_DEPTH                        255
+
+struct perf_callchain_entry {
+       u16                             nr;
+       u16                             hv;
+       u16                             kernel;
+       u16                             user;
+       u64                             ip[MAX_STACK_DEPTH];
+};
+
+extern struct perf_callchain_entry *perf_callchain(struct pt_regs *regs);
+
+extern int sysctl_perf_counter_paranoid;
+extern int sysctl_perf_counter_mlock;
+extern int sysctl_perf_counter_sample_rate;
+
+extern void perf_counter_init(void);
+
+#ifndef perf_misc_flags
+#define perf_misc_flags(regs)  (user_mode(regs) ? PERF_EVENT_MISC_USER : \
+                                PERF_EVENT_MISC_KERNEL)
+#define perf_instruction_pointer(regs) instruction_pointer(regs)
+#endif
+
+#else
+static inline void
+perf_counter_task_sched_in(struct task_struct *task, int cpu)          { }
+static inline void
+perf_counter_task_sched_out(struct task_struct *task,
+                           struct task_struct *next, int cpu)          { }
+static inline void
+perf_counter_task_tick(struct task_struct *task, int cpu)              { }
+static inline int perf_counter_init_task(struct task_struct *child)    { return 0; }
+static inline void perf_counter_exit_task(struct task_struct *child)   { }
+static inline void perf_counter_free_task(struct task_struct *task)    { }
+static inline void perf_counter_do_pending(void)                       { }
+static inline void perf_counter_print_debug(void)                      { }
+static inline void perf_disable(void)                                  { }
+static inline void perf_enable(void)                                   { }
+static inline int perf_counter_task_disable(void)      { return -EINVAL; }
+static inline int perf_counter_task_enable(void)       { return -EINVAL; }
+
+static inline void
+perf_swcounter_event(u32 event, u64 nr, int nmi,
+                    struct pt_regs *regs, u64 addr)                    { }
+
+static inline void perf_counter_mmap(struct vm_area_struct *vma)       { }
+static inline void perf_counter_comm(struct task_struct *tsk)          { }
+static inline void perf_counter_fork(struct task_struct *tsk)          { }
+static inline void perf_counter_init(void)                             { }
+static inline void perf_counter_task_migration(struct task_struct *task,
+                                              int cpu)                 { }
+#endif
+
+#endif /* __KERNEL__ */
+#endif /* _LINUX_PERF_COUNTER_H */
diff --git a/include/linux/prctl.h b/include/linux/prctl.h

index 48d887e3c6e737317a7b29d2a418f8383ae910cb..b00df4c79c6330b83b0f6eb631f9ea8e2e62bc5a 100644 (file)
--- a/include/linux/prctl.h
+++ b/include/linux/prctl.h
@@ -85,4 +85,7 @@
  #define PR_SET_TIMERSLACK 29
  #define PR_GET_TIMERSLACK 30
  
+#define PR_TASK_PERF_COUNTERS_DISABLE          31
+#define PR_TASK_PERF_COUNTERS_ENABLE           32
+
  #endif /* _LINUX_PRCTL_H */
diff --git a/include/linux/ptrace.h b/include/linux/ptrace.h

index 67c15653fc230c9341ed1131c1e846156c6772bb..59e133d39d5091c4e5628212b4d563756fe02bec 100644 (file)
--- a/include/linux/ptrace.h
+++ b/include/linux/ptrace.h
@@ -95,7 +95,6 @@ extern void __ptrace_link(struct task_struct *child,
                           struct task_struct *new_parent);
  extern void __ptrace_unlink(struct task_struct *child);
  extern void exit_ptrace(struct task_struct *tracer);
-extern void ptrace_fork(struct task_struct *task, unsigned long clone_flags);
  #define PTRACE_MODE_READ   1
  #define PTRACE_MODE_ATTACH 2
  /* Returns 0 on success, -errno on denial. */
@@ -327,15 +326,6 @@ static inline void user_enable_block_step(struct task_struct *task)
  #define arch_ptrace_untrace(task)              do { } while (0)
  #endif
  
-#ifndef arch_ptrace_fork
-/*
- * Do machine-specific work to initialize a new task.
- *
- * This is called from copy_process().
- */
-#define arch_ptrace_fork(child, clone_flags)   do { } while (0)
-#endif
-
  extern int task_current_syscall(struct task_struct *target, long *callno,
                                 unsigned long args[6], unsigned int maxargs,
                                 unsigned long *sp, unsigned long *pc);
diff --git a/include/linux/rculist.h b/include/linux/rculist.h

index e649bd3f2c976c3f5bed58c067c351a336403e75..5710f43bbc9ec0aa42eabd74f851eda4a9a6f213 100644 (file)
--- a/include/linux/rculist.h
+++ b/include/linux/rculist.h
@@ -198,6 +198,32 @@ static inline void list_splice_init_rcu(struct list_head *list,
         at->prev = last;
  }
  
+/**
+ * list_entry_rcu - get the struct for this entry
+ * @ptr:        the &struct list_head pointer.
+ * @type:       the type of the struct this is embedded in.
+ * @member:     the name of the list_struct within the struct.
+ *
+ * This primitive may safely run concurrently with the _rcu list-mutation
+ * primitives such as list_add_rcu() as long as it's guarded by rcu_read_lock().
+ */
+#define list_entry_rcu(ptr, type, member) \
+       container_of(rcu_dereference(ptr), type, member)
+
+/**
+ * list_first_entry_rcu - get the first element from a list
+ * @ptr:        the list head to take the element from.
+ * @type:       the type of the struct this is embedded in.
+ * @member:     the name of the list_struct within the struct.
+ *
+ * Note, that list is expected to be not empty.
+ *
+ * This primitive may safely run concurrently with the _rcu list-mutation
+ * primitives such as list_add_rcu() as long as it's guarded by rcu_read_lock().
+ */
+#define list_first_entry_rcu(ptr, type, member) \
+       list_entry_rcu((ptr)->next, type, member)
+
  #define __list_for_each_rcu(pos, head) \
         for (pos = rcu_dereference((head)->next); \
                 pos != (head); \
@@ -214,9 +240,9 @@ static inline void list_splice_init_rcu(struct list_head *list,
   * as long as the traversal is guarded by rcu_read_lock().
   */
  #define list_for_each_entry_rcu(pos, head, member) \
-       for (pos = list_entry(rcu_dereference((head)->next), typeof(*pos), member); \
+       for (pos = list_entry_rcu((head)->next, typeof(*pos), member); \
                 prefetch(pos->member.next), &pos->member != (head); \
-               pos = list_entry(rcu_dereference(pos->member.next), typeof(*pos), member))
+               pos = list_entry_rcu(pos->member.next, typeof(*pos), member))
  
  
  /**
diff --git a/include/linux/rcutree.h b/include/linux/rcutree.h

index 58b2aa5312b9caaae76f7f9dcb7e9cfcc7f06605..5a5153806c42562c730e8911099fea05c57bf4ad 100644 (file)
--- a/include/linux/rcutree.h
+++ b/include/linux/rcutree.h
@@ -161,8 +161,15 @@ struct rcu_data {
         unsigned long offline_fqs;      /* Kicked due to being offline. */
         unsigned long resched_ipi;      /* Sent a resched IPI. */
  
-       /* 5) For future __rcu_pending statistics. */
+       /* 5) __rcu_pending() statistics. */
         long n_rcu_pending;             /* rcu_pending() calls since boot. */
+       long n_rp_qs_pending;
+       long n_rp_cb_ready;
+       long n_rp_cpu_needs_gp;
+       long n_rp_gp_completed;
+       long n_rp_gp_started;
+       long n_rp_need_fqs;
+       long n_rp_need_nothing;
  
         int cpu;
  };
diff --git a/include/linux/ring_buffer.h b/include/linux/ring_buffer.h

index e1b7b2173885f8f14f4a8da3979d0a6eff08e186..8670f1575fe19abe7793b700aa70d525080c83bf 100644 (file)
--- a/include/linux/ring_buffer.h
+++ b/include/linux/ring_buffer.h
@@ -11,7 +11,7 @@ struct ring_buffer_iter;
   * Don't refer to this struct directly, use functions below.
   */
  struct ring_buffer_event {
-       u32             type:2, len:3, time_delta:27;
+       u32             type_len:5, time_delta:27;
         u32             array[];
  };
  
@@ -24,7 +24,8 @@ struct ring_buffer_event {
   *                               size is variable depending on how much
   *                               padding is needed
   *                              If time_delta is non zero:
- *                               everything else same as RINGBUF_TYPE_DATA
+ *                               array[0] holds the actual length
+ *                               size = 4 + length (bytes)
   *
   * @RINGBUF_TYPE_TIME_EXTEND:  Extend the time delta
   *                              array[0] = time delta (28 .. 59)
@@ -35,22 +36,23 @@ struct ring_buffer_event {
   *                              array[1..2] = tv_sec
   *                              size = 16 bytes
   *
- * @RINGBUF_TYPE_DATA:         Data record
- *                              If len is zero:
+ * <= @RINGBUF_TYPE_DATA_TYPE_LEN_MAX:
+ *                             Data record
+ *                              If type_len is zero:
   *                               array[0] holds the actual length
   *                               array[1..(length+3)/4] holds data
- *                               size = 4 + 4 + length (bytes)
+ *                               size = 4 + length (bytes)
   *                              else
- *                               length = len << 2
+ *                               length = type_len << 2
   *                               array[0..(length+3)/4-1] holds data
   *                               size = 4 + length (bytes)
   */
  enum ring_buffer_type {
+       RINGBUF_TYPE_DATA_TYPE_LEN_MAX = 28,
         RINGBUF_TYPE_PADDING,
         RINGBUF_TYPE_TIME_EXTEND,
         /* FIXME: RINGBUF_TYPE_TIME_STAMP not implemented */
         RINGBUF_TYPE_TIME_STAMP,
-       RINGBUF_TYPE_DATA,
  };
  
  unsigned ring_buffer_event_length(struct ring_buffer_event *event);
@@ -68,13 +70,54 @@ ring_buffer_event_time_delta(struct ring_buffer_event *event)
         return event->time_delta;
  }
  
+/*
+ * ring_buffer_event_discard can discard any event in the ring buffer.
+ *   it is up to the caller to protect against a reader from
+ *   consuming it or a writer from wrapping and replacing it.
+ *
+ * No external protection is needed if this is called before
+ * the event is commited. But in that case it would be better to
+ * use ring_buffer_discard_commit.
+ *
+ * Note, if an event that has not been committed is discarded
+ * with ring_buffer_event_discard, it must still be committed.
+ */
  void ring_buffer_event_discard(struct ring_buffer_event *event);
  
+/*
+ * ring_buffer_discard_commit will remove an event that has not
+ *   ben committed yet. If this is used, then ring_buffer_unlock_commit
+ *   must not be called on the discarded event. This function
+ *   will try to remove the event from the ring buffer completely
+ *   if another event has not been written after it.
+ *
+ * Example use:
+ *
+ *  if (some_condition)
+ *    ring_buffer_discard_commit(buffer, event);
+ *  else
+ *    ring_buffer_unlock_commit(buffer, event);
+ */
+void ring_buffer_discard_commit(struct ring_buffer *buffer,
+                               struct ring_buffer_event *event);
+
  /*
   * size is in bytes for each per CPU buffer.
   */
  struct ring_buffer *
-ring_buffer_alloc(unsigned long size, unsigned flags);
+__ring_buffer_alloc(unsigned long size, unsigned flags, struct lock_class_key *key);
+
+/*
+ * Because the ring buffer is generic, if other users of the ring buffer get
+ * traced by ftrace, it can produce lockdep warnings. We need to keep each
+ * ring buffer's lock class separate.
+ */
+#define ring_buffer_alloc(size, flags)                 \
+({                                                     \
+       static struct lock_class_key __key;             \
+       __ring_buffer_alloc((size), (flags), &__key);   \
+})
+
  void ring_buffer_free(struct ring_buffer *buffer);
  
  int ring_buffer_resize(struct ring_buffer *buffer, unsigned long size);
@@ -122,6 +165,8 @@ unsigned long ring_buffer_entries(struct ring_buffer *buffer);
  unsigned long ring_buffer_overruns(struct ring_buffer *buffer);
  unsigned long ring_buffer_entries_cpu(struct ring_buffer *buffer, int cpu);
  unsigned long ring_buffer_overrun_cpu(struct ring_buffer *buffer, int cpu);
+unsigned long ring_buffer_commit_overrun_cpu(struct ring_buffer *buffer, int cpu);
+unsigned long ring_buffer_nmi_dropped_cpu(struct ring_buffer *buffer, int cpu);
  
  u64 ring_buffer_time_stamp(struct ring_buffer *buffer, int cpu);
  void ring_buffer_normalize_time_stamp(struct ring_buffer *buffer,
@@ -137,6 +182,11 @@ void ring_buffer_free_read_page(struct ring_buffer *buffer, void *data);
  int ring_buffer_read_page(struct ring_buffer *buffer, void **data_page,
                           size_t len, int cpu, int full);
  
+struct trace_seq;
+
+int ring_buffer_print_entry_header(struct trace_seq *s);
+int ring_buffer_print_page_header(struct trace_seq *s);
+
  enum ring_buffer_flags {
         RB_FL_OVERWRITE         = 1 << 0,
  };
diff --git a/include/linux/sched.h b/include/linux/sched.h

index b4c38bc8049cbbea17e0ca4f929f35df9cddbe1f..28c774ff3cc7539673835bd2d07d16cf458a2e04 100644 (file)
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -77,6 +77,7 @@ struct sched_param {
  #include <linux/proportions.h>
  #include <linux/seccomp.h>
  #include <linux/rcupdate.h>
+#include <linux/rculist.h>
  #include <linux/rtmutex.h>
  
  #include <linux/time.h>
@@ -96,8 +97,9 @@ struct exec_domain;
  struct futex_pi_state;
  struct robust_list_head;
  struct bio;
-struct bts_tracer;
  struct fs_struct;
+struct bts_context;
+struct perf_counter_context;
  
  /*
   * List of flags we want to share for kernel threads,
@@ -116,6 +118,7 @@ struct fs_struct;
   *    11 bit fractions.
   */
  extern unsigned long avenrun[];                /* Load averages */
+extern void get_avenrun(unsigned long *loads, unsigned long offset, int shift);
  
  #define FSHIFT         11              /* nr of bits of precision */
  #define FIXED_1                (1<<FSHIFT)     /* 1.0 as fixed-point */
@@ -135,8 +138,9 @@ DECLARE_PER_CPU(unsigned long, process_counts);
  extern int nr_processes(void);
  extern unsigned long nr_running(void);
  extern unsigned long nr_uninterruptible(void);
-extern unsigned long nr_active(void);
  extern unsigned long nr_iowait(void);
+extern void calc_global_load(void);
+extern u64 cpu_nr_migrations(int cpu);
  
  extern unsigned long get_parent_ip(unsigned long addr);
  
@@ -672,6 +676,10 @@ struct user_struct {
         struct work_struct work;
  #endif
  #endif
+
+#ifdef CONFIG_PERF_COUNTERS
+       atomic_long_t locked_vm;
+#endif
  };
  
  extern int uids_sysfs_init(void);
@@ -838,7 +846,17 @@ struct sched_group {
          */
         u32 reciprocal_cpu_power;
  
-       unsigned long cpumask[];
+       /*
+        * The CPUs this group covers.
+        *
+        * NOTE: this field is variable length. (Allocated dynamically
+        * by attaching extra space to the end of the structure,
+        * depending on how many CPUs the kernel has booted up with)
+        *
+        * It is also be embedded into static data structures at build
+        * time. (See 'struct static_sched_group' in kernel/sched.c)
+        */
+       unsigned long cpumask[0];
  };
  
  static inline struct cpumask *sched_group_cpus(struct sched_group *sg)
@@ -924,8 +942,17 @@ struct sched_domain {
         char *name;
  #endif
  
-       /* span of all CPUs in this domain */
-       unsigned long span[];
+       /*
+        * Span of all CPUs in this domain.
+        *
+        * NOTE: this field is variable length. (Allocated dynamically
+        * by attaching extra space to the end of the structure,
+        * depending on how many CPUs the kernel has booted up with)
+        *
+        * It is also be embedded into static data structures at build
+        * time. (See 'struct static_sched_domain' in kernel/sched.c)
+        */
+       unsigned long span[0];
  };
  
  static inline struct cpumask *sched_domain_span(struct sched_domain *sd)
@@ -1052,9 +1079,10 @@ struct sched_entity {
         u64                     last_wakeup;
         u64                     avg_overlap;
  
+       u64                     nr_migrations;
+
         u64                     start_runtime;
         u64                     avg_wakeup;
-       u64                     nr_migrations;
  
  #ifdef CONFIG_SCHEDSTATS
         u64                     wait_start;
@@ -1209,18 +1237,11 @@ struct task_struct {
         struct list_head ptraced;
         struct list_head ptrace_entry;
  
-#ifdef CONFIG_X86_PTRACE_BTS
         /*
          * This is the tracer handle for the ptrace BTS extension.
          * This field actually belongs to the ptracer task.
          */
-       struct bts_tracer *bts;
-       /*
-        * The buffer to hold the BTS data.
-        */
-       void *bts_buffer;
-       size_t bts_size;
-#endif /* CONFIG_X86_PTRACE_BTS */
+       struct bts_context *bts;
  
         /* PID/PID hash table linkage. */
         struct pid_link pids[PIDTYPE_MAX];
@@ -1380,6 +1401,11 @@ struct task_struct {
         struct list_head pi_state_list;
         struct futex_pi_state *pi_state_cache;
  #endif
+#ifdef CONFIG_PERF_COUNTERS
+       struct perf_counter_context *perf_counter_ctxp;
+       struct mutex perf_counter_mutex;
+       struct list_head perf_counter_list;
+#endif
  #ifdef CONFIG_NUMA
         struct mempolicy *mempolicy;
         short il_next;
@@ -1428,7 +1454,9 @@ struct task_struct {
  #ifdef CONFIG_TRACING
         /* state flags for use by tracers */
         unsigned long trace;
-#endif
+       /* bitmask of trace recursion */
+       unsigned long trace_recursion;
+#endif /* CONFIG_TRACING */
  };
  
  /* Future-safe accessor for struct task_struct's cpus_allowed. */
@@ -2001,8 +2029,10 @@ extern void set_task_comm(struct task_struct *tsk, char *from);
  extern char *get_task_comm(char *to, struct task_struct *tsk);
  
  #ifdef CONFIG_SMP
+extern void wait_task_context_switch(struct task_struct *p);
  extern unsigned long wait_task_inactive(struct task_struct *, long match_state);
  #else
+static inline void wait_task_context_switch(struct task_struct *p) {}
  static inline unsigned long wait_task_inactive(struct task_struct *p,
                                                long match_state)
  {
@@ -2010,7 +2040,8 @@ static inline unsigned long wait_task_inactive(struct task_struct *p,
  }
  #endif
  
-#define next_task(p)   list_entry(rcu_dereference((p)->tasks.next), struct task_struct, tasks)
+#define next_task(p) \
+       list_entry_rcu((p)->tasks.next, struct task_struct, tasks)
  
  #define for_each_process(p) \
         for (p = &init_task ; (p = next_task(p)) != &init_task ; )
@@ -2049,8 +2080,8 @@ int same_thread_group(struct task_struct *p1, struct task_struct *p2)
  
  static inline struct task_struct *next_thread(const struct task_struct *p)
  {
-       return list_entry(rcu_dereference(p->thread_group.next),
-                         struct task_struct, thread_group);
+       return list_entry_rcu(p->thread_group.next,
+                             struct task_struct, thread_group);
  }
  
  static inline int thread_group_empty(struct task_struct *p)
@@ -2388,6 +2419,13 @@ static inline void inc_syscw(struct task_struct *tsk)
  #define TASK_SIZE_OF(tsk)      TASK_SIZE
  #endif
  
+/*
+ * Call the function if the target task is executing on a CPU right now:
+ */
+extern void task_oncpu_function_call(struct task_struct *p,
+                                    void (*func) (void *info), void *info);
+
+
  #ifdef CONFIG_MM_OWNER
  extern void mm_update_next_owner(struct mm_struct *mm);
  extern void mm_init_owner(struct mm_struct *mm, struct task_struct *p);
diff --git a/include/linux/signal.h b/include/linux/signal.h

index 84f997f8aa53cfcc8b04ff86bb4b0263f0e7dc1a..c7552836bd954a0f0a0235feea5cf9acb5c1a2c8 100644 (file)
--- a/include/linux/signal.h
+++ b/include/linux/signal.h
@@ -235,6 +235,8 @@ static inline int valid_signal(unsigned long sig)
  extern int next_signal(struct sigpending *pending, sigset_t *mask);
  extern int group_send_sig_info(int sig, struct siginfo *info, struct task_struct *p);
  extern int __group_send_sig_info(int, struct siginfo *, struct task_struct *);
+extern long do_rt_tgsigqueueinfo(pid_t tgid, pid_t pid, int sig,
+                                siginfo_t *info);
  extern long do_sigpending(void __user *, unsigned long);
  extern int sigprocmask(int, sigset_t *, sigset_t *);
  extern int show_unhandled_signals;
diff --git a/include/linux/slab_def.h b/include/linux/slab_def.h

index 5ac9b0bcaf9adef1fdfddebd2aff6290a44f7b9f..713f841ecaa914e74aead8e4d8ff5d74cb5040d3 100644 (file)
--- a/include/linux/slab_def.h
+++ b/include/linux/slab_def.h
@@ -14,7 +14,7 @@
  #include <asm/page.h>          /* kmalloc_sizes.h needs PAGE_SIZE */
  #include <asm/cache.h>         /* kmalloc_sizes.h needs L1_CACHE_BYTES */
  #include <linux/compiler.h>
-#include <trace/kmemtrace.h>
+#include <linux/kmemtrace.h>
  
  /* Size description struct for general caches. */
  struct cache_sizes {
diff --git a/include/linux/slub_def.h b/include/linux/slub_def.h

index 5046f90c11710178127a20a7bcb8677b4e80d943..be5d40c43bd2e7aa96264e624b074f4a884826f4 100644 (file)
--- a/include/linux/slub_def.h
+++ b/include/linux/slub_def.h
@@ -10,7 +10,7 @@
  #include <linux/gfp.h>
  #include <linux/workqueue.h>
  #include <linux/kobject.h>
-#include <trace/kmemtrace.h>
+#include <linux/kmemtrace.h>
  
  enum stat_item {
         ALLOC_FASTPATH,         /* Allocation from cpu slab */
diff --git a/include/linux/spinlock_up.h b/include/linux/spinlock_up.h

index 938234c4a996ba6e78521ffc60ecc197034fed9d..d4841ed8215b755ae2a291245687187479d4d27c 100644 (file)
--- a/include/linux/spinlock_up.h
+++ b/include/linux/spinlock_up.h
@@ -60,6 +60,7 @@ static inline void __raw_spin_unlock(raw_spinlock_t *lock)
  #define __raw_spin_is_locked(lock)     ((void)(lock), 0)
  /* for sched.c and kernel_lock.c: */
  # define __raw_spin_lock(lock)         do { (void)(lock); } while (0)
+# define __raw_spin_lock_flags(lock, flags)    do { (void)(lock); } while (0)
  # define __raw_spin_unlock(lock)       do { (void)(lock); } while (0)
  # define __raw_spin_trylock(lock)      ({ (void)(lock); 1; })
  #endif /* DEBUG_SPINLOCK */
diff --git a/include/linux/swiotlb.h b/include/linux/swiotlb.h

index ac9ff54f7cb305943e5444f27be83460ef398206..cb1a6631b8f449bcf2738f37208783dd23be87ba 100644 (file)
--- a/include/linux/swiotlb.h
+++ b/include/linux/swiotlb.h
@@ -29,7 +29,8 @@ extern void *swiotlb_alloc(unsigned order, unsigned long nslabs);
  
  extern dma_addr_t swiotlb_phys_to_bus(struct device *hwdev,
                                       phys_addr_t address);
-extern phys_addr_t swiotlb_bus_to_phys(dma_addr_t address);
+extern phys_addr_t swiotlb_bus_to_phys(struct device *hwdev,
+                                      dma_addr_t address);
  
  extern int swiotlb_arch_range_needs_mapping(phys_addr_t paddr, size_t size);
  
diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h

index 30520844b8dae4de881c042d2b61d68db831681b..c6c84ad8bd7119dea51f73ab13ee34987c5d152d 100644 (file)
--- a/include/linux/syscalls.h
+++ b/include/linux/syscalls.h
@@ -55,6 +55,7 @@ struct compat_timeval;
  struct robust_list_head;
  struct getcpu_cache;
  struct old_linux_dirent;
+struct perf_counter_attr;
  
  #include <linux/types.h>
  #include <linux/aio_abi.h>
@@ -755,4 +756,8 @@ asmlinkage long sys_pipe(int __user *);
  
  int kernel_execve(const char *filename, char *const argv[], char *const envp[]);
  
+
+asmlinkage long sys_perf_counter_open(
+               const struct perf_counter_attr __user *attr_uptr,
+               pid_t pid, int cpu, int group_fd, unsigned long flags);
  #endif
diff --git a/include/linux/thread_info.h b/include/linux/thread_info.h

index e6b820f8b56b8ae3bf50ac980d1e95ce3ad6536e..a8cc4e13434c4261ae8ab536566b23fcd777286a 100644 (file)
--- a/include/linux/thread_info.h
+++ b/include/linux/thread_info.h
@@ -21,13 +21,14 @@ struct restart_block {
                 struct {
                         unsigned long arg0, arg1, arg2, arg3;
                 };
-               /* For futex_wait */
+               /* For futex_wait and futex_wait_requeue_pi */
                 struct {
                         u32 *uaddr;
                         u32 val;
                         u32 flags;
                         u32 bitset;
                         u64 time;
+                       u32 *uaddr2;
                 } futex;
                 /* For nanosleep */
                 struct {
diff --git a/include/linux/trace_seq.h b/include/linux/trace_seq.h

new file mode 100644 (file)

index 0000000..c68bccb
--- /dev/null
+++ b/include/linux/trace_seq.h
@@ -0,0 +1,92 @@
+#ifndef _LINUX_TRACE_SEQ_H
+#define _LINUX_TRACE_SEQ_H
+
+#include <linux/fs.h>
+
+/*
+ * Trace sequences are used to allow a function to call several other functions
+ * to create a string of data to use (up to a max of PAGE_SIZE.
+ */
+
+struct trace_seq {
+       unsigned char           buffer[PAGE_SIZE];
+       unsigned int            len;
+       unsigned int            readpos;
+};
+
+static inline void
+trace_seq_init(struct trace_seq *s)
+{
+       s->len = 0;
+       s->readpos = 0;
+}
+
+/*
+ * Currently only defined when tracing is enabled.
+ */
+#ifdef CONFIG_TRACING
+extern int trace_seq_printf(struct trace_seq *s, const char *fmt, ...)
+       __attribute__ ((format (printf, 2, 3)));
+extern int trace_seq_vprintf(struct trace_seq *s, const char *fmt, va_list args)
+       __attribute__ ((format (printf, 2, 0)));
+extern int
+trace_seq_bprintf(struct trace_seq *s, const char *fmt, const u32 *binary);
+extern void trace_print_seq(struct seq_file *m, struct trace_seq *s);
+extern ssize_t trace_seq_to_user(struct trace_seq *s, char __user *ubuf,
+                                size_t cnt);
+extern int trace_seq_puts(struct trace_seq *s, const char *str);
+extern int trace_seq_putc(struct trace_seq *s, unsigned char c);
+extern int trace_seq_putmem(struct trace_seq *s, const void *mem, size_t len);
+extern int trace_seq_putmem_hex(struct trace_seq *s, const void *mem,
+                               size_t len);
+extern void *trace_seq_reserve(struct trace_seq *s, size_t len);
+extern int trace_seq_path(struct trace_seq *s, struct path *path);
+
+#else /* CONFIG_TRACING */
+static inline int trace_seq_printf(struct trace_seq *s, const char *fmt, ...)
+{
+       return 0;
+}
+static inline int
+trace_seq_bprintf(struct trace_seq *s, const char *fmt, const u32 *binary)
+{
+       return 0;
+}
+
+static inline void trace_print_seq(struct seq_file *m, struct trace_seq *s)
+{
+}
+static inline ssize_t trace_seq_to_user(struct trace_seq *s, char __user *ubuf,
+                                size_t cnt)
+{
+       return 0;
+}
+static inline int trace_seq_puts(struct trace_seq *s, const char *str)
+{
+       return 0;
+}
+static inline int trace_seq_putc(struct trace_seq *s, unsigned char c)
+{
+       return 0;
+}
+static inline int
+trace_seq_putmem(struct trace_seq *s, const void *mem, size_t len)
+{
+       return 0;
+}
+static inline int trace_seq_putmem_hex(struct trace_seq *s, const void *mem,
+                                      size_t len)
+{
+       return 0;
+}
+static inline void *trace_seq_reserve(struct trace_seq *s, size_t len)
+{
+       return NULL;
+}
+static inline int trace_seq_path(struct trace_seq *s, struct path *path)
+{
+       return 0;
+}
+#endif /* CONFIG_TRACING */
+
+#endif /* _LINUX_TRACE_SEQ_H */
diff --git a/include/linux/tracepoint.h b/include/linux/tracepoint.h

index d35a7ee7611fe1025ed5ccaf01c6ea614c03d688..14df7e635d439e07d438e9890f0c8f718fa2aa15 100644 (file)
--- a/include/linux/tracepoint.h
+++ b/include/linux/tracepoint.h
@@ -31,6 +31,8 @@ struct tracepoint {
                                          * Keep in sync with vmlinux.lds.h.
                                          */
  
+#ifndef DECLARE_TRACE
+
  #define TP_PROTO(args...)      args
  #define TP_ARGS(args...)               args
  
@@ -114,6 +116,7 @@ static inline void tracepoint_update_probe_range(struct tracepoint *begin,
         struct tracepoint *end)
  { }
  #endif /* CONFIG_TRACEPOINTS */
+#endif /* DECLARE_TRACE */
  
  /*
   * Connect a probe to a tracepoint.
@@ -154,10 +157,8 @@ static inline void tracepoint_synchronize_unregister(void)
  }
  
  #define PARAMS(args...) args
-#define TRACE_FORMAT(name, proto, args, fmt)           \
-       DECLARE_TRACE(name, PARAMS(proto), PARAMS(args))
-
  
+#ifndef TRACE_EVENT
  /*
   * For use with the TRACE_EVENT macro:
   *
@@ -262,5 +263,6 @@ static inline void tracepoint_synchronize_unregister(void)
  
  #define TRACE_EVENT(name, proto, args, struct, assign, print)  \
         DECLARE_TRACE(name, PARAMS(proto), PARAMS(args))
+#endif
  
  #endif
diff --git a/include/linux/wait.h b/include/linux/wait.h

index bc024632f365915f99894f5d799938f5e4e69b45..6788e1a4d4ca63e8cfc3b288f87d456334064c29 100644 (file)
--- a/include/linux/wait.h
+++ b/include/linux/wait.h
@@ -132,8 +132,6 @@ static inline void __remove_wait_queue(wait_queue_head_t *head,
         list_del(&old->task_list);
  }
  
-void __wake_up_common(wait_queue_head_t *q, unsigned int mode,
-                       int nr_exclusive, int sync, void *key);
  void __wake_up(wait_queue_head_t *q, unsigned int mode, int nr, void *key);
  void __wake_up_locked_key(wait_queue_head_t *q, unsigned int mode, void *key);
  void __wake_up_sync_key(wait_queue_head_t *q, unsigned int mode, int nr,
diff --git a/include/trace/block.h b/include/trace/block.h

deleted file mode 100644 (file)

index 25b7068..0000000
--- a/include/trace/block.h
+++ /dev/null
@@ -1,76 +0,0 @@
-#ifndef _TRACE_BLOCK_H
-#define _TRACE_BLOCK_H
-
-#include <linux/blkdev.h>
-#include <linux/tracepoint.h>
-
-DECLARE_TRACE(block_rq_abort,
-       TP_PROTO(struct request_queue *q, struct request *rq),
-             TP_ARGS(q, rq));
-
-DECLARE_TRACE(block_rq_insert,
-       TP_PROTO(struct request_queue *q, struct request *rq),
-             TP_ARGS(q, rq));
-
-DECLARE_TRACE(block_rq_issue,
-       TP_PROTO(struct request_queue *q, struct request *rq),
-             TP_ARGS(q, rq));
-
-DECLARE_TRACE(block_rq_requeue,
-       TP_PROTO(struct request_queue *q, struct request *rq),
-             TP_ARGS(q, rq));
-
-DECLARE_TRACE(block_rq_complete,
-       TP_PROTO(struct request_queue *q, struct request *rq),
-             TP_ARGS(q, rq));
-
-DECLARE_TRACE(block_bio_bounce,
-       TP_PROTO(struct request_queue *q, struct bio *bio),
-             TP_ARGS(q, bio));
-
-DECLARE_TRACE(block_bio_complete,
-       TP_PROTO(struct request_queue *q, struct bio *bio),
-             TP_ARGS(q, bio));
-
-DECLARE_TRACE(block_bio_backmerge,
-       TP_PROTO(struct request_queue *q, struct bio *bio),
-             TP_ARGS(q, bio));
-
-DECLARE_TRACE(block_bio_frontmerge,
-       TP_PROTO(struct request_queue *q, struct bio *bio),
-             TP_ARGS(q, bio));
-
-DECLARE_TRACE(block_bio_queue,
-       TP_PROTO(struct request_queue *q, struct bio *bio),
-             TP_ARGS(q, bio));
-
-DECLARE_TRACE(block_getrq,
-       TP_PROTO(struct request_queue *q, struct bio *bio, int rw),
-             TP_ARGS(q, bio, rw));
-
-DECLARE_TRACE(block_sleeprq,
-       TP_PROTO(struct request_queue *q, struct bio *bio, int rw),
-             TP_ARGS(q, bio, rw));
-
-DECLARE_TRACE(block_plug,
-       TP_PROTO(struct request_queue *q),
-             TP_ARGS(q));
-
-DECLARE_TRACE(block_unplug_timer,
-       TP_PROTO(struct request_queue *q),
-             TP_ARGS(q));
-
-DECLARE_TRACE(block_unplug_io,
-       TP_PROTO(struct request_queue *q),
-             TP_ARGS(q));
-
-DECLARE_TRACE(block_split,
-       TP_PROTO(struct request_queue *q, struct bio *bio, unsigned int pdu),
-             TP_ARGS(q, bio, pdu));
-
-DECLARE_TRACE(block_remap,
-       TP_PROTO(struct request_queue *q, struct bio *bio, dev_t dev,
-                sector_t from, sector_t to),
-             TP_ARGS(q, bio, dev, from, to));
-
-#endif
diff --git a/include/trace/define_trace.h b/include/trace/define_trace.h

new file mode 100644 (file)

index 0000000..f7a7ae1
--- /dev/null
+++ b/include/trace/define_trace.h
@@ -0,0 +1,75 @@
+/*
+ * Trace files that want to automate creationg of all tracepoints defined
+ * in their file should include this file. The following are macros that the
+ * trace file may define:
+ *
+ * TRACE_SYSTEM defines the system the tracepoint is for
+ *
+ * TRACE_INCLUDE_FILE if the file name is something other than TRACE_SYSTEM.h
+ *     This macro may be defined to tell define_trace.h what file to include.
+ *     Note, leave off the ".h".
+ *
+ * TRACE_INCLUDE_PATH if the path is something other than core kernel include/trace
+ *     then this macro can define the path to use. Note, the path is relative to
+ *     define_trace.h, not the file including it. Full path names for out of tree
+ *     modules must be used.
+ */
+
+#ifdef CREATE_TRACE_POINTS
+
+/* Prevent recursion */
+#undef CREATE_TRACE_POINTS
+
+#include <linux/stringify.h>
+
+#undef TRACE_EVENT
+#define TRACE_EVENT(name, proto, args, tstruct, assign, print) \
+       DEFINE_TRACE(name)
+
+#undef DECLARE_TRACE
+#define DECLARE_TRACE(name, proto, args)       \
+       DEFINE_TRACE(name)
+
+#undef TRACE_INCLUDE
+#undef __TRACE_INCLUDE
+
+#ifndef TRACE_INCLUDE_FILE
+# define TRACE_INCLUDE_FILE TRACE_SYSTEM
+# define UNDEF_TRACE_INCLUDE_FILE
+#endif
+
+#ifndef TRACE_INCLUDE_PATH
+# define __TRACE_INCLUDE(system) <trace/events/system.h>
+# define UNDEF_TRACE_INCLUDE_PATH
+#else
+# define __TRACE_INCLUDE(system) __stringify(TRACE_INCLUDE_PATH/system.h)
+#endif
+
+# define TRACE_INCLUDE(system) __TRACE_INCLUDE(system)
+
+/* Let the trace headers be reread */
+#define TRACE_HEADER_MULTI_READ
+
+#include TRACE_INCLUDE(TRACE_INCLUDE_FILE)
+
+#ifdef CONFIG_EVENT_TRACING
+#include <trace/ftrace.h>
+#endif
+
+#undef TRACE_HEADER_MULTI_READ
+
+/* Only undef what we defined in this file */
+#ifdef UNDEF_TRACE_INCLUDE_FILE
+# undef TRACE_INCLUDE_FILE
+# undef UNDEF_TRACE_INCLUDE_FILE
+#endif
+
+#ifdef UNDEF_TRACE_INCLUDE_PATH
+# undef TRACE_INCLUDE_PATH
+# undef UNDEF_TRACE_INCLUDE_PATH
+#endif
+
+/* We may be processing more files */
+#define CREATE_TRACE_POINTS
+
+#endif /* CREATE_TRACE_POINTS */
diff --git a/include/trace/events/block.h b/include/trace/events/block.h

new file mode 100644 (file)

index 0000000..53effd4
--- /dev/null
+++ b/include/trace/events/block.h
@@ -0,0 +1,498 @@
+#if !defined(_TRACE_BLOCK_H) || defined(TRACE_HEADER_MULTI_READ)
+#define _TRACE_BLOCK_H
+
+#include <linux/blktrace_api.h>
+#include <linux/blkdev.h>
+#include <linux/tracepoint.h>
+
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM block
+
+TRACE_EVENT(block_rq_abort,
+
+       TP_PROTO(struct request_queue *q, struct request *rq),
+
+       TP_ARGS(q, rq),
+
+       TP_STRUCT__entry(
+               __field(  dev_t,        dev                     )
+               __field(  sector_t,     sector                  )
+               __field(  unsigned int, nr_sector               )
+               __field(  int,          errors                  )
+               __array(  char,         rwbs,   6               )
+               __dynamic_array( char,  cmd,    blk_cmd_buf_len(rq)     )
+       ),
+
+       TP_fast_assign(
+               __entry->dev       = rq->rq_disk ? disk_devt(rq->rq_disk) : 0;
+               __entry->sector    = blk_pc_request(rq) ? 0 : rq->hard_sector;
+               __entry->nr_sector = blk_pc_request(rq) ?
+                                               0 : rq->hard_nr_sectors;
+               __entry->errors    = rq->errors;
+
+               blk_fill_rwbs_rq(__entry->rwbs, rq);
+               blk_dump_cmd(__get_str(cmd), rq);
+       ),
+
+       TP_printk("%d,%d %s (%s) %llu + %u [%d]",
+                 MAJOR(__entry->dev), MINOR(__entry->dev),
+                 __entry->rwbs, __get_str(cmd),
+                 (unsigned long long)__entry->sector,
+                 __entry->nr_sector, __entry->errors)
+);
+
+TRACE_EVENT(block_rq_insert,
+
+       TP_PROTO(struct request_queue *q, struct request *rq),
+
+       TP_ARGS(q, rq),
+
+       TP_STRUCT__entry(
+               __field(  dev_t,        dev                     )
+               __field(  sector_t,     sector                  )
+               __field(  unsigned int, nr_sector               )
+               __field(  unsigned int, bytes                   )
+               __array(  char,         rwbs,   6               )
+               __array(  char,         comm,   TASK_COMM_LEN   )
+               __dynamic_array( char,  cmd,    blk_cmd_buf_len(rq)     )
+       ),
+
+       TP_fast_assign(
+               __entry->dev       = rq->rq_disk ? disk_devt(rq->rq_disk) : 0;
+               __entry->sector    = blk_pc_request(rq) ? 0 : rq->hard_sector;
+               __entry->nr_sector = blk_pc_request(rq) ?
+                                               0 : rq->hard_nr_sectors;
+               __entry->bytes     = blk_pc_request(rq) ? rq->data_len : 0;
+
+               blk_fill_rwbs_rq(__entry->rwbs, rq);
+               blk_dump_cmd(__get_str(cmd), rq);
+               memcpy(__entry->comm, current->comm, TASK_COMM_LEN);
+       ),
+
+       TP_printk("%d,%d %s %u (%s) %llu + %u [%s]",
+                 MAJOR(__entry->dev), MINOR(__entry->dev),
+                 __entry->rwbs, __entry->bytes, __get_str(cmd),
+                 (unsigned long long)__entry->sector,
+                 __entry->nr_sector, __entry->comm)
+);
+
+TRACE_EVENT(block_rq_issue,
+
+       TP_PROTO(struct request_queue *q, struct request *rq),
+
+       TP_ARGS(q, rq),
+
+       TP_STRUCT__entry(
+               __field(  dev_t,        dev                     )
+               __field(  sector_t,     sector                  )
+               __field(  unsigned int, nr_sector               )
+               __field(  unsigned int, bytes                   )
+               __array(  char,         rwbs,   6               )
+               __array(  char,         comm,   TASK_COMM_LEN   )
+               __dynamic_array( char,  cmd,    blk_cmd_buf_len(rq)     )
+       ),
+
+       TP_fast_assign(
+               __entry->dev       = rq->rq_disk ? disk_devt(rq->rq_disk) : 0;
+               __entry->sector    = blk_pc_request(rq) ? 0 : rq->hard_sector;
+               __entry->nr_sector = blk_pc_request(rq) ?
+                                               0 : rq->hard_nr_sectors;
+               __entry->bytes     = blk_pc_request(rq) ? rq->data_len : 0;
+
+               blk_fill_rwbs_rq(__entry->rwbs, rq);
+               blk_dump_cmd(__get_str(cmd), rq);
+               memcpy(__entry->comm, current->comm, TASK_COMM_LEN);
+       ),
+
+       TP_printk("%d,%d %s %u (%s) %llu + %u [%s]",
+                 MAJOR(__entry->dev), MINOR(__entry->dev),
+                 __entry->rwbs, __entry->bytes, __get_str(cmd),
+                 (unsigned long long)__entry->sector,
+                 __entry->nr_sector, __entry->comm)
+);
+
+TRACE_EVENT(block_rq_requeue,
+
+       TP_PROTO(struct request_queue *q, struct request *rq),
+
+       TP_ARGS(q, rq),
+
+       TP_STRUCT__entry(
+               __field(  dev_t,        dev                     )
+               __field(  sector_t,     sector                  )
+               __field(  unsigned int, nr_sector               )
+               __field(  int,          errors                  )
+               __array(  char,         rwbs,   6               )
+               __dynamic_array( char,  cmd,    blk_cmd_buf_len(rq)     )
+       ),
+
+       TP_fast_assign(
+               __entry->dev       = rq->rq_disk ? disk_devt(rq->rq_disk) : 0;
+               __entry->sector    = blk_pc_request(rq) ? 0 : rq->hard_sector;
+               __entry->nr_sector = blk_pc_request(rq) ?
+                                               0 : rq->hard_nr_sectors;
+               __entry->errors    = rq->errors;
+
+               blk_fill_rwbs_rq(__entry->rwbs, rq);
+               blk_dump_cmd(__get_str(cmd), rq);
+       ),
+
+       TP_printk("%d,%d %s (%s) %llu + %u [%d]",
+                 MAJOR(__entry->dev), MINOR(__entry->dev),
+                 __entry->rwbs, __get_str(cmd),
+                 (unsigned long long)__entry->sector,
+                 __entry->nr_sector, __entry->errors)
+);
+
+TRACE_EVENT(block_rq_complete,
+
+       TP_PROTO(struct request_queue *q, struct request *rq),
+
+       TP_ARGS(q, rq),
+
+       TP_STRUCT__entry(
+               __field(  dev_t,        dev                     )
+               __field(  sector_t,     sector                  )
+               __field(  unsigned int, nr_sector               )
+               __field(  int,          errors                  )
+               __array(  char,         rwbs,   6               )
+               __dynamic_array( char,  cmd,    blk_cmd_buf_len(rq)     )
+       ),
+
+       TP_fast_assign(
+               __entry->dev       = rq->rq_disk ? disk_devt(rq->rq_disk) : 0;
+               __entry->sector    = blk_pc_request(rq) ? 0 : rq->hard_sector;
+               __entry->nr_sector = blk_pc_request(rq) ?
+                                               0 : rq->hard_nr_sectors;
+               __entry->errors    = rq->errors;
+
+               blk_fill_rwbs_rq(__entry->rwbs, rq);
+               blk_dump_cmd(__get_str(cmd), rq);
+       ),
+
+       TP_printk("%d,%d %s (%s) %llu + %u [%d]",
+                 MAJOR(__entry->dev), MINOR(__entry->dev),
+                 __entry->rwbs, __get_str(cmd),
+                 (unsigned long long)__entry->sector,
+                 __entry->nr_sector, __entry->errors)
+);
+TRACE_EVENT(block_bio_bounce,
+
+       TP_PROTO(struct request_queue *q, struct bio *bio),
+
+       TP_ARGS(q, bio),
+
+       TP_STRUCT__entry(
+               __field( dev_t,         dev                     )
+               __field( sector_t,      sector                  )
+               __field( unsigned int,  nr_sector               )
+               __array( char,          rwbs,   6               )
+               __array( char,          comm,   TASK_COMM_LEN   )
+       ),
+
+       TP_fast_assign(
+               __entry->dev            = bio->bi_bdev->bd_dev;
+               __entry->sector         = bio->bi_sector;
+               __entry->nr_sector      = bio->bi_size >> 9;
+               blk_fill_rwbs(__entry->rwbs, bio->bi_rw, bio->bi_size);
+               memcpy(__entry->comm, current->comm, TASK_COMM_LEN);
+       ),
+
+       TP_printk("%d,%d %s %llu + %u [%s]",
+                 MAJOR(__entry->dev), MINOR(__entry->dev), __entry->rwbs,
+                 (unsigned long long)__entry->sector,
+                 __entry->nr_sector, __entry->comm)
+);
+
+TRACE_EVENT(block_bio_complete,
+
+       TP_PROTO(struct request_queue *q, struct bio *bio),
+
+       TP_ARGS(q, bio),
+
+       TP_STRUCT__entry(
+               __field( dev_t,         dev             )
+               __field( sector_t,      sector          )
+               __field( unsigned,      nr_sector       )
+               __field( int,           error           )
+               __array( char,          rwbs,   6       )
+       ),
+
+       TP_fast_assign(
+               __entry->dev            = bio->bi_bdev->bd_dev;
+               __entry->sector         = bio->bi_sector;
+               __entry->nr_sector      = bio->bi_size >> 9;
+               blk_fill_rwbs(__entry->rwbs, bio->bi_rw, bio->bi_size);
+       ),
+
+       TP_printk("%d,%d %s %llu + %u [%d]",
+                 MAJOR(__entry->dev), MINOR(__entry->dev), __entry->rwbs,
+                 (unsigned long long)__entry->sector,
+                 __entry->nr_sector, __entry->error)
+);
+
+TRACE_EVENT(block_bio_backmerge,
+
+       TP_PROTO(struct request_queue *q, struct bio *bio),
+
+       TP_ARGS(q, bio),
+
+       TP_STRUCT__entry(
+               __field( dev_t,         dev                     )
+               __field( sector_t,      sector                  )
+               __field( unsigned int,  nr_sector               )
+               __array( char,          rwbs,   6               )
+               __array( char,          comm,   TASK_COMM_LEN   )
+       ),
+
+       TP_fast_assign(
+               __entry->dev            = bio->bi_bdev->bd_dev;
+               __entry->sector         = bio->bi_sector;
+               __entry->nr_sector      = bio->bi_size >> 9;
+               blk_fill_rwbs(__entry->rwbs, bio->bi_rw, bio->bi_size);
+               memcpy(__entry->comm, current->comm, TASK_COMM_LEN);
+       ),
+
+       TP_printk("%d,%d %s %llu + %u [%s]",
+                 MAJOR(__entry->dev), MINOR(__entry->dev), __entry->rwbs,
+                 (unsigned long long)__entry->sector,
+                 __entry->nr_sector, __entry->comm)
+);
+
+TRACE_EVENT(block_bio_frontmerge,
+
+       TP_PROTO(struct request_queue *q, struct bio *bio),
+
+       TP_ARGS(q, bio),
+
+       TP_STRUCT__entry(
+               __field( dev_t,         dev                     )
+               __field( sector_t,      sector                  )
+               __field( unsigned,      nr_sector               )
+               __array( char,          rwbs,   6               )
+               __array( char,          comm,   TASK_COMM_LEN   )
+       ),
+
+       TP_fast_assign(
+               __entry->dev            = bio->bi_bdev->bd_dev;
+               __entry->sector         = bio->bi_sector;
+               __entry->nr_sector      = bio->bi_size >> 9;
+               blk_fill_rwbs(__entry->rwbs, bio->bi_rw, bio->bi_size);
+               memcpy(__entry->comm, current->comm, TASK_COMM_LEN);
+       ),
+
+       TP_printk("%d,%d %s %llu + %u [%s]",
+                 MAJOR(__entry->dev), MINOR(__entry->dev), __entry->rwbs,
+                 (unsigned long long)__entry->sector,
+                 __entry->nr_sector, __entry->comm)
+);
+
+TRACE_EVENT(block_bio_queue,
+
+       TP_PROTO(struct request_queue *q, struct bio *bio),
+
+       TP_ARGS(q, bio),
+
+       TP_STRUCT__entry(
+               __field( dev_t,         dev                     )
+               __field( sector_t,      sector                  )
+               __field( unsigned int,  nr_sector               )
+               __array( char,          rwbs,   6               )
+               __array( char,          comm,   TASK_COMM_LEN   )
+       ),
+
+       TP_fast_assign(
+               __entry->dev            = bio->bi_bdev->bd_dev;
+               __entry->sector         = bio->bi_sector;
+               __entry->nr_sector      = bio->bi_size >> 9;
+               blk_fill_rwbs(__entry->rwbs, bio->bi_rw, bio->bi_size);
+               memcpy(__entry->comm, current->comm, TASK_COMM_LEN);
+       ),
+
+       TP_printk("%d,%d %s %llu + %u [%s]",
+                 MAJOR(__entry->dev), MINOR(__entry->dev), __entry->rwbs,
+                 (unsigned long long)__entry->sector,
+                 __entry->nr_sector, __entry->comm)
+);
+
+TRACE_EVENT(block_getrq,
+
+       TP_PROTO(struct request_queue *q, struct bio *bio, int rw),
+
+       TP_ARGS(q, bio, rw),
+
+       TP_STRUCT__entry(
+               __field( dev_t,         dev                     )
+               __field( sector_t,      sector                  )
+               __field( unsigned int,  nr_sector               )
+               __array( char,          rwbs,   6               )
+               __array( char,          comm,   TASK_COMM_LEN   )
+        ),
+
+       TP_fast_assign(
+               __entry->dev            = bio ? bio->bi_bdev->bd_dev : 0;
+               __entry->sector         = bio ? bio->bi_sector : 0;
+               __entry->nr_sector      = bio ? bio->bi_size >> 9 : 0;
+               blk_fill_rwbs(__entry->rwbs,
+                             bio ? bio->bi_rw : 0, __entry->nr_sector);
+               memcpy(__entry->comm, current->comm, TASK_COMM_LEN);
+        ),
+
+       TP_printk("%d,%d %s %llu + %u [%s]",
+                 MAJOR(__entry->dev), MINOR(__entry->dev), __entry->rwbs,
+                 (unsigned long long)__entry->sector,
+                 __entry->nr_sector, __entry->comm)
+);
+
+TRACE_EVENT(block_sleeprq,
+
+       TP_PROTO(struct request_queue *q, struct bio *bio, int rw),
+
+       TP_ARGS(q, bio, rw),
+
+       TP_STRUCT__entry(
+               __field( dev_t,         dev                     )
+               __field( sector_t,      sector                  )
+               __field( unsigned int,  nr_sector               )
+               __array( char,          rwbs,   6               )
+               __array( char,          comm,   TASK_COMM_LEN   )
+       ),
+
+       TP_fast_assign(
+               __entry->dev            = bio ? bio->bi_bdev->bd_dev : 0;
+               __entry->sector         = bio ? bio->bi_sector : 0;
+               __entry->nr_sector      = bio ? bio->bi_size >> 9 : 0;
+               blk_fill_rwbs(__entry->rwbs,
+                           bio ? bio->bi_rw : 0, __entry->nr_sector);
+               memcpy(__entry->comm, current->comm, TASK_COMM_LEN);
+       ),
+
+       TP_printk("%d,%d %s %llu + %u [%s]",
+                 MAJOR(__entry->dev), MINOR(__entry->dev), __entry->rwbs,
+                 (unsigned long long)__entry->sector,
+                 __entry->nr_sector, __entry->comm)
+);
+
+TRACE_EVENT(block_plug,
+
+       TP_PROTO(struct request_queue *q),
+
+       TP_ARGS(q),
+
+       TP_STRUCT__entry(
+               __array( char,          comm,   TASK_COMM_LEN   )
+       ),
+
+       TP_fast_assign(
+               memcpy(__entry->comm, current->comm, TASK_COMM_LEN);
+       ),
+
+       TP_printk("[%s]", __entry->comm)
+);
+
+TRACE_EVENT(block_unplug_timer,
+
+       TP_PROTO(struct request_queue *q),
+
+       TP_ARGS(q),
+
+       TP_STRUCT__entry(
+               __field( int,           nr_rq                   )
+               __array( char,          comm,   TASK_COMM_LEN   )
+       ),
+
+       TP_fast_assign(
+               __entry->nr_rq  = q->rq.count[READ] + q->rq.count[WRITE];
+               memcpy(__entry->comm, current->comm, TASK_COMM_LEN);
+       ),
+
+       TP_printk("[%s] %d", __entry->comm, __entry->nr_rq)
+);
+
+TRACE_EVENT(block_unplug_io,
+
+       TP_PROTO(struct request_queue *q),
+
+       TP_ARGS(q),
+
+       TP_STRUCT__entry(
+               __field( int,           nr_rq                   )
+               __array( char,          comm,   TASK_COMM_LEN   )
+       ),
+
+       TP_fast_assign(
+               __entry->nr_rq  = q->rq.count[READ] + q->rq.count[WRITE];
+               memcpy(__entry->comm, current->comm, TASK_COMM_LEN);
+       ),
+
+       TP_printk("[%s] %d", __entry->comm, __entry->nr_rq)
+);
+
+TRACE_EVENT(block_split,
+
+       TP_PROTO(struct request_queue *q, struct bio *bio,
+                unsigned int new_sector),
+
+       TP_ARGS(q, bio, new_sector),
+
+       TP_STRUCT__entry(
+               __field( dev_t,         dev                             )
+               __field( sector_t,      sector                          )
+               __field( sector_t,      new_sector                      )
+               __array( char,          rwbs,           6               )
+               __array( char,          comm,           TASK_COMM_LEN   )
+       ),
+
+       TP_fast_assign(
+               __entry->dev            = bio->bi_bdev->bd_dev;
+               __entry->sector         = bio->bi_sector;
+               __entry->new_sector     = new_sector;
+               blk_fill_rwbs(__entry->rwbs, bio->bi_rw, bio->bi_size);
+               memcpy(__entry->comm, current->comm, TASK_COMM_LEN);
+       ),
+
+       TP_printk("%d,%d %s %llu / %llu [%s]",
+                 MAJOR(__entry->dev), MINOR(__entry->dev), __entry->rwbs,
+                 (unsigned long long)__entry->sector,
+                 (unsigned long long)__entry->new_sector,
+                 __entry->comm)
+);
+
+TRACE_EVENT(block_remap,
+
+       TP_PROTO(struct request_queue *q, struct bio *bio, dev_t dev,
+                sector_t from),
+
+       TP_ARGS(q, bio, dev, from),
+
+       TP_STRUCT__entry(
+               __field( dev_t,         dev             )
+               __field( sector_t,      sector          )
+               __field( unsigned int,  nr_sector       )
+               __field( dev_t,         old_dev         )
+               __field( sector_t,      old_sector      )
+               __array( char,          rwbs,   6       )
+       ),
+
+       TP_fast_assign(
+               __entry->dev            = bio->bi_bdev->bd_dev;
+               __entry->sector         = bio->bi_sector;
+               __entry->nr_sector      = bio->bi_size >> 9;
+               __entry->old_dev        = dev;
+               __entry->old_sector     = from;
+               blk_fill_rwbs(__entry->rwbs, bio->bi_rw, bio->bi_size);
+       ),
+
+       TP_printk("%d,%d %s %llu + %u <- (%d,%d) %llu",
+                 MAJOR(__entry->dev), MINOR(__entry->dev), __entry->rwbs,
+                 (unsigned long long)__entry->sector,
+                 __entry->nr_sector,
+                 MAJOR(__entry->old_dev), MINOR(__entry->old_dev),
+                 (unsigned long long)__entry->old_sector)
+);
+
+#endif /* _TRACE_BLOCK_H */
+
+/* This part must be outside protection */
+#include <trace/define_trace.h>
+
diff --git a/include/trace/events/irq.h b/include/trace/events/irq.h

new file mode 100644 (file)

index 0000000..b0c7ede
--- /dev/null
+++ b/include/trace/events/irq.h
@@ -0,0 +1,145 @@
+#if !defined(_TRACE_IRQ_H) || defined(TRACE_HEADER_MULTI_READ)
+#define _TRACE_IRQ_H
+
+#include <linux/tracepoint.h>
+#include <linux/interrupt.h>
+
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM irq
+
+#define softirq_name(sirq) { sirq##_SOFTIRQ, #sirq }
+#define show_softirq_name(val)                 \
+       __print_symbolic(val,                   \
+                        softirq_name(HI),      \
+                        softirq_name(TIMER),   \
+                        softirq_name(NET_TX),  \
+                        softirq_name(NET_RX),  \
+                        softirq_name(BLOCK),   \
+                        softirq_name(TASKLET), \
+                        softirq_name(SCHED),   \
+                        softirq_name(HRTIMER), \
+                        softirq_name(RCU))
+
+/**
+ * irq_handler_entry - called immediately before the irq action handler
+ * @irq: irq number
+ * @action: pointer to struct irqaction
+ *
+ * The struct irqaction pointed to by @action contains various
+ * information about the handler, including the device name,
+ * @action->name, and the device id, @action->dev_id. When used in
+ * conjunction with the irq_handler_exit tracepoint, we can figure
+ * out irq handler latencies.
+ */
+TRACE_EVENT(irq_handler_entry,
+
+       TP_PROTO(int irq, struct irqaction *action),
+
+       TP_ARGS(irq, action),
+
+       TP_STRUCT__entry(
+               __field(        int,    irq             )
+               __string(       name,   action->name    )
+       ),
+
+       TP_fast_assign(
+               __entry->irq = irq;
+               __assign_str(name, action->name);
+       ),
+
+       TP_printk("irq=%d handler=%s", __entry->irq, __get_str(name))
+);
+
+/**
+ * irq_handler_exit - called immediately after the irq action handler returns
+ * @irq: irq number
+ * @action: pointer to struct irqaction
+ * @ret: return value
+ *
+ * If the @ret value is set to IRQ_HANDLED, then we know that the corresponding
+ * @action->handler scuccessully handled this irq. Otherwise, the irq might be
+ * a shared irq line, or the irq was not handled successfully. Can be used in
+ * conjunction with the irq_handler_entry to understand irq handler latencies.
+ */
+TRACE_EVENT(irq_handler_exit,
+
+       TP_PROTO(int irq, struct irqaction *action, int ret),
+
+       TP_ARGS(irq, action, ret),
+
+       TP_STRUCT__entry(
+               __field(        int,    irq     )
+               __field(        int,    ret     )
+       ),
+
+       TP_fast_assign(
+               __entry->irq    = irq;
+               __entry->ret    = ret;
+       ),
+
+       TP_printk("irq=%d return=%s",
+                 __entry->irq, __entry->ret ? "handled" : "unhandled")
+);
+
+/**
+ * softirq_entry - called immediately before the softirq handler
+ * @h: pointer to struct softirq_action
+ * @vec: pointer to first struct softirq_action in softirq_vec array
+ *
+ * The @h parameter, contains a pointer to the struct softirq_action
+ * which has a pointer to the action handler that is called. By subtracting
+ * the @vec pointer from the @h pointer, we can determine the softirq
+ * number. Also, when used in combination with the softirq_exit tracepoint
+ * we can determine the softirq latency.
+ */
+TRACE_EVENT(softirq_entry,
+
+       TP_PROTO(struct softirq_action *h, struct softirq_action *vec),
+
+       TP_ARGS(h, vec),
+
+       TP_STRUCT__entry(
+               __field(        int,    vec                     )
+       ),
+
+       TP_fast_assign(
+               __entry->vec = (int)(h - vec);
+       ),
+
+       TP_printk("softirq=%d action=%s", __entry->vec,
+                 show_softirq_name(__entry->vec))
+);
+
+/**
+ * softirq_exit - called immediately after the softirq handler returns
+ * @h: pointer to struct softirq_action
+ * @vec: pointer to first struct softirq_action in softirq_vec array
+ *
+ * The @h parameter contains a pointer to the struct softirq_action
+ * that has handled the softirq. By subtracting the @vec pointer from
+ * the @h pointer, we can determine the softirq number. Also, when used in
+ * combination with the softirq_entry tracepoint we can determine the softirq
+ * latency.
+ */
+TRACE_EVENT(softirq_exit,
+
+       TP_PROTO(struct softirq_action *h, struct softirq_action *vec),
+
+       TP_ARGS(h, vec),
+
+       TP_STRUCT__entry(
+               __field(        int,    vec                     )
+       ),
+
+       TP_fast_assign(
+               __entry->vec = (int)(h - vec);
+       ),
+
+       TP_printk("softirq=%d action=%s", __entry->vec,
+                 show_softirq_name(__entry->vec))
+);
+
+#endif /*  _TRACE_IRQ_H */
+
+/* This part must be outside protection */
+#include <trace/define_trace.h>
diff --git a/include/trace/events/kmem.h b/include/trace/events/kmem.h

new file mode 100644 (file)

index 0000000..9baba50
--- /dev/null
+++ b/include/trace/events/kmem.h
@@ -0,0 +1,231 @@
+#if !defined(_TRACE_KMEM_H) || defined(TRACE_HEADER_MULTI_READ)
+#define _TRACE_KMEM_H
+
+#include <linux/types.h>
+#include <linux/tracepoint.h>
+
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM kmem
+
+/*
+ * The order of these masks is important. Matching masks will be seen
+ * first and the left over flags will end up showing by themselves.
+ *
+ * For example, if we have GFP_KERNEL before GFP_USER we wil get:
+ *
+ *  GFP_KERNEL|GFP_HARDWALL
+ *
+ * Thus most bits set go first.
+ */
+#define show_gfp_flags(flags)                                          \
+       (flags) ? __print_flags(flags, "|",                             \
+       {(unsigned long)GFP_HIGHUSER_MOVABLE,   "GFP_HIGHUSER_MOVABLE"}, \
+       {(unsigned long)GFP_HIGHUSER,           "GFP_HIGHUSER"},        \
+       {(unsigned long)GFP_USER,               "GFP_USER"},            \
+       {(unsigned long)GFP_TEMPORARY,          "GFP_TEMPORARY"},       \
+       {(unsigned long)GFP_KERNEL,             "GFP_KERNEL"},          \
+       {(unsigned long)GFP_NOFS,               "GFP_NOFS"},            \
+       {(unsigned long)GFP_ATOMIC,             "GFP_ATOMIC"},          \
+       {(unsigned long)GFP_NOIO,               "GFP_NOIO"},            \
+       {(unsigned long)__GFP_HIGH,             "GFP_HIGH"},            \
+       {(unsigned long)__GFP_WAIT,             "GFP_WAIT"},            \
+       {(unsigned long)__GFP_IO,               "GFP_IO"},              \
+       {(unsigned long)__GFP_COLD,             "GFP_COLD"},            \
+       {(unsigned long)__GFP_NOWARN,           "GFP_NOWARN"},          \
+       {(unsigned long)__GFP_REPEAT,           "GFP_REPEAT"},          \
+       {(unsigned long)__GFP_NOFAIL,           "GFP_NOFAIL"},          \
+       {(unsigned long)__GFP_NORETRY,          "GFP_NORETRY"},         \
+       {(unsigned long)__GFP_COMP,             "GFP_COMP"},            \
+       {(unsigned long)__GFP_ZERO,             "GFP_ZERO"},            \
+       {(unsigned long)__GFP_NOMEMALLOC,       "GFP_NOMEMALLOC"},      \
+       {(unsigned long)__GFP_HARDWALL,         "GFP_HARDWALL"},        \
+       {(unsigned long)__GFP_THISNODE,         "GFP_THISNODE"},        \
+       {(unsigned long)__GFP_RECLAIMABLE,      "GFP_RECLAIMABLE"},     \
+       {(unsigned long)__GFP_MOVABLE,          "GFP_MOVABLE"}          \
+       ) : "GFP_NOWAIT"
+
+TRACE_EVENT(kmalloc,
+
+       TP_PROTO(unsigned long call_site,
+                const void *ptr,
+                size_t bytes_req,
+                size_t bytes_alloc,
+                gfp_t gfp_flags),
+
+       TP_ARGS(call_site, ptr, bytes_req, bytes_alloc, gfp_flags),
+
+       TP_STRUCT__entry(
+               __field(        unsigned long,  call_site       )
+               __field(        const void *,   ptr             )
+               __field(        size_t,         bytes_req       )
+               __field(        size_t,         bytes_alloc     )
+               __field(        gfp_t,          gfp_flags       )
+       ),
+
+       TP_fast_assign(
+               __entry->call_site      = call_site;
+               __entry->ptr            = ptr;
+               __entry->bytes_req      = bytes_req;
+               __entry->bytes_alloc    = bytes_alloc;
+               __entry->gfp_flags      = gfp_flags;
+       ),
+
+       TP_printk("call_site=%lx ptr=%p bytes_req=%zu bytes_alloc=%zu gfp_flags=%s",
+               __entry->call_site,
+               __entry->ptr,
+               __entry->bytes_req,
+               __entry->bytes_alloc,
+               show_gfp_flags(__entry->gfp_flags))
+);
+
+TRACE_EVENT(kmem_cache_alloc,
+
+       TP_PROTO(unsigned long call_site,
+                const void *ptr,
+                size_t bytes_req,
+                size_t bytes_alloc,
+                gfp_t gfp_flags),
+
+       TP_ARGS(call_site, ptr, bytes_req, bytes_alloc, gfp_flags),
+
+       TP_STRUCT__entry(
+               __field(        unsigned long,  call_site       )
+               __field(        const void *,   ptr             )
+               __field(        size_t,         bytes_req       )
+               __field(        size_t,         bytes_alloc     )
+               __field(        gfp_t,          gfp_flags       )
+       ),
+
+       TP_fast_assign(
+               __entry->call_site      = call_site;
+               __entry->ptr            = ptr;
+               __entry->bytes_req      = bytes_req;
+               __entry->bytes_alloc    = bytes_alloc;
+               __entry->gfp_flags      = gfp_flags;
+       ),
+
+       TP_printk("call_site=%lx ptr=%p bytes_req=%zu bytes_alloc=%zu gfp_flags=%s",
+               __entry->call_site,
+               __entry->ptr,
+               __entry->bytes_req,
+               __entry->bytes_alloc,
+               show_gfp_flags(__entry->gfp_flags))
+);
+
+TRACE_EVENT(kmalloc_node,
+
+       TP_PROTO(unsigned long call_site,
+                const void *ptr,
+                size_t bytes_req,
+                size_t bytes_alloc,
+                gfp_t gfp_flags,
+                int node),
+
+       TP_ARGS(call_site, ptr, bytes_req, bytes_alloc, gfp_flags, node),
+
+       TP_STRUCT__entry(
+               __field(        unsigned long,  call_site       )
+               __field(        const void *,   ptr             )
+               __field(        size_t,         bytes_req       )
+               __field(        size_t,         bytes_alloc     )
+               __field(        gfp_t,          gfp_flags       )
+               __field(        int,            node            )
+       ),
+
+       TP_fast_assign(
+               __entry->call_site      = call_site;
+               __entry->ptr            = ptr;
+               __entry->bytes_req      = bytes_req;
+               __entry->bytes_alloc    = bytes_alloc;
+               __entry->gfp_flags      = gfp_flags;
+               __entry->node           = node;
+       ),
+
+       TP_printk("call_site=%lx ptr=%p bytes_req=%zu bytes_alloc=%zu gfp_flags=%s node=%d",
+               __entry->call_site,
+               __entry->ptr,
+               __entry->bytes_req,
+               __entry->bytes_alloc,
+               show_gfp_flags(__entry->gfp_flags),
+               __entry->node)
+);
+
+TRACE_EVENT(kmem_cache_alloc_node,
+
+       TP_PROTO(unsigned long call_site,
+                const void *ptr,
+                size_t bytes_req,
+                size_t bytes_alloc,
+                gfp_t gfp_flags,
+                int node),
+
+       TP_ARGS(call_site, ptr, bytes_req, bytes_alloc, gfp_flags, node),
+
+       TP_STRUCT__entry(
+               __field(        unsigned long,  call_site       )
+               __field(        const void *,   ptr             )
+               __field(        size_t,         bytes_req       )
+               __field(        size_t,         bytes_alloc     )
+               __field(        gfp_t,          gfp_flags       )
+               __field(        int,            node            )
+       ),
+
+       TP_fast_assign(
+               __entry->call_site      = call_site;
+               __entry->ptr            = ptr;
+               __entry->bytes_req      = bytes_req;
+               __entry->bytes_alloc    = bytes_alloc;
+               __entry->gfp_flags      = gfp_flags;
+               __entry->node           = node;
+       ),
+
+       TP_printk("call_site=%lx ptr=%p bytes_req=%zu bytes_alloc=%zu gfp_flags=%s node=%d",
+               __entry->call_site,
+               __entry->ptr,
+               __entry->bytes_req,
+               __entry->bytes_alloc,
+               show_gfp_flags(__entry->gfp_flags),
+               __entry->node)
+);
+
+TRACE_EVENT(kfree,
+
+       TP_PROTO(unsigned long call_site, const void *ptr),
+
+       TP_ARGS(call_site, ptr),
+
+       TP_STRUCT__entry(
+               __field(        unsigned long,  call_site       )
+               __field(        const void *,   ptr             )
+       ),
+
+       TP_fast_assign(
+               __entry->call_site      = call_site;
+               __entry->ptr            = ptr;
+       ),
+
+       TP_printk("call_site=%lx ptr=%p", __entry->call_site, __entry->ptr)
+);
+
+TRACE_EVENT(kmem_cache_free,
+
+       TP_PROTO(unsigned long call_site, const void *ptr),
+
+       TP_ARGS(call_site, ptr),
+
+       TP_STRUCT__entry(
+               __field(        unsigned long,  call_site       )
+               __field(        const void *,   ptr             )
+       ),
+
+       TP_fast_assign(
+               __entry->call_site      = call_site;
+               __entry->ptr            = ptr;
+       ),
+
+       TP_printk("call_site=%lx ptr=%p", __entry->call_site, __entry->ptr)
+);
+#endif /* _TRACE_KMEM_H */
+
+/* This part must be outside protection */
+#include <trace/define_trace.h>
diff --git a/include/trace/events/lockdep.h b/include/trace/events/lockdep.h

new file mode 100644 (file)

index 0000000..0e956c9
--- /dev/null
+++ b/include/trace/events/lockdep.h
@@ -0,0 +1,96 @@
+#if !defined(_TRACE_LOCKDEP_H) || defined(TRACE_HEADER_MULTI_READ)
+#define _TRACE_LOCKDEP_H
+
+#include <linux/lockdep.h>
+#include <linux/tracepoint.h>
+
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM lockdep
+
+#ifdef CONFIG_LOCKDEP
+
+TRACE_EVENT(lock_acquire,
+
+       TP_PROTO(struct lockdep_map *lock, unsigned int subclass,
+               int trylock, int read, int check,
+               struct lockdep_map *next_lock, unsigned long ip),
+
+       TP_ARGS(lock, subclass, trylock, read, check, next_lock, ip),
+
+       TP_STRUCT__entry(
+               __field(unsigned int, flags)
+               __string(name, lock->name)
+       ),
+
+       TP_fast_assign(
+               __entry->flags = (trylock ? 1 : 0) | (read ? 2 : 0);
+               __assign_str(name, lock->name);
+       ),
+
+       TP_printk("%s%s%s", (__entry->flags & 1) ? "try " : "",
+                 (__entry->flags & 2) ? "read " : "",
+                 __get_str(name))
+);
+
+TRACE_EVENT(lock_release,
+
+       TP_PROTO(struct lockdep_map *lock, int nested, unsigned long ip),
+
+       TP_ARGS(lock, nested, ip),
+
+       TP_STRUCT__entry(
+               __string(name, lock->name)
+       ),
+
+       TP_fast_assign(
+               __assign_str(name, lock->name);
+       ),
+
+       TP_printk("%s", __get_str(name))
+);
+
+#ifdef CONFIG_LOCK_STAT
+
+TRACE_EVENT(lock_contended,
+
+       TP_PROTO(struct lockdep_map *lock, unsigned long ip),
+
+       TP_ARGS(lock, ip),
+
+       TP_STRUCT__entry(
+               __string(name, lock->name)
+       ),
+
+       TP_fast_assign(
+               __assign_str(name, lock->name);
+       ),
+
+       TP_printk("%s", __get_str(name))
+);
+
+TRACE_EVENT(lock_acquired,
+       TP_PROTO(struct lockdep_map *lock, unsigned long ip, s64 waittime),
+
+       TP_ARGS(lock, ip, waittime),
+
+       TP_STRUCT__entry(
+               __string(name, lock->name)
+               __field(unsigned long, wait_usec)
+               __field(unsigned long, wait_nsec_rem)
+       ),
+       TP_fast_assign(
+               __assign_str(name, lock->name);
+               __entry->wait_nsec_rem = do_div(waittime, NSEC_PER_USEC);
+               __entry->wait_usec = (unsigned long) waittime;
+       ),
+       TP_printk("%s (%lu.%03lu us)", __get_str(name), __entry->wait_usec,
+                                      __entry->wait_nsec_rem)
+);
+
+#endif
+#endif
+
+#endif /* _TRACE_LOCKDEP_H */
+
+/* This part must be outside protection */
+#include <trace/define_trace.h>
diff --git a/include/trace/sched_event_types.h b/include/trace/events/sched.h

similarity index 91%

rename from include/trace/sched_event_types.h

rename to include/trace/events/sched.h

index 63547dc1125f7dc59bcc38cf55b92a253e178353..24ab5bcff7b2bfaebfe7ed4d544f286c43c416a3 100644 (file)
--- a/include/trace/sched_event_types.h
+++ b/include/trace/events/sched.h
@@ -1,9 +1,8 @@
+#if !defined(_TRACE_SCHED_H) || defined(TRACE_HEADER_MULTI_READ)
+#define _TRACE_SCHED_H
  
-/* use <trace/sched.h> instead */
-#ifndef TRACE_EVENT
-# error Do not include this file directly.
-# error Unless you know what you are doing.
-#endif
+#include <linux/sched.h>
+#include <linux/tracepoint.h>
  
  #undef TRACE_SYSTEM
  #define TRACE_SYSTEM sched
@@ -157,6 +156,7 @@ TRACE_EVENT(sched_switch,
                 __array(        char,   prev_comm,      TASK_COMM_LEN   )
                 __field(        pid_t,  prev_pid                        )
                 __field(        int,    prev_prio                       )
+               __field(        long,   prev_state                      )
                 __array(        char,   next_comm,      TASK_COMM_LEN   )
                 __field(        pid_t,  next_pid                        )
                 __field(        int,    next_prio                       )
@@ -166,13 +166,19 @@ TRACE_EVENT(sched_switch,
                 memcpy(__entry->next_comm, next->comm, TASK_COMM_LEN);
                 __entry->prev_pid       = prev->pid;
                 __entry->prev_prio      = prev->prio;
+               __entry->prev_state     = prev->state;
                 memcpy(__entry->prev_comm, prev->comm, TASK_COMM_LEN);
                 __entry->next_pid       = next->pid;
                 __entry->next_prio      = next->prio;
         ),
  
-       TP_printk("task %s:%d [%d] ==> %s:%d [%d]",
+       TP_printk("task %s:%d [%d] (%s) ==> %s:%d [%d]",
                 __entry->prev_comm, __entry->prev_pid, __entry->prev_prio,
+               __entry->prev_state ?
+                 __print_flags(__entry->prev_state, "|",
+                               { 1, "S"} , { 2, "D" }, { 4, "T" }, { 8, "t" },
+                               { 16, "Z" }, { 32, "X" }, { 64, "x" },
+                               { 128, "W" }) : "R",
                 __entry->next_comm, __entry->next_pid, __entry->next_prio)
  );
  
@@ -181,9 +187,9 @@ TRACE_EVENT(sched_switch,
   */
  TRACE_EVENT(sched_migrate_task,
  
-       TP_PROTO(struct task_struct *p, int orig_cpu, int dest_cpu),
+       TP_PROTO(struct task_struct *p, int dest_cpu),
  
-       TP_ARGS(p, orig_cpu, dest_cpu),
+       TP_ARGS(p, dest_cpu),
  
         TP_STRUCT__entry(
                 __array(        char,   comm,   TASK_COMM_LEN   )
@@ -197,7 +203,7 @@ TRACE_EVENT(sched_migrate_task,
                 memcpy(__entry->comm, p->comm, TASK_COMM_LEN);
                 __entry->pid            = p->pid;
                 __entry->prio           = p->prio;
-               __entry->orig_cpu       = orig_cpu;
+               __entry->orig_cpu       = task_cpu(p);
                 __entry->dest_cpu       = dest_cpu;
         ),
  
@@ -334,4 +340,7 @@ TRACE_EVENT(sched_signal_send,
                   __entry->sig, __entry->comm, __entry->pid)
  );
  
-#undef TRACE_SYSTEM
+#endif /* _TRACE_SCHED_H */
+
+/* This part must be outside protection */
+#include <trace/define_trace.h>
diff --git a/include/trace/events/skb.h b/include/trace/events/skb.h

new file mode 100644 (file)

index 0000000..1e8fabb
--- /dev/null
+++ b/include/trace/events/skb.h
@@ -0,0 +1,40 @@
+#if !defined(_TRACE_SKB_H) || defined(TRACE_HEADER_MULTI_READ)
+#define _TRACE_SKB_H
+
+#include <linux/skbuff.h>
+#include <linux/tracepoint.h>
+
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM skb
+
+/*
+ * Tracepoint for free an sk_buff:
+ */
+TRACE_EVENT(kfree_skb,
+
+       TP_PROTO(struct sk_buff *skb, void *location),
+
+       TP_ARGS(skb, location),
+
+       TP_STRUCT__entry(
+               __field(        void *,         skbaddr         )
+               __field(        unsigned short, protocol        )
+               __field(        void *,         location        )
+       ),
+
+       TP_fast_assign(
+               __entry->skbaddr = skb;
+               if (skb) {
+                       __entry->protocol = ntohs(skb->protocol);
+               }
+               __entry->location = location;
+       ),
+
+       TP_printk("skbaddr=%p protocol=%u location=%p",
+               __entry->skbaddr, __entry->protocol, __entry->location)
+);
+
+#endif /* _TRACE_SKB_H */
+
+/* This part must be outside protection */
+#include <trace/define_trace.h>
diff --git a/include/trace/events/workqueue.h b/include/trace/events/workqueue.h

new file mode 100644 (file)

index 0000000..035f1bf
--- /dev/null
+++ b/include/trace/events/workqueue.h
@@ -0,0 +1,100 @@
+#if !defined(_TRACE_WORKQUEUE_H) || defined(TRACE_HEADER_MULTI_READ)
+#define _TRACE_WORKQUEUE_H
+
+#include <linux/workqueue.h>
+#include <linux/sched.h>
+#include <linux/tracepoint.h>
+
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM workqueue
+
+TRACE_EVENT(workqueue_insertion,
+
+       TP_PROTO(struct task_struct *wq_thread, struct work_struct *work),
+
+       TP_ARGS(wq_thread, work),
+
+       TP_STRUCT__entry(
+               __array(char,           thread_comm,    TASK_COMM_LEN)
+               __field(pid_t,          thread_pid)
+               __field(work_func_t,    func)
+       ),
+
+       TP_fast_assign(
+               memcpy(__entry->thread_comm, wq_thread->comm, TASK_COMM_LEN);
+               __entry->thread_pid     = wq_thread->pid;
+               __entry->func           = work->func;
+       ),
+
+       TP_printk("thread=%s:%d func=%pF", __entry->thread_comm,
+               __entry->thread_pid, __entry->func)
+);
+
+TRACE_EVENT(workqueue_execution,
+
+       TP_PROTO(struct task_struct *wq_thread, struct work_struct *work),
+
+       TP_ARGS(wq_thread, work),
+
+       TP_STRUCT__entry(
+               __array(char,           thread_comm,    TASK_COMM_LEN)
+               __field(pid_t,          thread_pid)
+               __field(work_func_t,    func)
+       ),
+
+       TP_fast_assign(
+               memcpy(__entry->thread_comm, wq_thread->comm, TASK_COMM_LEN);
+               __entry->thread_pid     = wq_thread->pid;
+               __entry->func           = work->func;
+       ),
+
+       TP_printk("thread=%s:%d func=%pF", __entry->thread_comm,
+               __entry->thread_pid, __entry->func)
+);
+
+/* Trace the creation of one workqueue thread on a cpu */
+TRACE_EVENT(workqueue_creation,
+
+       TP_PROTO(struct task_struct *wq_thread, int cpu),
+
+       TP_ARGS(wq_thread, cpu),
+
+       TP_STRUCT__entry(
+               __array(char,   thread_comm,    TASK_COMM_LEN)
+               __field(pid_t,  thread_pid)
+               __field(int,    cpu)
+       ),
+
+       TP_fast_assign(
+               memcpy(__entry->thread_comm, wq_thread->comm, TASK_COMM_LEN);
+               __entry->thread_pid     = wq_thread->pid;
+               __entry->cpu            = cpu;
+       ),
+
+       TP_printk("thread=%s:%d cpu=%d", __entry->thread_comm,
+               __entry->thread_pid, __entry->cpu)
+);
+
+TRACE_EVENT(workqueue_destruction,
+
+       TP_PROTO(struct task_struct *wq_thread),
+
+       TP_ARGS(wq_thread),
+
+       TP_STRUCT__entry(
+               __array(char,   thread_comm,    TASK_COMM_LEN)
+               __field(pid_t,  thread_pid)
+       ),
+
+       TP_fast_assign(
+               memcpy(__entry->thread_comm, wq_thread->comm, TASK_COMM_LEN);
+               __entry->thread_pid     = wq_thread->pid;
+       ),
+
+       TP_printk("thread=%s:%d", __entry->thread_comm, __entry->thread_pid)
+);
+
+#endif /* _TRACE_WORKQUEUE_H */
+
+/* This part must be outside protection */
+#include <trace/define_trace.h>
diff --git a/include/trace/ftrace.h b/include/trace/ftrace.h

new file mode 100644 (file)

index 0000000..1867553
--- /dev/null
+++ b/include/trace/ftrace.h
@@ -0,0 +1,591 @@
+/*
+ * Stage 1 of the trace events.
+ *
+ * Override the macros in <trace/trace_events.h> to include the following:
+ *
+ * struct ftrace_raw_<call> {
+ *     struct trace_entry              ent;
+ *     <type>                          <item>;
+ *     <type2>                         <item2>[<len>];
+ *     [...]
+ * };
+ *
+ * The <type> <item> is created by the __field(type, item) macro or
+ * the __array(type2, item2, len) macro.
+ * We simply do "type item;", and that will create the fields
+ * in the structure.
+ */
+
+#include <linux/ftrace_event.h>
+
+#undef __field
+#define __field(type, item)            type    item;
+
+#undef __array
+#define __array(type, item, len)       type    item[len];
+
+#undef __dynamic_array
+#define __dynamic_array(type, item, len) unsigned short __data_loc_##item;
+
+#undef __string
+#define __string(item, src) __dynamic_array(char, item, -1)
+
+#undef TP_STRUCT__entry
+#define TP_STRUCT__entry(args...) args
+
+#undef TRACE_EVENT
+#define TRACE_EVENT(name, proto, args, tstruct, assign, print) \
+       struct ftrace_raw_##name {                              \
+               struct trace_entry      ent;                    \
+               tstruct                                         \
+               char                    __data[0];              \
+       };                                                      \
+       static struct ftrace_event_call event_##name
+
+#include TRACE_INCLUDE(TRACE_INCLUDE_FILE)
+
+
+/*
+ * Stage 2 of the trace events.
+ *
+ * Include the following:
+ *
+ * struct ftrace_data_offsets_<call> {
+ *     int                             <item1>;
+ *     int                             <item2>;
+ *     [...]
+ * };
+ *
+ * The __dynamic_array() macro will create each int <item>, this is
+ * to keep the offset of each array from the beginning of the event.
+ */
+
+#undef __field
+#define __field(type, item);
+
+#undef __array
+#define __array(type, item, len)
+
+#undef __dynamic_array
+#define __dynamic_array(type, item, len)       int item;
+
+#undef __string
+#define __string(item, src) __dynamic_array(char, item, -1)
+
+#undef TRACE_EVENT
+#define TRACE_EVENT(call, proto, args, tstruct, assign, print)         \
+       struct ftrace_data_offsets_##call {                             \
+               tstruct;                                                \
+       };
+
+#include TRACE_INCLUDE(TRACE_INCLUDE_FILE)
+
+/*
+ * Setup the showing format of trace point.
+ *
+ * int
+ * ftrace_format_##call(struct trace_seq *s)
+ * {
+ *     struct ftrace_raw_##call field;
+ *     int ret;
+ *
+ *     ret = trace_seq_printf(s, #type " " #item ";"
+ *                            " offset:%u; size:%u;\n",
+ *                            offsetof(struct ftrace_raw_##call, item),
+ *                            sizeof(field.type));
+ *
+ * }
+ */
+
+#undef TP_STRUCT__entry
+#define TP_STRUCT__entry(args...) args
+
+#undef __field
+#define __field(type, item)                                    \
+       ret = trace_seq_printf(s, "\tfield:" #type " " #item ";\t"      \
+                              "offset:%u;\tsize:%u;\n",                \
+                              (unsigned int)offsetof(typeof(field), item), \
+                              (unsigned int)sizeof(field.item));       \
+       if (!ret)                                                       \
+               return 0;
+
+#undef __array
+#define __array(type, item, len)                                               \
+       ret = trace_seq_printf(s, "\tfield:" #type " " #item "[" #len "];\t"    \
+                              "offset:%u;\tsize:%u;\n",                \
+                              (unsigned int)offsetof(typeof(field), item), \
+                              (unsigned int)sizeof(field.item));       \
+       if (!ret)                                                       \
+               return 0;
+
+#undef __dynamic_array
+#define __dynamic_array(type, item, len)                                      \
+       ret = trace_seq_printf(s, "\tfield:__data_loc " #item ";\t"            \
+                              "offset:%u;\tsize:%u;\n",                       \
+                              (unsigned int)offsetof(typeof(field),           \
+                                       __data_loc_##item),                    \
+                              (unsigned int)sizeof(field.__data_loc_##item)); \
+       if (!ret)                                                              \
+               return 0;
+
+#undef __string
+#define __string(item, src) __dynamic_array(char, item, -1)
+
+#undef __entry
+#define __entry REC
+
+#undef __print_symbolic
+#undef __get_dynamic_array
+#undef __get_str
+
+#undef TP_printk
+#define TP_printk(fmt, args...) "%s, %s\n", #fmt, __stringify(args)
+
+#undef TP_fast_assign
+#define TP_fast_assign(args...) args
+
+#undef TRACE_EVENT
+#define TRACE_EVENT(call, proto, args, tstruct, func, print)           \
+static int                                                             \
+ftrace_format_##call(struct trace_seq *s)                              \
+{                                                                      \
+       struct ftrace_raw_##call field __attribute__((unused));         \
+       int ret = 0;                                                    \
+                                                                       \
+       tstruct;                                                        \
+                                                                       \
+       trace_seq_printf(s, "\nprint fmt: " print);                     \
+                                                                       \
+       return ret;                                                     \
+}
+
+#include TRACE_INCLUDE(TRACE_INCLUDE_FILE)
+
+/*
+ * Stage 3 of the trace events.
+ *
+ * Override the macros in <trace/trace_events.h> to include the following:
+ *
+ * enum print_line_t
+ * ftrace_raw_output_<call>(struct trace_iterator *iter, int flags)
+ * {
+ *     struct trace_seq *s = &iter->seq;
+ *     struct ftrace_raw_<call> *field; <-- defined in stage 1
+ *     struct trace_entry *entry;
+ *     struct trace_seq *p;
+ *     int ret;
+ *
+ *     entry = iter->ent;
+ *
+ *     if (entry->type != event_<call>.id) {
+ *             WARN_ON_ONCE(1);
+ *             return TRACE_TYPE_UNHANDLED;
+ *     }
+ *
+ *     field = (typeof(field))entry;
+ *
+ *     p = get_cpu_var(ftrace_event_seq);
+ *     trace_seq_init(p);
+ *     ret = trace_seq_printf(s, <TP_printk> "\n");
+ *     put_cpu();
+ *     if (!ret)
+ *             return TRACE_TYPE_PARTIAL_LINE;
+ *
+ *     return TRACE_TYPE_HANDLED;
+ * }
+ *
+ * This is the method used to print the raw event to the trace
+ * output format. Note, this is not needed if the data is read
+ * in binary.
+ */
+
+#undef __entry
+#define __entry field
+
+#undef TP_printk
+#define TP_printk(fmt, args...) fmt "\n", args
+
+#undef __get_dynamic_array
+#define __get_dynamic_array(field)     \
+               ((void *)__entry + __entry->__data_loc_##field)
+
+#undef __get_str
+#define __get_str(field) (char *)__get_dynamic_array(field)
+
+#undef __print_flags
+#define __print_flags(flag, delim, flag_array...)                      \
+       ({                                                              \
+               static const struct trace_print_flags flags[] =         \
+                       { flag_array, { -1, NULL }};                    \
+               ftrace_print_flags_seq(p, delim, flag, flags);          \
+       })
+
+#undef __print_symbolic
+#define __print_symbolic(value, symbol_array...)                       \
+       ({                                                              \
+               static const struct trace_print_flags symbols[] =       \
+                       { symbol_array, { -1, NULL }};                  \
+               ftrace_print_symbols_seq(p, value, symbols);            \
+       })
+
+#undef TRACE_EVENT
+#define TRACE_EVENT(call, proto, args, tstruct, assign, print)         \
+enum print_line_t                                                      \
+ftrace_raw_output_##call(struct trace_iterator *iter, int flags)       \
+{                                                                      \
+       struct trace_seq *s = &iter->seq;                               \
+       struct ftrace_raw_##call *field;                                \
+       struct trace_entry *entry;                                      \
+       struct trace_seq *p;                                            \
+       int ret;                                                        \
+                                                                       \
+       entry = iter->ent;                                              \
+                                                                       \
+       if (entry->type != event_##call.id) {                           \
+               WARN_ON_ONCE(1);                                        \
+               return TRACE_TYPE_UNHANDLED;                            \
+       }                                                               \
+                                                                       \
+       field = (typeof(field))entry;                                   \
+                                                                       \
+       p = &get_cpu_var(ftrace_event_seq);                             \
+       trace_seq_init(p);                                              \
+       ret = trace_seq_printf(s, #call ": " print);                    \
+       put_cpu();                                                      \
+       if (!ret)                                                       \
+               return TRACE_TYPE_PARTIAL_LINE;                         \
+                                                                       \
+       return TRACE_TYPE_HANDLED;                                      \
+}
+       
+#include TRACE_INCLUDE(TRACE_INCLUDE_FILE)
+
+#undef __field
+#define __field(type, item)                                            \
+       ret = trace_define_field(event_call, #type, #item,              \
+                                offsetof(typeof(field), item),         \
+                                sizeof(field.item), is_signed_type(type));     \
+       if (ret)                                                        \
+               return ret;
+
+#undef __array
+#define __array(type, item, len)                                       \
+       BUILD_BUG_ON(len > MAX_FILTER_STR_VAL);                         \
+       ret = trace_define_field(event_call, #type "[" #len "]", #item, \
+                                offsetof(typeof(field), item),         \
+                                sizeof(field.item), 0);                \
+       if (ret)                                                        \
+               return ret;
+
+#undef __dynamic_array
+#define __dynamic_array(type, item, len)                                      \
+       ret = trace_define_field(event_call, "__data_loc" "[" #type "]", #item,\
+                               offsetof(typeof(field), __data_loc_##item),    \
+                                sizeof(field.__data_loc_##item), 0);
+
+#undef __string
+#define __string(item, src) __dynamic_array(char, item, -1)
+
+#undef TRACE_EVENT
+#define TRACE_EVENT(call, proto, args, tstruct, func, print)           \
+int                                                                    \
+ftrace_define_fields_##call(void)                                      \
+{                                                                      \
+       struct ftrace_raw_##call field;                                 \
+       struct ftrace_event_call *event_call = &event_##call;           \
+       int ret;                                                        \
+                                                                       \
+       __common_field(int, type, 1);                                   \
+       __common_field(unsigned char, flags, 0);                        \
+       __common_field(unsigned char, preempt_count, 0);                \
+       __common_field(int, pid, 1);                                    \
+       __common_field(int, tgid, 1);                                   \
+                                                                       \
+       tstruct;                                                        \
+                                                                       \
+       return ret;                                                     \
+}
+
+#include TRACE_INCLUDE(TRACE_INCLUDE_FILE)
+
+/*
+ * remember the offset of each array from the beginning of the event.
+ */
+
+#undef __entry
+#define __entry entry
+
+#undef __field
+#define __field(type, item)
+
+#undef __array
+#define __array(type, item, len)
+
+#undef __dynamic_array
+#define __dynamic_array(type, item, len)                               \
+       __data_offsets->item = __data_size +                            \
+                              offsetof(typeof(*entry), __data);        \
+       __data_size += (len) * sizeof(type);
+
+#undef __string
+#define __string(item, src) __dynamic_array(char, item, strlen(src) + 1)       \
+
+#undef TRACE_EVENT
+#define TRACE_EVENT(call, proto, args, tstruct, assign, print)         \
+static inline int ftrace_get_offsets_##call(                           \
+       struct ftrace_data_offsets_##call *__data_offsets, proto)       \
+{                                                                      \
+       int __data_size = 0;                                            \
+       struct ftrace_raw_##call __maybe_unused *entry;                 \
+                                                                       \
+       tstruct;                                                        \
+                                                                       \
+       return __data_size;                                             \
+}
+
+#include TRACE_INCLUDE(TRACE_INCLUDE_FILE)
+
+/*
+ * Stage 4 of the trace events.
+ *
+ * Override the macros in <trace/trace_events.h> to include the following:
+ *
+ * static void ftrace_event_<call>(proto)
+ * {
+ *     event_trace_printk(_RET_IP_, "<call>: " <fmt>);
+ * }
+ *
+ * static int ftrace_reg_event_<call>(void)
+ * {
+ *     int ret;
+ *
+ *     ret = register_trace_<call>(ftrace_event_<call>);
+ *     if (!ret)
+ *             pr_info("event trace: Could not activate trace point "
+ *                     "probe to  <call>");
+ *     return ret;
+ * }
+ *
+ * static void ftrace_unreg_event_<call>(void)
+ * {
+ *     unregister_trace_<call>(ftrace_event_<call>);
+ * }
+ *
+ *
+ * For those macros defined with TRACE_EVENT:
+ *
+ * static struct ftrace_event_call event_<call>;
+ *
+ * static void ftrace_raw_event_<call>(proto)
+ * {
+ *     struct ring_buffer_event *event;
+ *     struct ftrace_raw_<call> *entry; <-- defined in stage 1
+ *     unsigned long irq_flags;
+ *     int pc;
+ *
+ *     local_save_flags(irq_flags);
+ *     pc = preempt_count();
+ *
+ *     event = trace_current_buffer_lock_reserve(event_<call>.id,
+ *                               sizeof(struct ftrace_raw_<call>),
+ *                               irq_flags, pc);
+ *     if (!event)
+ *             return;
+ *     entry   = ring_buffer_event_data(event);
+ *
+ *     <assign>;  <-- Here we assign the entries by the __field and
+ *                     __array macros.
+ *
+ *     trace_current_buffer_unlock_commit(event, irq_flags, pc);
+ * }
+ *
+ * static int ftrace_raw_reg_event_<call>(void)
+ * {
+ *     int ret;
+ *
+ *     ret = register_trace_<call>(ftrace_raw_event_<call>);
+ *     if (!ret)
+ *             pr_info("event trace: Could not activate trace point "
+ *                     "probe to <call>");
+ *     return ret;
+ * }
+ *
+ * static void ftrace_unreg_event_<call>(void)
+ * {
+ *     unregister_trace_<call>(ftrace_raw_event_<call>);
+ * }
+ *
+ * static struct trace_event ftrace_event_type_<call> = {
+ *     .trace                  = ftrace_raw_output_<call>, <-- stage 2
+ * };
+ *
+ * static int ftrace_raw_init_event_<call>(void)
+ * {
+ *     int id;
+ *
+ *     id = register_ftrace_event(&ftrace_event_type_<call>);
+ *     if (!id)
+ *             return -ENODEV;
+ *     event_<call>.id = id;
+ *     return 0;
+ * }
+ *
+ * static struct ftrace_event_call __used
+ * __attribute__((__aligned__(4)))
+ * __attribute__((section("_ftrace_events"))) event_<call> = {
+ *     .name                   = "<call>",
+ *     .system                 = "<system>",
+ *     .raw_init               = ftrace_raw_init_event_<call>,
+ *     .regfunc                = ftrace_reg_event_<call>,
+ *     .unregfunc              = ftrace_unreg_event_<call>,
+ *     .show_format            = ftrace_format_<call>,
+ * }
+ *
+ */
+
+#undef TP_FMT
+#define TP_FMT(fmt, args...)   fmt "\n", ##args
+
+#ifdef CONFIG_EVENT_PROFILE
+#define _TRACE_PROFILE(call, proto, args)                              \
+static void ftrace_profile_##call(proto)                               \
+{                                                                      \
+       extern void perf_tpcounter_event(int);                          \
+       perf_tpcounter_event(event_##call.id);                          \
+}                                                                      \
+                                                                       \
+static int ftrace_profile_enable_##call(struct ftrace_event_call *event_call) \
+{                                                                      \
+       int ret = 0;                                                    \
+                                                                       \
+       if (!atomic_inc_return(&event_call->profile_count))             \
+               ret = register_trace_##call(ftrace_profile_##call);     \
+                                                                       \
+       return ret;                                                     \
+}                                                                      \
+                                                                       \
+static void ftrace_profile_disable_##call(struct ftrace_event_call *event_call)\
+{                                                                      \
+       if (atomic_add_negative(-1, &event_call->profile_count))        \
+               unregister_trace_##call(ftrace_profile_##call);         \
+}
+
+#define _TRACE_PROFILE_INIT(call)                                      \
+       .profile_count = ATOMIC_INIT(-1),                               \
+       .profile_enable = ftrace_profile_enable_##call,                 \
+       .profile_disable = ftrace_profile_disable_##call,
+
+#else
+#define _TRACE_PROFILE(call, proto, args)
+#define _TRACE_PROFILE_INIT(call)
+#endif
+
+#undef __entry
+#define __entry entry
+
+#undef __field
+#define __field(type, item)
+
+#undef __array
+#define __array(type, item, len)
+
+#undef __dynamic_array
+#define __dynamic_array(type, item, len)                               \
+       __entry->__data_loc_##item = __data_offsets.item;
+
+#undef __string
+#define __string(item, src) __dynamic_array(char, item, -1)            \
+
+#undef __assign_str
+#define __assign_str(dst, src)                                         \
+       strcpy(__get_str(dst), src);
+
+#undef TRACE_EVENT
+#define TRACE_EVENT(call, proto, args, tstruct, assign, print)         \
+_TRACE_PROFILE(call, PARAMS(proto), PARAMS(args))                      \
+                                                                       \
+static struct ftrace_event_call event_##call;                          \
+                                                                       \
+static void ftrace_raw_event_##call(proto)                             \
+{                                                                      \
+       struct ftrace_data_offsets_##call __maybe_unused __data_offsets;\
+       struct ftrace_event_call *event_call = &event_##call;           \
+       struct ring_buffer_event *event;                                \
+       struct ftrace_raw_##call *entry;                                \
+       unsigned long irq_flags;                                        \
+       int __data_size;                                                \
+       int pc;                                                         \
+                                                                       \
+       local_save_flags(irq_flags);                                    \
+       pc = preempt_count();                                           \
+                                                                       \
+       __data_size = ftrace_get_offsets_##call(&__data_offsets, args); \
+                                                                       \
+       event = trace_current_buffer_lock_reserve(event_##call.id,      \
+                                sizeof(*entry) + __data_size,          \
+                                irq_flags, pc);                        \
+       if (!event)                                                     \
+               return;                                                 \
+       entry   = ring_buffer_event_data(event);                        \
+                                                                       \
+                                                                       \
+       tstruct                                                         \
+                                                                       \
+       { assign; }                                                     \
+                                                                       \
+       if (!filter_current_check_discard(event_call, entry, event))    \
+               trace_nowake_buffer_unlock_commit(event, irq_flags, pc); \
+}                                                                      \
+                                                                       \
+static int ftrace_raw_reg_event_##call(void)                           \
+{                                                                      \
+       int ret;                                                        \
+                                                                       \
+       ret = register_trace_##call(ftrace_raw_event_##call);           \
+       if (ret)                                                        \
+               pr_info("event trace: Could not activate trace point "  \
+                       "probe to " #call "\n");                        \
+       return ret;                                                     \
+}                                                                      \
+                                                                       \
+static void ftrace_raw_unreg_event_##call(void)                                \
+{                                                                      \
+       unregister_trace_##call(ftrace_raw_event_##call);               \
+}                                                                      \
+                                                                       \
+static struct trace_event ftrace_event_type_##call = {                 \
+       .trace                  = ftrace_raw_output_##call,             \
+};                                                                     \
+                                                                       \
+static int ftrace_raw_init_event_##call(void)                          \
+{                                                                      \
+       int id;                                                         \
+                                                                       \
+       id = register_ftrace_event(&ftrace_event_type_##call);          \
+       if (!id)                                                        \
+               return -ENODEV;                                         \
+       event_##call.id = id;                                           \
+       INIT_LIST_HEAD(&event_##call.fields);                           \
+       init_preds(&event_##call);                                      \
+       return 0;                                                       \
+}                                                                      \
+                                                                       \
+static struct ftrace_event_call __used                                 \
+__attribute__((__aligned__(4)))                                                \
+__attribute__((section("_ftrace_events"))) event_##call = {            \
+       .name                   = #call,                                \
+       .system                 = __stringify(TRACE_SYSTEM),            \
+       .event                  = &ftrace_event_type_##call,            \
+       .raw_init               = ftrace_raw_init_event_##call,         \
+       .regfunc                = ftrace_raw_reg_event_##call,          \
+       .unregfunc              = ftrace_raw_unreg_event_##call,        \
+       .show_format            = ftrace_format_##call,                 \
+       .define_fields          = ftrace_define_fields_##call,          \
+       _TRACE_PROFILE_INIT(call)                                       \
+}
+
+#include TRACE_INCLUDE(TRACE_INCLUDE_FILE)
+
+#undef _TRACE_PROFILE
+#undef _TRACE_PROFILE_INIT
+
diff --git a/include/trace/irq.h b/include/trace/irq.h

deleted file mode 100644 (file)

index ff5d449..0000000
--- a/include/trace/irq.h
+++ /dev/null
@@ -1,9 +0,0 @@
-#ifndef _TRACE_IRQ_H
-#define _TRACE_IRQ_H
-
-#include <linux/interrupt.h>
-#include <linux/tracepoint.h>
-
-#include <trace/irq_event_types.h>
-
-#endif
diff --git a/include/trace/irq_event_types.h b/include/trace/irq_event_types.h

deleted file mode 100644 (file)

index 85964eb..0000000
--- a/include/trace/irq_event_types.h
+++ /dev/null
@@ -1,55 +0,0 @@
-
-/* use <trace/irq.h> instead */
-#ifndef TRACE_FORMAT
-# error Do not include this file directly.
-# error Unless you know what you are doing.
-#endif
-
-#undef TRACE_SYSTEM
-#define TRACE_SYSTEM irq
-
-/*
- * Tracepoint for entry of interrupt handler:
- */
-TRACE_FORMAT(irq_handler_entry,
-       TP_PROTO(int irq, struct irqaction *action),
-       TP_ARGS(irq, action),
-       TP_FMT("irq=%d handler=%s", irq, action->name)
-       );
-
-/*
- * Tracepoint for return of an interrupt handler:
- */
-TRACE_EVENT(irq_handler_exit,
-
-       TP_PROTO(int irq, struct irqaction *action, int ret),
-
-       TP_ARGS(irq, action, ret),
-
-       TP_STRUCT__entry(
-               __field(        int,    irq     )
-               __field(        int,    ret     )
-       ),
-
-       TP_fast_assign(
-               __entry->irq    = irq;
-               __entry->ret    = ret;
-       ),
-
-       TP_printk("irq=%d return=%s",
-                 __entry->irq, __entry->ret ? "handled" : "unhandled")
-);
-
-TRACE_FORMAT(softirq_entry,
-       TP_PROTO(struct softirq_action *h, struct softirq_action *vec),
-       TP_ARGS(h, vec),
-       TP_FMT("softirq=%d action=%s", (int)(h - vec), softirq_to_name[h-vec])
-       );
-
-TRACE_FORMAT(softirq_exit,
-       TP_PROTO(struct softirq_action *h, struct softirq_action *vec),
-       TP_ARGS(h, vec),
-       TP_FMT("softirq=%d action=%s", (int)(h - vec), softirq_to_name[h-vec])
-       );
-
-#undef TRACE_SYSTEM
diff --git a/include/trace/kmemtrace.h b/include/trace/kmemtrace.h

deleted file mode 100644 (file)

index 28ee69f..0000000
--- a/include/trace/kmemtrace.h
+++ /dev/null
@@ -1,63 +0,0 @@
-/*
- * Copyright (C) 2008 Eduard - Gabriel Munteanu
- *
- * This file is released under GPL version 2.
- */
-
-#ifndef _LINUX_KMEMTRACE_H
-#define _LINUX_KMEMTRACE_H
-
-#ifdef __KERNEL__
-
-#include <linux/tracepoint.h>
-#include <linux/types.h>
-
-#ifdef CONFIG_KMEMTRACE
-extern void kmemtrace_init(void);
-#else
-static inline void kmemtrace_init(void)
-{
-}
-#endif
-
-DECLARE_TRACE(kmalloc,
-             TP_PROTO(unsigned long call_site,
-                     const void *ptr,
-                     size_t bytes_req,
-                     size_t bytes_alloc,
-                     gfp_t gfp_flags),
-             TP_ARGS(call_site, ptr, bytes_req, bytes_alloc, gfp_flags));
-DECLARE_TRACE(kmem_cache_alloc,
-             TP_PROTO(unsigned long call_site,
-                     const void *ptr,
-                     size_t bytes_req,
-                     size_t bytes_alloc,
-                     gfp_t gfp_flags),
-             TP_ARGS(call_site, ptr, bytes_req, bytes_alloc, gfp_flags));
-DECLARE_TRACE(kmalloc_node,
-             TP_PROTO(unsigned long call_site,
-                     const void *ptr,
-                     size_t bytes_req,
-                     size_t bytes_alloc,
-                     gfp_t gfp_flags,
-                     int node),
-             TP_ARGS(call_site, ptr, bytes_req, bytes_alloc, gfp_flags, node));
-DECLARE_TRACE(kmem_cache_alloc_node,
-             TP_PROTO(unsigned long call_site,
-                     const void *ptr,
-                     size_t bytes_req,
-                     size_t bytes_alloc,
-                     gfp_t gfp_flags,
-                     int node),
-             TP_ARGS(call_site, ptr, bytes_req, bytes_alloc, gfp_flags, node));
-DECLARE_TRACE(kfree,
-             TP_PROTO(unsigned long call_site, const void *ptr),
-             TP_ARGS(call_site, ptr));
-DECLARE_TRACE(kmem_cache_free,
-             TP_PROTO(unsigned long call_site, const void *ptr),
-             TP_ARGS(call_site, ptr));
-
-#endif /* __KERNEL__ */
-
-#endif /* _LINUX_KMEMTRACE_H */
-
diff --git a/include/trace/lockdep.h b/include/trace/lockdep.h

deleted file mode 100644 (file)

index 5ca67df..0000000
--- a/include/trace/lockdep.h
+++ /dev/null
@@ -1,9 +0,0 @@
-#ifndef _TRACE_LOCKDEP_H
-#define _TRACE_LOCKDEP_H
-
-#include <linux/lockdep.h>
-#include <linux/tracepoint.h>
-
-#include <trace/lockdep_event_types.h>
-
-#endif
diff --git a/include/trace/lockdep_event_types.h b/include/trace/lockdep_event_types.h

deleted file mode 100644 (file)

index adccfcd..0000000
--- a/include/trace/lockdep_event_types.h
+++ /dev/null
@@ -1,44 +0,0 @@
-
-#ifndef TRACE_FORMAT
-# error Do not include this file directly.
-# error Unless you know what you are doing.
-#endif
-
-#undef TRACE_SYSTEM
-#define TRACE_SYSTEM lock
-
-#ifdef CONFIG_LOCKDEP
-
-TRACE_FORMAT(lock_acquire,
-       TP_PROTO(struct lockdep_map *lock, unsigned int subclass,
-               int trylock, int read, int check,
-               struct lockdep_map *next_lock, unsigned long ip),
-       TP_ARGS(lock, subclass, trylock, read, check, next_lock, ip),
-       TP_FMT("%s%s%s", trylock ? "try " : "",
-               read ? "read " : "", lock->name)
-       );
-
-TRACE_FORMAT(lock_release,
-       TP_PROTO(struct lockdep_map *lock, int nested, unsigned long ip),
-       TP_ARGS(lock, nested, ip),
-       TP_FMT("%s", lock->name)
-       );
-
-#ifdef CONFIG_LOCK_STAT
-
-TRACE_FORMAT(lock_contended,
-       TP_PROTO(struct lockdep_map *lock, unsigned long ip),
-       TP_ARGS(lock, ip),
-       TP_FMT("%s", lock->name)
-       );
-
-TRACE_FORMAT(lock_acquired,
-       TP_PROTO(struct lockdep_map *lock, unsigned long ip),
-       TP_ARGS(lock, ip),
-       TP_FMT("%s", lock->name)
-       );
-
-#endif
-#endif
-
-#undef TRACE_SYSTEM
diff --git a/include/trace/sched.h b/include/trace/sched.h

deleted file mode 100644 (file)

index 4e372a1..0000000
--- a/include/trace/sched.h
+++ /dev/null
@@ -1,9 +0,0 @@
-#ifndef _TRACE_SCHED_H
-#define _TRACE_SCHED_H
-
-#include <linux/sched.h>
-#include <linux/tracepoint.h>
-
-#include <trace/sched_event_types.h>
-
-#endif
diff --git a/include/trace/skb.h b/include/trace/skb.h

deleted file mode 100644 (file)

index b66206d..0000000
--- a/include/trace/skb.h
+++ /dev/null
@@ -1,11 +0,0 @@
-#ifndef _TRACE_SKB_H_
-#define _TRACE_SKB_H_
-
-#include <linux/skbuff.h>
-#include <linux/tracepoint.h>
-
-DECLARE_TRACE(kfree_skb,
-       TP_PROTO(struct sk_buff *skb, void *location),
-       TP_ARGS(skb, location));
-
-#endif
diff --git a/include/trace/trace_event_types.h b/include/trace/trace_event_types.h

deleted file mode 100644 (file)

index df56f56..0000000
--- a/include/trace/trace_event_types.h
+++ /dev/null
@@ -1,5 +0,0 @@
-/* trace/<type>_event_types.h here */
-
-#include <trace/sched_event_types.h>
-#include <trace/irq_event_types.h>
-#include <trace/lockdep_event_types.h>
diff --git a/include/trace/trace_events.h b/include/trace/trace_events.h

deleted file mode 100644 (file)

index fd13750..0000000
--- a/include/trace/trace_events.h
+++ /dev/null
@@ -1,5 +0,0 @@
-/* trace/<type>.h here */
-
-#include <trace/sched.h>
-#include <trace/irq.h>
-#include <trace/lockdep.h>
diff --git a/include/trace/workqueue.h b/include/trace/workqueue.h

deleted file mode 100644 (file)

index 7626523..0000000
--- a/include/trace/workqueue.h
+++ /dev/null
@@ -1,25 +0,0 @@
-#ifndef __TRACE_WORKQUEUE_H
-#define __TRACE_WORKQUEUE_H
-
-#include <linux/tracepoint.h>
-#include <linux/workqueue.h>
-#include <linux/sched.h>
-
-DECLARE_TRACE(workqueue_insertion,
-          TP_PROTO(struct task_struct *wq_thread, struct work_struct *work),
-          TP_ARGS(wq_thread, work));
-
-DECLARE_TRACE(workqueue_execution,
-          TP_PROTO(struct task_struct *wq_thread, struct work_struct *work),
-          TP_ARGS(wq_thread, work));
-
-/* Trace the creation of one workqueue thread on a cpu */
-DECLARE_TRACE(workqueue_creation,
-          TP_PROTO(struct task_struct *wq_thread, int cpu),
-          TP_ARGS(wq_thread, cpu));
-
-DECLARE_TRACE(workqueue_destruction,
-          TP_PROTO(struct task_struct *wq_thread),
-          TP_ARGS(wq_thread));
-
-#endif /* __TRACE_WORKQUEUE_H */
diff --git a/include/xen/Kbuild b/include/xen/Kbuild

new file mode 100644 (file)

index 0000000..4e65c16
--- /dev/null
+++ b/include/xen/Kbuild
@@ -0,0 +1 @@
+header-y += evtchn.h
diff --git a/include/xen/events.h b/include/xen/events.h

index 0d5f1adc0363e9e125cb8786ca96b0917440b6b5..e68d59a90ca88b73c21b431c3cf538165916d5a7 100644 (file)
--- a/include/xen/events.h
+++ b/include/xen/events.h
@@ -53,4 +53,7 @@ bool xen_test_irq_pending(int irq);
     irq will be disabled so it won't deliver an interrupt. */
  void xen_poll_irq(int irq);
  
+/* Determine the IRQ which is bound to an event channel */
+unsigned irq_from_evtchn(unsigned int evtchn);
+
  #endif /* _XEN_EVENTS_H */
diff --git a/include/xen/evtchn.h b/include/xen/evtchn.h

new file mode 100644 (file)

index 0000000..14e833e
--- /dev/null
+++ b/include/xen/evtchn.h
@@ -0,0 +1,88 @@
+/******************************************************************************
+ * evtchn.h
+ *
+ * Interface to /dev/xen/evtchn.
+ *
+ * Copyright (c) 2003-2005, K A Fraser
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License version 2
+ * as published by the Free Software Foundation; or, when distributed
+ * separately from the Linux kernel or incorporated into other
+ * software packages, subject to the following license:
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this source file (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy, modify,
+ * merge, publish, distribute, sublicense, and/or sell copies of the Software,
+ * and to permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#ifndef __LINUX_PUBLIC_EVTCHN_H__
+#define __LINUX_PUBLIC_EVTCHN_H__
+
+/*
+ * Bind a fresh port to VIRQ @virq.
+ * Return allocated port.
+ */
+#define IOCTL_EVTCHN_BIND_VIRQ                         \
+       _IOC(_IOC_NONE, 'E', 0, sizeof(struct ioctl_evtchn_bind_virq))
+struct ioctl_evtchn_bind_virq {
+       unsigned int virq;
+};
+
+/*
+ * Bind a fresh port to remote <@remote_domain, @remote_port>.
+ * Return allocated port.
+ */
+#define IOCTL_EVTCHN_BIND_INTERDOMAIN                  \
+       _IOC(_IOC_NONE, 'E', 1, sizeof(struct ioctl_evtchn_bind_interdomain))
+struct ioctl_evtchn_bind_interdomain {
+       unsigned int remote_domain, remote_port;
+};
+
+/*
+ * Allocate a fresh port for binding to @remote_domain.
+ * Return allocated port.
+ */
+#define IOCTL_EVTCHN_BIND_UNBOUND_PORT                 \
+       _IOC(_IOC_NONE, 'E', 2, sizeof(struct ioctl_evtchn_bind_unbound_port))
+struct ioctl_evtchn_bind_unbound_port {
+       unsigned int remote_domain;
+};
+
+/*
+ * Unbind previously allocated @port.
+ */
+#define IOCTL_EVTCHN_UNBIND                            \
+       _IOC(_IOC_NONE, 'E', 3, sizeof(struct ioctl_evtchn_unbind))
+struct ioctl_evtchn_unbind {
+       unsigned int port;
+};
+
+/*
+ * Unbind previously allocated @port.
+ */
+#define IOCTL_EVTCHN_NOTIFY                            \
+       _IOC(_IOC_NONE, 'E', 4, sizeof(struct ioctl_evtchn_notify))
+struct ioctl_evtchn_notify {
+       unsigned int port;
+};
+
+/* Clear and reinitialise the event buffer. Clear error condition. */
+#define IOCTL_EVTCHN_RESET                             \
+       _IOC(_IOC_NONE, 'E', 5, 0)
+
+#endif /* __LINUX_PUBLIC_EVTCHN_H__ */
diff --git a/include/xen/interface/version.h b/include/xen/interface/version.h

index 453235e923f0be0d3ae801e754bfb83a68caf892..e8b6519d47e9df818ebbb6d19323f032b8dde988 100644 (file)
--- a/include/xen/interface/version.h
+++ b/include/xen/interface/version.h
@@ -57,4 +57,7 @@ struct xen_feature_info {
  /* Declares the features reported by XENVER_get_features. */
  #include "features.h"
  
+/* arg == NULL; returns host memory page size. */
+#define XENVER_pagesize 7
+
  #endif /* __XEN_PUBLIC_VERSION_H__ */
diff --git a/include/xen/xenbus.h b/include/xen/xenbus.h

index f87f9614844db69e7eeacb2784646f5458ed954a..b9763badbd77da07813dae421f20a90a08df2e11 100644 (file)
--- a/include/xen/xenbus.h
+++ b/include/xen/xenbus.h
@@ -91,8 +91,7 @@ struct xenbus_driver {
         void (*otherend_changed)(struct xenbus_device *dev,
                                  enum xenbus_state backend_state);
         int (*remove)(struct xenbus_device *dev);
-       int (*suspend)(struct xenbus_device *dev);
-       int (*suspend_cancel)(struct xenbus_device *dev);
+       int (*suspend)(struct xenbus_device *dev, pm_message_t state);
         int (*resume)(struct xenbus_device *dev);
         int (*uevent)(struct xenbus_device *, char **, int, char *, int);
         struct device_driver driver;
diff --git a/init/Kconfig b/init/Kconfig

index 7be4d3836745a049596e7be06b5e850c035b1f72..9b68fee8d79ef27e78441ee43b496b5a014a7880 100644 (file)
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -308,7 +308,7 @@ menu "RCU Subsystem"
  
  choice
         prompt "RCU Implementation"
-       default CLASSIC_RCU
+       default TREE_RCU
  
  config CLASSIC_RCU
         bool "Classic RCU"
@@ -933,6 +933,40 @@ config AIO
            by some high performance threaded applications. Disabling
            this option saves about 7k.
  
+config HAVE_PERF_COUNTERS
+       bool
+
+menu "Performance Counters"
+
+config PERF_COUNTERS
+       bool "Kernel Performance Counters"
+       depends on HAVE_PERF_COUNTERS
+       select ANON_INODES
+       help
+         Enable kernel support for performance counter hardware.
+
+         Performance counters are special hardware registers available
+         on most modern CPUs. These registers count the number of certain
+         types of hw events: such as instructions executed, cachemisses
+         suffered, or branches mis-predicted - without slowing down the
+         kernel or applications. These registers can also trigger interrupts
+         when a threshold number of events have passed - and can thus be
+         used to profile the code that runs on that CPU.
+
+         The Linux Performance Counter subsystem provides an abstraction of
+         these hardware capabilities, available via a system call. It
+         provides per task and per CPU counters, and it provides event
+         capabilities on top of those.
+
+         Say Y if unsure.
+
+config EVENT_PROFILE
+       bool "Tracepoint profile sources"
+       depends on PERF_COUNTERS && EVENT_TRACER
+       default y
+
+endmenu
+
  config VM_EVENT_COUNTERS
         default y
         bool "Enable VM event counters for /proc/vmstat" if EMBEDDED
diff --git a/init/main.c b/init/main.c

index d721dad05dd722fa065170e9f573a4f74c49d38d..bb7dc57eee36ed8079e63bdf1a4c725d4c97e45f 100644 (file)
--- a/init/main.c
+++ b/init/main.c
@@ -64,6 +64,7 @@
  #include <linux/idr.h>
  #include <linux/ftrace.h>
  #include <linux/async.h>
+#include <linux/kmemtrace.h>
  #include <trace/boot.h>
  
  #include <asm/io.h>
@@ -71,7 +72,6 @@
  #include <asm/setup.h>
  #include <asm/sections.h>
  #include <asm/cacheflush.h>
-#include <trace/kmemtrace.h>
  
  #ifdef CONFIG_X86_LOCAL_APIC
  #include <asm/smp.h>
diff --git a/ipc/sem.c b/ipc/sem.c

index 16a2189e96f9458d7626660b4c537dd718bd0fd6..87c2b641fd7b475b192adf281fcdc1f5cf249893 100644 (file)
--- a/ipc/sem.c
+++ b/ipc/sem.c
@@ -1290,8 +1290,8 @@ void exit_sem(struct task_struct *tsk)
                 int i;
  
                 rcu_read_lock();
-               un = list_entry(rcu_dereference(ulp->list_proc.next),
-                                       struct sem_undo, list_proc);
+               un = list_entry_rcu(ulp->list_proc.next,
+                                   struct sem_undo, list_proc);
                 if (&un->list_proc == &ulp->list_proc)
                         semid = -1;
                  else
diff --git a/kernel/Makefile b/kernel/Makefile

index 42423665660a3d6e0a1fc6a37dd0da997643b47f..90b53f6dc226c674d9b05eee60bcc671bab0edc9 100644 (file)
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -93,8 +93,10 @@ obj-$(CONFIG_LATENCYTOP) += latencytop.o
  obj-$(CONFIG_HAVE_GENERIC_DMA_COHERENT) += dma-coherent.o
  obj-$(CONFIG_FUNCTION_TRACER) += trace/
  obj-$(CONFIG_TRACING) += trace/
+obj-$(CONFIG_X86_DS) += trace/
  obj-$(CONFIG_SMP) += sched_cpupri.o
  obj-$(CONFIG_SLOW_WORK) += slow-work.o
+obj-$(CONFIG_PERF_COUNTERS) += perf_counter.o
  
  ifneq ($(CONFIG_SCHED_OMIT_FRAME_POINTER),y)
  # According to Alan Modra <alan@linuxcare.com.au>, the -fno-omit-frame-pointer is
diff --git a/kernel/compat.c b/kernel/compat.c

index 42d56544460f21b353cc279aa8aa52301eded59b..f6c204f07ea6084c4849d52358d6b1b2aab1f0da 100644 (file)
--- a/kernel/compat.c
+++ b/kernel/compat.c
@@ -882,6 +882,17 @@ compat_sys_rt_sigtimedwait (compat_sigset_t __user *uthese,
  
  }
  
+asmlinkage long
+compat_sys_rt_tgsigqueueinfo(compat_pid_t tgid, compat_pid_t pid, int sig,
+                            struct compat_siginfo __user *uinfo)
+{
+       siginfo_t info;
+
+       if (copy_siginfo_from_user32(&info, uinfo))
+               return -EFAULT;
+       return do_rt_tgsigqueueinfo(tgid, pid, sig, &info);
+}
+
  #ifdef __ARCH_WANT_COMPAT_SYS_TIME
  
  /* compat_time_t is a 32 bit "long" and needs to get converted. */
diff --git a/kernel/exit.c b/kernel/exit.c

index abf9cf3b95c609f12ccb0c6992cc3a8221bdcf8b..49cdf6946f34b4b64b4e61f18dcd417b16f7faa6 100644 (file)
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -48,7 +48,8 @@
  #include <linux/tracehook.h>
  #include <linux/fs_struct.h>
  #include <linux/init_task.h>
-#include <trace/sched.h>
+#include <linux/perf_counter.h>
+#include <trace/events/sched.h>
  
  #include <asm/uaccess.h>
  #include <asm/unistd.h>
@@ -56,10 +57,6 @@
  #include <asm/mmu_context.h>
  #include "cred-internals.h"
  
-DEFINE_TRACE(sched_process_free);
-DEFINE_TRACE(sched_process_exit);
-DEFINE_TRACE(sched_process_wait);
-
  static void exit_mm(struct task_struct * tsk);
  
  static void __unhash_process(struct task_struct *p)
@@ -158,6 +155,9 @@ static void delayed_put_task_struct(struct rcu_head *rhp)
  {
         struct task_struct *tsk = container_of(rhp, struct task_struct, rcu);
  
+#ifdef CONFIG_PERF_COUNTERS
+       WARN_ON_ONCE(tsk->perf_counter_ctxp);
+#endif
         trace_sched_process_free(tsk);
         put_task_struct(tsk);
  }
@@ -174,6 +174,7 @@ repeat:
         atomic_dec(&__task_cred(p)->user->processes);
  
         proc_flush_task(p);
+
         write_lock_irq(&tasklist_lock);
         tracehook_finish_release_task(p);
         __exit_signal(p);
@@ -975,16 +976,19 @@ NORET_TYPE void do_exit(long code)
                 module_put(tsk->binfmt->module);
  
         proc_exit_connector(tsk);
+
+       /*
+        * Flush inherited counters to the parent - before the parent
+        * gets woken up by child-exit notifications.
+        */
+       perf_counter_exit_task(tsk);
+
         exit_notify(tsk, group_dead);
  #ifdef CONFIG_NUMA
         mpol_put(tsk->mempolicy);
         tsk->mempolicy = NULL;
  #endif
  #ifdef CONFIG_FUTEX
-       /*
-        * This must happen late, after the PID is not
-        * hashed anymore:
-        */
         if (unlikely(!list_empty(&tsk->pi_state_list)))
                 exit_pi_state_list(tsk);
         if (unlikely(current->pi_state_cache))
diff --git a/kernel/fork.c b/kernel/fork.c

index 875ffbdd96d09ccb2a9587636f472b8b8278cbfd..4430eb1376f257bd008dab17d3c6ea19cc9c2d3b 100644 (file)
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -61,8 +61,8 @@
  #include <linux/proc_fs.h>
  #include <linux/blkdev.h>
  #include <linux/fs_struct.h>
-#include <trace/sched.h>
  #include <linux/magic.h>
+#include <linux/perf_counter.h>
  
  #include <asm/pgtable.h>
  #include <asm/pgalloc.h>
@@ -71,6 +71,8 @@
  #include <asm/cacheflush.h>
  #include <asm/tlbflush.h>
  
+#include <trace/events/sched.h>
+
  /*
   * Protected counters by write_lock_irq(&tasklist_lock)
   */
@@ -83,8 +85,6 @@ DEFINE_PER_CPU(unsigned long, process_counts) = 0;
  
  __cacheline_aligned DEFINE_RWLOCK(tasklist_lock);  /* outer */
  
-DEFINE_TRACE(sched_process_fork);
-
  int nr_processes(void)
  {
         int cpu;
@@ -982,6 +982,8 @@ static struct task_struct *copy_process(unsigned long clone_flags,
         if (!p)
                 goto fork_out;
  
+       ftrace_graph_init_task(p);
+
         rt_mutex_init_task(p);
  
  #ifdef CONFIG_PROVE_LOCKING
@@ -1089,12 +1091,16 @@ static struct task_struct *copy_process(unsigned long clone_flags,
  #ifdef CONFIG_DEBUG_MUTEXES
         p->blocked_on = NULL; /* not blocked yet */
  #endif
-       if (unlikely(current->ptrace))
-               ptrace_fork(p, clone_flags);
+
+       p->bts = NULL;
  
         /* Perform scheduler related setup. Assign this task to a CPU. */
         sched_fork(p, clone_flags);
  
+       retval = perf_counter_init_task(p);
+       if (retval)
+               goto bad_fork_cleanup_policy;
+
         if ((retval = audit_alloc(p)))
                 goto bad_fork_cleanup_policy;
         /* copy all the process information */
@@ -1131,8 +1137,6 @@ static struct task_struct *copy_process(unsigned long clone_flags,
                 }
         }
  
-       ftrace_graph_init_task(p);
-
         p->pid = pid_nr(pid);
         p->tgid = p->pid;
         if (clone_flags & CLONE_THREAD)
@@ -1141,7 +1145,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
         if (current->nsproxy != p->nsproxy) {
                 retval = ns_cgroup_clone(p, pid);
                 if (retval)
-                       goto bad_fork_free_graph;
+                       goto bad_fork_free_pid;
         }
  
         p->set_child_tid = (clone_flags & CLONE_CHILD_SETTID) ? child_tidptr : NULL;
@@ -1233,7 +1237,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
                 spin_unlock(&current->sighand->siglock);
                 write_unlock_irq(&tasklist_lock);
                 retval = -ERESTARTNOINTR;
-               goto bad_fork_free_graph;
+               goto bad_fork_free_pid;
         }
  
         if (clone_flags & CLONE_THREAD) {
@@ -1268,8 +1272,6 @@ static struct task_struct *copy_process(unsigned long clone_flags,
         cgroup_post_fork(p);
         return p;
  
-bad_fork_free_graph:
-       ftrace_graph_exit_task(p);
  bad_fork_free_pid:
         if (pid != &init_struct_pid)
                 free_pid(pid);
@@ -1293,6 +1295,7 @@ bad_fork_cleanup_semundo:
  bad_fork_cleanup_audit:
         audit_free(p);
  bad_fork_cleanup_policy:
+       perf_counter_free_task(p);
  #ifdef CONFIG_NUMA
         mpol_put(p->mempolicy);
  bad_fork_cleanup_cgroup:
@@ -1406,6 +1409,12 @@ long do_fork(unsigned long clone_flags,
                 if (clone_flags & CLONE_VFORK) {
                         p->vfork_done = &vfork;
                         init_completion(&vfork);
+               } else if (!(clone_flags & CLONE_VM)) {
+                       /*
+                        * vfork will do an exec which will call
+                        * set_task_comm()
+                        */
+                       perf_counter_fork(p);
                 }
  
                 audit_finish_fork(p);
diff --git a/kernel/futex.c b/kernel/futex.c

index d546b2d53a62ba700756c8b7ff53b11373923162..80b5ce716596a95b797a00c417b5a514f9d5d2a2 100644 (file)
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -19,6 +19,10 @@
   *  PRIVATE futexes by Eric Dumazet
   *  Copyright (C) 2007 Eric Dumazet <dada1@cosmosbay.com>
   *
+ *  Requeue-PI support by Darren Hart <dvhltc@us.ibm.com>
+ *  Copyright (C) IBM Corporation, 2009
+ *  Thanks to Thomas Gleixner for conceptual design and careful reviews.
+ *
   *  Thanks to Ben LaHaise for yelling "hashed waitqueues" loudly
   *  enough at me, Linus for the original (flawed) idea, Matthew
   *  Kirkwood for proof-of-concept implementation.
@@ -96,8 +100,8 @@ struct futex_pi_state {
   */
  struct futex_q {
         struct plist_node list;
-       /* There can only be a single waiter */
-       wait_queue_head_t waiter;
+       /* Waiter reference */
+       struct task_struct *task;
  
         /* Which hash list lock to use: */
         spinlock_t *lock_ptr;
@@ -107,7 +111,9 @@ struct futex_q {
  
         /* Optional priority inheritance state: */
         struct futex_pi_state *pi_state;
-       struct task_struct *task;
+
+       /* rt_waiter storage for requeue_pi: */
+       struct rt_mutex_waiter *rt_waiter;
  
         /* Bitset for the optional bitmasked wakeup */
         u32 bitset;
@@ -278,6 +284,25 @@ void put_futex_key(int fshared, union futex_key *key)
         drop_futex_key_refs(key);
  }
  
+/**
+ * futex_top_waiter() - Return the highest priority waiter on a futex
+ * @hb:     the hash bucket the futex_q's reside in
+ * @key:    the futex key (to distinguish it from other futex futex_q's)
+ *
+ * Must be called with the hb lock held.
+ */
+static struct futex_q *futex_top_waiter(struct futex_hash_bucket *hb,
+                                       union futex_key *key)
+{
+       struct futex_q *this;
+
+       plist_for_each_entry(this, &hb->chain, list) {
+               if (match_futex(&this->key, key))
+                       return this;
+       }
+       return NULL;
+}
+
  static u32 cmpxchg_futex_value_locked(u32 __user *uaddr, u32 uval, u32 newval)
  {
         u32 curval;
@@ -539,28 +564,160 @@ lookup_pi_state(u32 uval, struct futex_hash_bucket *hb,
         return 0;
  }
  
+/**
+ * futex_lock_pi_atomic() - atomic work required to acquire a pi aware futex
+ * @uaddr:             the pi futex user address
+ * @hb:                        the pi futex hash bucket
+ * @key:               the futex key associated with uaddr and hb
+ * @ps:                        the pi_state pointer where we store the result of the
+ *                     lookup
+ * @task:              the task to perform the atomic lock work for.  This will
+ *                     be "current" except in the case of requeue pi.
+ * @set_waiters:       force setting the FUTEX_WAITERS bit (1) or not (0)
+ *
+ * Returns:
+ *  0 - ready to wait
+ *  1 - acquired the lock
+ * <0 - error
+ *
+ * The hb->lock and futex_key refs shall be held by the caller.
+ */
+static int futex_lock_pi_atomic(u32 __user *uaddr, struct futex_hash_bucket *hb,
+                               union futex_key *key,
+                               struct futex_pi_state **ps,
+                               struct task_struct *task, int set_waiters)
+{
+       int lock_taken, ret, ownerdied = 0;
+       u32 uval, newval, curval;
+
+retry:
+       ret = lock_taken = 0;
+
+       /*
+        * To avoid races, we attempt to take the lock here again
+        * (by doing a 0 -> TID atomic cmpxchg), while holding all
+        * the locks. It will most likely not succeed.
+        */
+       newval = task_pid_vnr(task);
+       if (set_waiters)
+               newval |= FUTEX_WAITERS;
+
+       curval = cmpxchg_futex_value_locked(uaddr, 0, newval);
+
+       if (unlikely(curval == -EFAULT))
+               return -EFAULT;
+
+       /*
+        * Detect deadlocks.
+        */
+       if ((unlikely((curval & FUTEX_TID_MASK) == task_pid_vnr(task))))
+               return -EDEADLK;
+
+       /*
+        * Surprise - we got the lock. Just return to userspace:
+        */
+       if (unlikely(!curval))
+               return 1;
+
+       uval = curval;
+
+       /*
+        * Set the FUTEX_WAITERS flag, so the owner will know it has someone
+        * to wake at the next unlock.
+        */
+       newval = curval | FUTEX_WAITERS;
+
+       /*
+        * There are two cases, where a futex might have no owner (the
+        * owner TID is 0): OWNER_DIED. We take over the futex in this
+        * case. We also do an unconditional take over, when the owner
+        * of the futex died.
+        *
+        * This is safe as we are protected by the hash bucket lock !
+        */
+       if (unlikely(ownerdied || !(curval & FUTEX_TID_MASK))) {
+               /* Keep the OWNER_DIED bit */
+               newval = (curval & ~FUTEX_TID_MASK) | task_pid_vnr(task);
+               ownerdied = 0;
+               lock_taken = 1;
+       }
+
+       curval = cmpxchg_futex_value_locked(uaddr, uval, newval);
+
+       if (unlikely(curval == -EFAULT))
+               return -EFAULT;
+       if (unlikely(curval != uval))
+               goto retry;
+
+       /*
+        * We took the lock due to owner died take over.
+        */
+       if (unlikely(lock_taken))
+               return 1;
+
+       /*
+        * We dont have the lock. Look up the PI state (or create it if
+        * we are the first waiter):
+        */
+       ret = lookup_pi_state(uval, hb, key, ps);
+
+       if (unlikely(ret)) {
+               switch (ret) {
+               case -ESRCH:
+                       /*
+                        * No owner found for this futex. Check if the
+                        * OWNER_DIED bit is set to figure out whether
+                        * this is a robust futex or not.
+                        */
+                       if (get_futex_value_locked(&curval, uaddr))
+                               return -EFAULT;
+
+                       /*
+                        * We simply start over in case of a robust
+                        * futex. The code above will take the futex
+                        * and return happy.
+                        */
+                       if (curval & FUTEX_OWNER_DIED) {
+                               ownerdied = 1;
+                               goto retry;
+                       }
+               default:
+                       break;
+               }
+       }
+
+       return ret;
+}
+
  /*
   * The hash bucket lock must be held when this is called.
   * Afterwards, the futex_q must not be accessed.
   */
  static void wake_futex(struct futex_q *q)
  {
-       plist_del(&q->list, &q->list.plist);
+       struct task_struct *p = q->task;
+
         /*
-        * The lock in wake_up_all() is a crucial memory barrier after the
-        * plist_del() and also before assigning to q->lock_ptr.
+        * We set q->lock_ptr = NULL _before_ we wake up the task. If
+        * a non futex wake up happens on another CPU then the task
+        * might exit and p would dereference a non existing task
+        * struct. Prevent this by holding a reference on p across the
+        * wake up.
          */
-       wake_up(&q->waiter);
+       get_task_struct(p);
+
+       plist_del(&q->list, &q->list.plist);
         /*
-        * The waiting task can free the futex_q as soon as this is written,
-        * without taking any locks.  This must come last.
-        *
-        * A memory barrier is required here to prevent the following store to
-        * lock_ptr from getting ahead of the wakeup. Clearing the lock at the
-        * end of wake_up() does not prevent this store from moving.
+        * The waiting task can free the futex_q as soon as
+        * q->lock_ptr = NULL is written, without taking any locks. A
+        * memory barrier is required here to prevent the following
+        * store to lock_ptr from getting ahead of the plist_del.
          */
         smp_wmb();
         q->lock_ptr = NULL;
+
+       wake_up_state(p, TASK_NORMAL);
+       put_task_struct(p);
  }
  
  static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this)
@@ -689,7 +846,7 @@ static int futex_wake(u32 __user *uaddr, int fshared, int nr_wake, u32 bitset)
  
         plist_for_each_entry_safe(this, next, head, list) {
                 if (match_futex (&this->key, &key)) {
-                       if (this->pi_state) {
+                       if (this->pi_state || this->rt_waiter) {
                                 ret = -EINVAL;
                                 break;
                         }
@@ -802,24 +959,185 @@ out:
         return ret;
  }
  
-/*
- * Requeue all waiters hashed on one physical page to another
- * physical page.
+/**
+ * requeue_futex() - Requeue a futex_q from one hb to another
+ * @q:         the futex_q to requeue
+ * @hb1:       the source hash_bucket
+ * @hb2:       the target hash_bucket
+ * @key2:      the new key for the requeued futex_q
+ */
+static inline
+void requeue_futex(struct futex_q *q, struct futex_hash_bucket *hb1,
+                  struct futex_hash_bucket *hb2, union futex_key *key2)
+{
+
+       /*
+        * If key1 and key2 hash to the same bucket, no need to
+        * requeue.
+        */
+       if (likely(&hb1->chain != &hb2->chain)) {
+               plist_del(&q->list, &hb1->chain);
+               plist_add(&q->list, &hb2->chain);
+               q->lock_ptr = &hb2->lock;
+#ifdef CONFIG_DEBUG_PI_LIST
+               q->list.plist.lock = &hb2->lock;
+#endif
+       }
+       get_futex_key_refs(key2);
+       q->key = *key2;
+}
+
+/**
+ * requeue_pi_wake_futex() - Wake a task that acquired the lock during requeue
+ * q:  the futex_q
+ * key:        the key of the requeue target futex
+ *
+ * During futex_requeue, with requeue_pi=1, it is possible to acquire the
+ * target futex if it is uncontended or via a lock steal.  Set the futex_q key
+ * to the requeue target futex so the waiter can detect the wakeup on the right
+ * futex, but remove it from the hb and NULL the rt_waiter so it can detect
+ * atomic lock acquisition.  Must be called with the q->lock_ptr held.
+ */
+static inline
+void requeue_pi_wake_futex(struct futex_q *q, union futex_key *key)
+{
+       drop_futex_key_refs(&q->key);
+       get_futex_key_refs(key);
+       q->key = *key;
+
+       WARN_ON(plist_node_empty(&q->list));
+       plist_del(&q->list, &q->list.plist);
+
+       WARN_ON(!q->rt_waiter);
+       q->rt_waiter = NULL;
+
+       wake_up_state(q->task, TASK_NORMAL);
+}
+
+/**
+ * futex_proxy_trylock_atomic() - Attempt an atomic lock for the top waiter
+ * @pifutex:           the user address of the to futex
+ * @hb1:               the from futex hash bucket, must be locked by the caller
+ * @hb2:               the to futex hash bucket, must be locked by the caller
+ * @key1:              the from futex key
+ * @key2:              the to futex key
+ * @ps:                        address to store the pi_state pointer
+ * @set_waiters:       force setting the FUTEX_WAITERS bit (1) or not (0)
+ *
+ * Try and get the lock on behalf of the top waiter if we can do it atomically.
+ * Wake the top waiter if we succeed.  If the caller specified set_waiters,
+ * then direct futex_lock_pi_atomic() to force setting the FUTEX_WAITERS bit.
+ * hb1 and hb2 must be held by the caller.
+ *
+ * Returns:
+ *  0 - failed to acquire the lock atomicly
+ *  1 - acquired the lock
+ * <0 - error
+ */
+static int futex_proxy_trylock_atomic(u32 __user *pifutex,
+                                struct futex_hash_bucket *hb1,
+                                struct futex_hash_bucket *hb2,
+                                union futex_key *key1, union futex_key *key2,
+                                struct futex_pi_state **ps, int set_waiters)
+{
+       struct futex_q *top_waiter = NULL;
+       u32 curval;
+       int ret;
+
+       if (get_futex_value_locked(&curval, pifutex))
+               return -EFAULT;
+
+       /*
+        * Find the top_waiter and determine if there are additional waiters.
+        * If the caller intends to requeue more than 1 waiter to pifutex,
+        * force futex_lock_pi_atomic() to set the FUTEX_WAITERS bit now,
+        * as we have means to handle the possible fault.  If not, don't set
+        * the bit unecessarily as it will force the subsequent unlock to enter
+        * the kernel.
+        */
+       top_waiter = futex_top_waiter(hb1, key1);
+
+       /* There are no waiters, nothing for us to do. */
+       if (!top_waiter)
+               return 0;
+
+       /*
+        * Try to take the lock for top_waiter.  Set the FUTEX_WAITERS bit in
+        * the contended case or if set_waiters is 1.  The pi_state is returned
+        * in ps in contended cases.
+        */
+       ret = futex_lock_pi_atomic(pifutex, hb2, key2, ps, top_waiter->task,
+                                  set_waiters);
+       if (ret == 1)
+               requeue_pi_wake_futex(top_waiter, key2);
+
+       return ret;
+}
+
+/**
+ * futex_requeue() - Requeue waiters from uaddr1 to uaddr2
+ * uaddr1:     source futex user address
+ * uaddr2:     target futex user address
+ * nr_wake:    number of waiters to wake (must be 1 for requeue_pi)
+ * nr_requeue: number of waiters to requeue (0-INT_MAX)
+ * requeue_pi: if we are attempting to requeue from a non-pi futex to a
+ *             pi futex (pi to pi requeue is not supported)
+ *
+ * Requeue waiters on uaddr1 to uaddr2. In the requeue_pi case, try to acquire
+ * uaddr2 atomically on behalf of the top waiter.
+ *
+ * Returns:
+ * >=0 - on success, the number of tasks requeued or woken
+ *  <0 - on error
   */
  static int futex_requeue(u32 __user *uaddr1, int fshared, u32 __user *uaddr2,
-                        int nr_wake, int nr_requeue, u32 *cmpval)
+                        int nr_wake, int nr_requeue, u32 *cmpval,
+                        int requeue_pi)
  {
         union futex_key key1 = FUTEX_KEY_INIT, key2 = FUTEX_KEY_INIT;
+       int drop_count = 0, task_count = 0, ret;
+       struct futex_pi_state *pi_state = NULL;
         struct futex_hash_bucket *hb1, *hb2;
         struct plist_head *head1;
         struct futex_q *this, *next;
-       int ret, drop_count = 0;
+       u32 curval2;
+
+       if (requeue_pi) {
+               /*
+                * requeue_pi requires a pi_state, try to allocate it now
+                * without any locks in case it fails.
+                */
+               if (refill_pi_state_cache())
+                       return -ENOMEM;
+               /*
+                * requeue_pi must wake as many tasks as it can, up to nr_wake
+                * + nr_requeue, since it acquires the rt_mutex prior to
+                * returning to userspace, so as to not leave the rt_mutex with
+                * waiters and no owner.  However, second and third wake-ups
+                * cannot be predicted as they involve race conditions with the
+                * first wake and a fault while looking up the pi_state.  Both
+                * pthread_cond_signal() and pthread_cond_broadcast() should
+                * use nr_wake=1.
+                */
+               if (nr_wake != 1)
+                       return -EINVAL;
+       }
  
  retry:
+       if (pi_state != NULL) {
+               /*
+                * We will have to lookup the pi_state again, so free this one
+                * to keep the accounting correct.
+                */
+               free_pi_state(pi_state);
+               pi_state = NULL;
+       }
+
         ret = get_futex_key(uaddr1, fshared, &key1, VERIFY_READ);
         if (unlikely(ret != 0))
                 goto out;
-       ret = get_futex_key(uaddr2, fshared, &key2, VERIFY_READ);
+       ret = get_futex_key(uaddr2, fshared, &key2,
+                           requeue_pi ? VERIFY_WRITE : VERIFY_READ);
         if (unlikely(ret != 0))
                 goto out_put_key1;
  
@@ -854,32 +1172,99 @@ retry_private:
                 }
         }
  
+       if (requeue_pi && (task_count - nr_wake < nr_requeue)) {
+               /*
+                * Attempt to acquire uaddr2 and wake the top waiter. If we
+                * intend to requeue waiters, force setting the FUTEX_WAITERS
+                * bit.  We force this here where we are able to easily handle
+                * faults rather in the requeue loop below.
+                */
+               ret = futex_proxy_trylock_atomic(uaddr2, hb1, hb2, &key1,
+                                                &key2, &pi_state, nr_requeue);
+
+               /*
+                * At this point the top_waiter has either taken uaddr2 or is
+                * waiting on it.  If the former, then the pi_state will not
+                * exist yet, look it up one more time to ensure we have a
+                * reference to it.
+                */
+               if (ret == 1) {
+                       WARN_ON(pi_state);
+                       task_count++;
+                       ret = get_futex_value_locked(&curval2, uaddr2);
+                       if (!ret)
+                               ret = lookup_pi_state(curval2, hb2, &key2,
+                                                     &pi_state);
+               }
+
+               switch (ret) {
+               case 0:
+                       break;
+               case -EFAULT:
+                       double_unlock_hb(hb1, hb2);
+                       put_futex_key(fshared, &key2);
+                       put_futex_key(fshared, &key1);
+                       ret = get_user(curval2, uaddr2);
+                       if (!ret)
+                               goto retry;
+                       goto out;
+               case -EAGAIN:
+                       /* The owner was exiting, try again. */
+                       double_unlock_hb(hb1, hb2);
+                       put_futex_key(fshared, &key2);
+                       put_futex_key(fshared, &key1);
+                       cond_resched();
+                       goto retry;
+               default:
+                       goto out_unlock;
+               }
+       }
+
         head1 = &hb1->chain;
         plist_for_each_entry_safe(this, next, head1, list) {
-               if (!match_futex (&this->key, &key1))
+               if (task_count - nr_wake >= nr_requeue)
+                       break;
+
+               if (!match_futex(&this->key, &key1))
                         continue;
-               if (++ret <= nr_wake) {
+
+               WARN_ON(!requeue_pi && this->rt_waiter);
+               WARN_ON(requeue_pi && !this->rt_waiter);
+
+               /*
+                * Wake nr_wake waiters.  For requeue_pi, if we acquired the
+                * lock, we already woke the top_waiter.  If not, it will be
+                * woken by futex_unlock_pi().
+                */
+               if (++task_count <= nr_wake && !requeue_pi) {
                         wake_futex(this);
-               } else {
-                       /*
-                        * If key1 and key2 hash to the same bucket, no need to
-                        * requeue.
-                        */
-                       if (likely(head1 != &hb2->chain)) {
-                               plist_del(&this->list, &hb1->chain);
-                               plist_add(&this->list, &hb2->chain);
-                               this->lock_ptr = &hb2->lock;
-#ifdef CONFIG_DEBUG_PI_LIST
-                               this->list.plist.lock = &hb2->lock;
-#endif
-                       }
-                       this->key = key2;
-                       get_futex_key_refs(&key2);
-                       drop_count++;
+                       continue;
+               }
  
-                       if (ret - nr_wake >= nr_requeue)
-                               break;
+               /*
+                * Requeue nr_requeue waiters and possibly one more in the case
+                * of requeue_pi if we couldn't acquire the lock atomically.
+                */
+               if (requeue_pi) {
+                       /* Prepare the waiter to take the rt_mutex. */
+                       atomic_inc(&pi_state->refcount);
+                       this->pi_state = pi_state;
+                       ret = rt_mutex_start_proxy_lock(&pi_state->pi_mutex,
+                                                       this->rt_waiter,
+                                                       this->task, 1);
+                       if (ret == 1) {
+                               /* We got the lock. */
+                               requeue_pi_wake_futex(this, &key2);
+                               continue;
+                       } else if (ret) {
+                               /* -EDEADLK */
+                               this->pi_state = NULL;
+                               free_pi_state(pi_state);
+                               goto out_unlock;
+                       }
                 }
+               requeue_futex(this, hb1, hb2, &key2);
+               drop_count++;
         }
  
  out_unlock:
@@ -899,7 +1284,9 @@ out_put_keys:
  out_put_key1:
         put_futex_key(fshared, &key1);
  out:
-       return ret;
+       if (pi_state != NULL)
+               free_pi_state(pi_state);
+       return ret ? ret : task_count;
  }
  
  /* The key must be already stored in q->key. */
@@ -907,8 +1294,6 @@ static inline struct futex_hash_bucket *queue_lock(struct futex_q *q)
  {
         struct futex_hash_bucket *hb;
  
-       init_waitqueue_head(&q->waiter);
-
         get_futex_key_refs(&q->key);
         hb = hash_futex(&q->key);
         q->lock_ptr = &hb->lock;
@@ -1119,39 +1504,153 @@ handle_fault:
   */
  #define FLAGS_SHARED           0x01
  #define FLAGS_CLOCKRT          0x02
+#define FLAGS_HAS_TIMEOUT      0x04
  
  static long futex_wait_restart(struct restart_block *restart);
  
-static int futex_wait(u32 __user *uaddr, int fshared,
-                     u32 val, ktime_t *abs_time, u32 bitset, int clockrt)
+/**
+ * fixup_owner() - Post lock pi_state and corner case management
+ * @uaddr:     user address of the futex
+ * @fshared:   whether the futex is shared (1) or not (0)
+ * @q:         futex_q (contains pi_state and access to the rt_mutex)
+ * @locked:    if the attempt to take the rt_mutex succeeded (1) or not (0)
+ *
+ * After attempting to lock an rt_mutex, this function is called to cleanup
+ * the pi_state owner as well as handle race conditions that may allow us to
+ * acquire the lock. Must be called with the hb lock held.
+ *
+ * Returns:
+ *  1 - success, lock taken
+ *  0 - success, lock not taken
+ * <0 - on error (-EFAULT)
+ */
+static int fixup_owner(u32 __user *uaddr, int fshared, struct futex_q *q,
+                      int locked)
  {
-       struct task_struct *curr = current;
-       struct restart_block *restart;
-       DECLARE_WAITQUEUE(wait, curr);
-       struct futex_hash_bucket *hb;
-       struct futex_q q;
-       u32 uval;
-       int ret;
-       struct hrtimer_sleeper t;
-       int rem = 0;
-
-       if (!bitset)
-               return -EINVAL;
+       struct task_struct *owner;
+       int ret = 0;
  
-       q.pi_state = NULL;
-       q.bitset = bitset;
-retry:
-       q.key = FUTEX_KEY_INIT;
-       ret = get_futex_key(uaddr, fshared, &q.key, VERIFY_READ);
-       if (unlikely(ret != 0))
+       if (locked) {
+               /*
+                * Got the lock. We might not be the anticipated owner if we
+                * did a lock-steal - fix up the PI-state in that case:
+                */
+               if (q->pi_state->owner != current)
+                       ret = fixup_pi_state_owner(uaddr, q, current, fshared);
                 goto out;
+       }
  
-retry_private:
-       hb = queue_lock(&q);
+       /*
+        * Catch the rare case, where the lock was released when we were on the
+        * way back before we locked the hash bucket.
+        */
+       if (q->pi_state->owner == current) {
+               /*
+                * Try to get the rt_mutex now. This might fail as some other
+                * task acquired the rt_mutex after we removed ourself from the
+                * rt_mutex waiters list.
+                */
+               if (rt_mutex_trylock(&q->pi_state->pi_mutex)) {
+                       locked = 1;
+                       goto out;
+               }
+
+               /*
+                * pi_state is incorrect, some other task did a lock steal and
+                * we returned due to timeout or signal without taking the
+                * rt_mutex. Too late. We can access the rt_mutex_owner without
+                * locking, as the other task is now blocked on the hash bucket
+                * lock. Fix the state up.
+                */
+               owner = rt_mutex_owner(&q->pi_state->pi_mutex);
+               ret = fixup_pi_state_owner(uaddr, q, owner, fshared);
+               goto out;
+       }
  
         /*
-        * Access the page AFTER the hash-bucket is locked.
-        * Order is important:
+        * Paranoia check. If we did not take the lock, then we should not be
+        * the owner, nor the pending owner, of the rt_mutex.
+        */
+       if (rt_mutex_owner(&q->pi_state->pi_mutex) == current)
+               printk(KERN_ERR "fixup_owner: ret = %d pi-mutex: %p "
+                               "pi-state %p\n", ret,
+                               q->pi_state->pi_mutex.owner,
+                               q->pi_state->owner);
+
+out:
+       return ret ? ret : locked;
+}
+
+/**
+ * futex_wait_queue_me() - queue_me() and wait for wakeup, timeout, or signal
+ * @hb:                the futex hash bucket, must be locked by the caller
+ * @q:         the futex_q to queue up on
+ * @timeout:   the prepared hrtimer_sleeper, or null for no timeout
+ */
+static void futex_wait_queue_me(struct futex_hash_bucket *hb, struct futex_q *q,
+                               struct hrtimer_sleeper *timeout)
+{
+       queue_me(q, hb);
+
+       /*
+        * There might have been scheduling since the queue_me(), as we
+        * cannot hold a spinlock across the get_user() in case it
+        * faults, and we cannot just set TASK_INTERRUPTIBLE state when
+        * queueing ourselves into the futex hash. This code thus has to
+        * rely on the futex_wake() code removing us from hash when it
+        * wakes us up.
+        */
+       set_current_state(TASK_INTERRUPTIBLE);
+
+       /* Arm the timer */
+       if (timeout) {
+               hrtimer_start_expires(&timeout->timer, HRTIMER_MODE_ABS);
+               if (!hrtimer_active(&timeout->timer))
+                       timeout->task = NULL;
+       }
+
+       /*
+        * !plist_node_empty() is safe here without any lock.
+        * q.lock_ptr != 0 is not safe, because of ordering against wakeup.
+        */
+       if (likely(!plist_node_empty(&q->list))) {
+               /*
+                * If the timer has already expired, current will already be
+                * flagged for rescheduling. Only call schedule if there
+                * is no timeout, or if it has yet to expire.
+                */
+               if (!timeout || timeout->task)
+                       schedule();
+       }
+       __set_current_state(TASK_RUNNING);
+}
+
+/**
+ * futex_wait_setup() - Prepare to wait on a futex
+ * @uaddr:     the futex userspace address
+ * @val:       the expected value
+ * @fshared:   whether the futex is shared (1) or not (0)
+ * @q:         the associated futex_q
+ * @hb:                storage for hash_bucket pointer to be returned to caller
+ *
+ * Setup the futex_q and locate the hash_bucket.  Get the futex value and
+ * compare it with the expected value.  Handle atomic faults internally.
+ * Return with the hb lock held and a q.key reference on success, and unlocked
+ * with no q.key reference on failure.
+ *
+ * Returns:
+ *  0 - uaddr contains val and hb has been locked
+ * <1 - -EFAULT or -EWOULDBLOCK (uaddr does not contain val) and hb is unlcoked
+ */
+static int futex_wait_setup(u32 __user *uaddr, u32 val, int fshared,
+                          struct futex_q *q, struct futex_hash_bucket **hb)
+{
+       u32 uval;
+       int ret;
+
+       /*
+        * Access the page AFTER the hash-bucket is locked.
+        * Order is important:
          *
          *   Userspace waiter: val = var; if (cond(val)) futex_wait(&var, val);
          *   Userspace waker:  if (cond(var)) { var = new; futex_wake(&var); }
@@ -1165,95 +1664,83 @@ retry_private:
          * A consequence is that futex_wait() can return zero and absorb
          * a wakeup when *uaddr != val on entry to the syscall.  This is
          * rare, but normal.
-        *
-        * For shared futexes, we hold the mmap semaphore, so the mapping
-        * cannot have changed since we looked it up in get_futex_key.
          */
+retry:
+       q->key = FUTEX_KEY_INIT;
+       ret = get_futex_key(uaddr, fshared, &q->key, VERIFY_READ);
+       if (unlikely(ret != 0))
+               return ret;
+
+retry_private:
+       *hb = queue_lock(q);
+
         ret = get_futex_value_locked(&uval, uaddr);
  
-       if (unlikely(ret)) {
-               queue_unlock(&q, hb);
+       if (ret) {
+               queue_unlock(q, *hb);
  
                 ret = get_user(uval, uaddr);
                 if (ret)
-                       goto out_put_key;
+                       goto out;
  
                 if (!fshared)
                         goto retry_private;
  
-               put_futex_key(fshared, &q.key);
+               put_futex_key(fshared, &q->key);
                 goto retry;
         }
-       ret = -EWOULDBLOCK;
-       if (unlikely(uval != val)) {
-               queue_unlock(&q, hb);
-               goto out_put_key;
-       }
  
-       /* Only actually queue if *uaddr contained val.  */
-       queue_me(&q, hb);
+       if (uval != val) {
+               queue_unlock(q, *hb);
+               ret = -EWOULDBLOCK;
+       }
  
-       /*
-        * There might have been scheduling since the queue_me(), as we
-        * cannot hold a spinlock across the get_user() in case it
-        * faults, and we cannot just set TASK_INTERRUPTIBLE state when
-        * queueing ourselves into the futex hash.  This code thus has to
-        * rely on the futex_wake() code removing us from hash when it
-        * wakes us up.
-        */
+out:
+       if (ret)
+               put_futex_key(fshared, &q->key);
+       return ret;
+}
  
-       /* add_wait_queue is the barrier after __set_current_state. */
-       __set_current_state(TASK_INTERRUPTIBLE);
-       add_wait_queue(&q.waiter, &wait);
-       /*
-        * !plist_node_empty() is safe here without any lock.
-        * q.lock_ptr != 0 is not safe, because of ordering against wakeup.
-        */
-       if (likely(!plist_node_empty(&q.list))) {
-               if (!abs_time)
-                       schedule();
-               else {
-                       hrtimer_init_on_stack(&t.timer,
-                                             clockrt ? CLOCK_REALTIME :
-                                             CLOCK_MONOTONIC,
-                                             HRTIMER_MODE_ABS);
-                       hrtimer_init_sleeper(&t, current);
-                       hrtimer_set_expires_range_ns(&t.timer, *abs_time,
-                                                    current->timer_slack_ns);
-
-                       hrtimer_start_expires(&t.timer, HRTIMER_MODE_ABS);
-                       if (!hrtimer_active(&t.timer))
-                               t.task = NULL;
+static int futex_wait(u32 __user *uaddr, int fshared,
+                     u32 val, ktime_t *abs_time, u32 bitset, int clockrt)
+{
+       struct hrtimer_sleeper timeout, *to = NULL;
+       struct restart_block *restart;
+       struct futex_hash_bucket *hb;
+       struct futex_q q;
+       int ret;
  
-                       /*
-                        * the timer could have already expired, in which
-                        * case current would be flagged for rescheduling.
-                        * Don't bother calling schedule.
-                        */
-                       if (likely(t.task))
-                               schedule();
+       if (!bitset)
+               return -EINVAL;
  
-                       hrtimer_cancel(&t.timer);
+       q.pi_state = NULL;
+       q.bitset = bitset;
+       q.rt_waiter = NULL;
  
-                       /* Flag if a timeout occured */
-                       rem = (t.task == NULL);
+       if (abs_time) {
+               to = &timeout;
  
-                       destroy_hrtimer_on_stack(&t.timer);
-               }
+               hrtimer_init_on_stack(&to->timer, clockrt ? CLOCK_REALTIME :
+                                     CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
+               hrtimer_init_sleeper(to, current);
+               hrtimer_set_expires_range_ns(&to->timer, *abs_time,
+                                            current->timer_slack_ns);
         }
-       __set_current_state(TASK_RUNNING);
  
-       /*
-        * NOTE: we don't remove ourselves from the waitqueue because
-        * we are the only user of it.
-        */
+       /* Prepare to wait on uaddr. */
+       ret = futex_wait_setup(uaddr, val, fshared, &q, &hb);
+       if (ret)
+               goto out;
+
+       /* queue_me and wait for wakeup, timeout, or a signal. */
+       futex_wait_queue_me(hb, &q, to);
  
         /* If we were woken (and unqueued), we succeeded, whatever. */
         ret = 0;
         if (!unqueue_me(&q))
                 goto out_put_key;
         ret = -ETIMEDOUT;
-       if (rem)
+       if (to && !to->task)
                 goto out_put_key;
  
         /*
@@ -1270,7 +1757,7 @@ retry_private:
         restart->futex.val = val;
         restart->futex.time = abs_time->tv64;
         restart->futex.bitset = bitset;
-       restart->futex.flags = 0;
+       restart->futex.flags = FLAGS_HAS_TIMEOUT;
  
         if (fshared)
                 restart->futex.flags |= FLAGS_SHARED;
@@ -1282,6 +1769,10 @@ retry_private:
  out_put_key:
         put_futex_key(fshared, &q.key);
  out:
+       if (to) {
+               hrtimer_cancel(&to->timer);
+               destroy_hrtimer_on_stack(&to->timer);
+       }
         return ret;
  }
  
@@ -1290,13 +1781,16 @@ static long futex_wait_restart(struct restart_block *restart)
  {
         u32 __user *uaddr = (u32 __user *)restart->futex.uaddr;
         int fshared = 0;
-       ktime_t t;
+       ktime_t t, *tp = NULL;
  
-       t.tv64 = restart->futex.time;
+       if (restart->futex.flags & FLAGS_HAS_TIMEOUT) {
+               t.tv64 = restart->futex.time;
+               tp = &t;
+       }
         restart->fn = do_no_restart_syscall;
         if (restart->futex.flags & FLAGS_SHARED)
                 fshared = 1;
-       return (long)futex_wait(uaddr, fshared, restart->futex.val, &t,
+       return (long)futex_wait(uaddr, fshared, restart->futex.val, tp,
                                 restart->futex.bitset,
                                 restart->futex.flags & FLAGS_CLOCKRT);
  }
@@ -1312,11 +1806,10 @@ static int futex_lock_pi(u32 __user *uaddr, int fshared,
                          int detect, ktime_t *time, int trylock)
  {
         struct hrtimer_sleeper timeout, *to = NULL;
-       struct task_struct *curr = current;
         struct futex_hash_bucket *hb;
-       u32 uval, newval, curval;
+       u32 uval;
         struct futex_q q;
-       int ret, lock_taken, ownerdied = 0;
+       int res, ret;
  
         if (refill_pi_state_cache())
                 return -ENOMEM;
@@ -1330,6 +1823,7 @@ static int futex_lock_pi(u32 __user *uaddr, int fshared,
         }
  
         q.pi_state = NULL;
+       q.rt_waiter = NULL;
  retry:
         q.key = FUTEX_KEY_INIT;
         ret = get_futex_key(uaddr, fshared, &q.key, VERIFY_WRITE);
@@ -1339,81 +1833,15 @@ retry:
  retry_private:
         hb = queue_lock(&q);
  
-retry_locked:
-       ret = lock_taken = 0;
-
-       /*
-        * To avoid races, we attempt to take the lock here again
-        * (by doing a 0 -> TID atomic cmpxchg), while holding all
-        * the locks. It will most likely not succeed.
-        */
-       newval = task_pid_vnr(current);
-
-       curval = cmpxchg_futex_value_locked(uaddr, 0, newval);
-
-       if (unlikely(curval == -EFAULT))
-               goto uaddr_faulted;
-
-       /*
-        * Detect deadlocks. In case of REQUEUE_PI this is a valid
-        * situation and we return success to user space.
-        */
-       if (unlikely((curval & FUTEX_TID_MASK) == task_pid_vnr(current))) {
-               ret = -EDEADLK;
-               goto out_unlock_put_key;
-       }
-
-       /*
-        * Surprise - we got the lock. Just return to userspace:
-        */
-       if (unlikely(!curval))
-               goto out_unlock_put_key;
-
-       uval = curval;
-
-       /*
-        * Set the WAITERS flag, so the owner will know it has someone
-        * to wake at next unlock
-        */
-       newval = curval | FUTEX_WAITERS;
-
-       /*
-        * There are two cases, where a futex might have no owner (the
-        * owner TID is 0): OWNER_DIED. We take over the futex in this
-        * case. We also do an unconditional take over, when the owner
-        * of the futex died.
-        *
-        * This is safe as we are protected by the hash bucket lock !
-        */
-       if (unlikely(ownerdied || !(curval & FUTEX_TID_MASK))) {
-               /* Keep the OWNER_DIED bit */
-               newval = (curval & ~FUTEX_TID_MASK) | task_pid_vnr(current);
-               ownerdied = 0;
-               lock_taken = 1;
-       }
-
-       curval = cmpxchg_futex_value_locked(uaddr, uval, newval);
-
-       if (unlikely(curval == -EFAULT))
-               goto uaddr_faulted;
-       if (unlikely(curval != uval))
-               goto retry_locked;
-
-       /*
-        * We took the lock due to owner died take over.
-        */
-       if (unlikely(lock_taken))
-               goto out_unlock_put_key;
-
-       /*
-        * We dont have the lock. Look up the PI state (or create it if
-        * we are the first waiter):
-        */
-       ret = lookup_pi_state(uval, hb, &q.key, &q.pi_state);
-
+       ret = futex_lock_pi_atomic(uaddr, hb, &q.key, &q.pi_state, current, 0);
         if (unlikely(ret)) {
                 switch (ret) {
-
+               case 1:
+                       /* We got the lock. */
+                       ret = 0;
+                       goto out_unlock_put_key;
+               case -EFAULT:
+                       goto uaddr_faulted;
                 case -EAGAIN:
                         /*
                          * Task is exiting and we just wait for the
@@ -1423,25 +1851,6 @@ retry_locked:
                         put_futex_key(fshared, &q.key);
                         cond_resched();
                         goto retry;
-
-               case -ESRCH:
-                       /*
-                        * No owner found for this futex. Check if the
-                        * OWNER_DIED bit is set to figure out whether
-                        * this is a robust futex or not.
-                        */
-                       if (get_futex_value_locked(&curval, uaddr))
-                               goto uaddr_faulted;
-
-                       /*
-                        * We simply start over in case of a robust
-                        * futex. The code above will take the futex
-                        * and return happy.
-                        */
-                       if (curval & FUTEX_OWNER_DIED) {
-                               ownerdied = 1;
-                               goto retry_locked;
-                       }
                 default:
                         goto out_unlock_put_key;
                 }
@@ -1465,71 +1874,21 @@ retry_locked:
         }
  
         spin_lock(q.lock_ptr);
-
-       if (!ret) {
-               /*
-                * Got the lock. We might not be the anticipated owner
-                * if we did a lock-steal - fix up the PI-state in
-                * that case:
-                */
-               if (q.pi_state->owner != curr)
-                       ret = fixup_pi_state_owner(uaddr, &q, curr, fshared);
-       } else {
-               /*
-                * Catch the rare case, where the lock was released
-                * when we were on the way back before we locked the
-                * hash bucket.
-                */
-               if (q.pi_state->owner == curr) {
-                       /*
-                        * Try to get the rt_mutex now. This might
-                        * fail as some other task acquired the
-                        * rt_mutex after we removed ourself from the
-                        * rt_mutex waiters list.
-                        */
-                       if (rt_mutex_trylock(&q.pi_state->pi_mutex))
-                               ret = 0;
-                       else {
-                               /*
-                                * pi_state is incorrect, some other
-                                * task did a lock steal and we
-                                * returned due to timeout or signal
-                                * without taking the rt_mutex. Too
-                                * late. We can access the
-                                * rt_mutex_owner without locking, as
-                                * the other task is now blocked on
-                                * the hash bucket lock. Fix the state
-                                * up.
-                                */
-                               struct task_struct *owner;
-                               int res;
-
-                               owner = rt_mutex_owner(&q.pi_state->pi_mutex);
-                               res = fixup_pi_state_owner(uaddr, &q, owner,
-                                                          fshared);
-
-                               /* propagate -EFAULT, if the fixup failed */
-                               if (res)
-                                       ret = res;
-                       }
-               } else {
-                       /*
-                        * Paranoia check. If we did not take the lock
-                        * in the trylock above, then we should not be
-                        * the owner of the rtmutex, neither the real
-                        * nor the pending one:
-                        */
-                       if (rt_mutex_owner(&q.pi_state->pi_mutex) == curr)
-                               printk(KERN_ERR "futex_lock_pi: ret = %d "
-                                      "pi-mutex: %p pi-state %p\n", ret,
-                                      q.pi_state->pi_mutex.owner,
-                                      q.pi_state->owner);
-               }
-       }
+       /*
+        * Fixup the pi_state owner and possibly acquire the lock if we
+        * haven't already.
+        */
+       res = fixup_owner(uaddr, fshared, &q, !ret);
+       /*
+        * If fixup_owner() returned an error, proprogate that.  If it acquired
+        * the lock, clear our -ETIMEDOUT or -EINTR.
+        */
+       if (res)
+               ret = (res < 0) ? res : 0;
  
         /*
-        * If fixup_pi_state_owner() faulted and was unable to handle the
-        * fault, unlock it and return the fault to userspace.
+        * If fixup_owner() faulted and was unable to handle the fault, unlock
+        * it and return the fault to userspace.
          */
         if (ret && (rt_mutex_owner(&q.pi_state->pi_mutex) == current))
                 rt_mutex_unlock(&q.pi_state->pi_mutex);
@@ -1537,9 +1896,7 @@ retry_locked:
         /* Unqueue and drop the lock */
         unqueue_me_pi(&q);
  
-       if (to)
-               destroy_hrtimer_on_stack(&to->timer);
-       return ret != -EINTR ? ret : -ERESTARTNOINTR;
+       goto out;
  
  out_unlock_put_key:
         queue_unlock(&q, hb);
@@ -1549,7 +1906,7 @@ out_put_key:
  out:
         if (to)
                 destroy_hrtimer_on_stack(&to->timer);
-       return ret;
+       return ret != -EINTR ? ret : -ERESTARTNOINTR;
  
  uaddr_faulted:
         /*
@@ -1572,7 +1929,6 @@ uaddr_faulted:
         goto retry;
  }
  
-
  /*
   * Userspace attempted a TID -> 0 atomic transition, and failed.
   * This is the in-kernel slowpath: we look up the PI state (if any),
@@ -1674,6 +2030,229 @@ pi_faulted:
         return ret;
  }
  
+/**
+ * handle_early_requeue_pi_wakeup() - Detect early wakeup on the initial futex
+ * @hb:                the hash_bucket futex_q was original enqueued on
+ * @q:         the futex_q woken while waiting to be requeued
+ * @key2:      the futex_key of the requeue target futex
+ * @timeout:   the timeout associated with the wait (NULL if none)
+ *
+ * Detect if the task was woken on the initial futex as opposed to the requeue
+ * target futex.  If so, determine if it was a timeout or a signal that caused
+ * the wakeup and return the appropriate error code to the caller.  Must be
+ * called with the hb lock held.
+ *
+ * Returns
+ *  0 - no early wakeup detected
+ * <0 - -ETIMEDOUT or -ERESTARTNOINTR
+ */
+static inline
+int handle_early_requeue_pi_wakeup(struct futex_hash_bucket *hb,
+                                  struct futex_q *q, union futex_key *key2,
+                                  struct hrtimer_sleeper *timeout)
+{
+       int ret = 0;
+
+       /*
+        * With the hb lock held, we avoid races while we process the wakeup.
+        * We only need to hold hb (and not hb2) to ensure atomicity as the
+        * wakeup code can't change q.key from uaddr to uaddr2 if we hold hb.
+        * It can't be requeued from uaddr2 to something else since we don't
+        * support a PI aware source futex for requeue.
+        */
+       if (!match_futex(&q->key, key2)) {
+               WARN_ON(q->lock_ptr && (&hb->lock != q->lock_ptr));
+               /*
+                * We were woken prior to requeue by a timeout or a signal.
+                * Unqueue the futex_q and determine which it was.
+                */
+               plist_del(&q->list, &q->list.plist);
+               drop_futex_key_refs(&q->key);
+
+               if (timeout && !timeout->task)
+                       ret = -ETIMEDOUT;
+               else
+                       ret = -ERESTARTNOINTR;
+       }
+       return ret;
+}
+
+/**
+ * futex_wait_requeue_pi() - Wait on uaddr and take uaddr2
+ * @uaddr:     the futex we initialyl wait on (non-pi)
+ * @fshared:   whether the futexes are shared (1) or not (0).  They must be
+ *             the same type, no requeueing from private to shared, etc.
+ * @val:       the expected value of uaddr
+ * @abs_time:  absolute timeout
+ * @bitset:    32 bit wakeup bitset set by userspace, defaults to all.
+ * @clockrt:   whether to use CLOCK_REALTIME (1) or CLOCK_MONOTONIC (0)
+ * @uaddr2:    the pi futex we will take prior to returning to user-space
+ *
+ * The caller will wait on uaddr and will be requeued by futex_requeue() to
+ * uaddr2 which must be PI aware.  Normal wakeup will wake on uaddr2 and
+ * complete the acquisition of the rt_mutex prior to returning to userspace.
+ * This ensures the rt_mutex maintains an owner when it has waiters; without
+ * one, the pi logic wouldn't know which task to boost/deboost, if there was a
+ * need to.
+ *
+ * We call schedule in futex_wait_queue_me() when we enqueue and return there
+ * via the following:
+ * 1) wakeup on uaddr2 after an atomic lock acquisition by futex_requeue()
+ * 2) wakeup on uaddr2 after a requeue and subsequent unlock
+ * 3) signal (before or after requeue)
+ * 4) timeout (before or after requeue)
+ *
+ * If 3, we setup a restart_block with futex_wait_requeue_pi() as the function.
+ *
+ * If 2, we may then block on trying to take the rt_mutex and return via:
+ * 5) successful lock
+ * 6) signal
+ * 7) timeout
+ * 8) other lock acquisition failure
+ *
+ * If 6, we setup a restart_block with futex_lock_pi() as the function.
+ *
+ * If 4 or 7, we cleanup and return with -ETIMEDOUT.
+ *
+ * Returns:
+ *  0 - On success
+ * <0 - On error
+ */
+static int futex_wait_requeue_pi(u32 __user *uaddr, int fshared,
+                                u32 val, ktime_t *abs_time, u32 bitset,
+                                int clockrt, u32 __user *uaddr2)
+{
+       struct hrtimer_sleeper timeout, *to = NULL;
+       struct rt_mutex_waiter rt_waiter;
+       struct rt_mutex *pi_mutex = NULL;
+       struct futex_hash_bucket *hb;
+       union futex_key key2;
+       struct futex_q q;
+       int res, ret;
+
+       if (!bitset)
+               return -EINVAL;
+
+       if (abs_time) {
+               to = &timeout;
+               hrtimer_init_on_stack(&to->timer, clockrt ? CLOCK_REALTIME :
+                                     CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
+               hrtimer_init_sleeper(to, current);
+               hrtimer_set_expires_range_ns(&to->timer, *abs_time,
+                                            current->timer_slack_ns);
+       }
+
+       /*
+        * The waiter is allocated on our stack, manipulated by the requeue
+        * code while we sleep on uaddr.
+        */
+       debug_rt_mutex_init_waiter(&rt_waiter);
+       rt_waiter.task = NULL;
+
+       q.pi_state = NULL;
+       q.bitset = bitset;
+       q.rt_waiter = &rt_waiter;
+
+       key2 = FUTEX_KEY_INIT;
+       ret = get_futex_key(uaddr2, fshared, &key2, VERIFY_WRITE);
+       if (unlikely(ret != 0))
+               goto out;
+
+       /* Prepare to wait on uaddr. */
+       ret = futex_wait_setup(uaddr, val, fshared, &q, &hb);
+       if (ret)
+               goto out_key2;
+
+       /* Queue the futex_q, drop the hb lock, wait for wakeup. */
+       futex_wait_queue_me(hb, &q, to);
+
+       spin_lock(&hb->lock);
+       ret = handle_early_requeue_pi_wakeup(hb, &q, &key2, to);
+       spin_unlock(&hb->lock);
+       if (ret)
+               goto out_put_keys;
+
+       /*
+        * In order for us to be here, we know our q.key == key2, and since
+        * we took the hb->lock above, we also know that futex_requeue() has
+        * completed and we no longer have to concern ourselves with a wakeup
+        * race with the atomic proxy lock acquition by the requeue code.
+        */
+
+       /* Check if the requeue code acquired the second futex for us. */
+       if (!q.rt_waiter) {
+               /*
+                * Got the lock. We might not be the anticipated owner if we
+                * did a lock-steal - fix up the PI-state in that case.
+                */
+               if (q.pi_state && (q.pi_state->owner != current)) {
+                       spin_lock(q.lock_ptr);
+                       ret = fixup_pi_state_owner(uaddr2, &q, current,
+                                                  fshared);
+                       spin_unlock(q.lock_ptr);
+               }
+       } else {
+               /*
+                * We have been woken up by futex_unlock_pi(), a timeout, or a
+                * signal.  futex_unlock_pi() will not destroy the lock_ptr nor
+                * the pi_state.
+                */
+               WARN_ON(!&q.pi_state);
+               pi_mutex = &q.pi_state->pi_mutex;
+               ret = rt_mutex_finish_proxy_lock(pi_mutex, to, &rt_waiter, 1);
+               debug_rt_mutex_free_waiter(&rt_waiter);
+
+               spin_lock(q.lock_ptr);
+               /*
+                * Fixup the pi_state owner and possibly acquire the lock if we
+                * haven't already.
+                */
+               res = fixup_owner(uaddr2, fshared, &q, !ret);
+               /*
+                * If fixup_owner() returned an error, proprogate that.  If it
+                * acquired the lock, clear our -ETIMEDOUT or -EINTR.
+                */
+               if (res)
+                       ret = (res < 0) ? res : 0;
+
+               /* Unqueue and drop the lock. */
+               unqueue_me_pi(&q);
+       }
+
+       /*
+        * If fixup_pi_state_owner() faulted and was unable to handle the
+        * fault, unlock the rt_mutex and return the fault to userspace.
+        */
+       if (ret == -EFAULT) {
+               if (rt_mutex_owner(pi_mutex) == current)
+                       rt_mutex_unlock(pi_mutex);
+       } else if (ret == -EINTR) {
+               /*
+                * We've already been requeued, but we have no way to
+                * restart by calling futex_lock_pi() directly. We
+                * could restart the syscall, but that will look at
+                * the user space value and return right away. So we
+                * drop back with EWOULDBLOCK to tell user space that
+                * "val" has been changed. That's the same what the
+                * restart of the syscall would do in
+                * futex_wait_setup().
+                */
+               ret = -EWOULDBLOCK;
+       }
+
+out_put_keys:
+       put_futex_key(fshared, &q.key);
+out_key2:
+       put_futex_key(fshared, &key2);
+
+out:
+       if (to) {
+               hrtimer_cancel(&to->timer);
+               destroy_hrtimer_on_stack(&to->timer);
+       }
+       return ret;
+}
+
  /*
   * Support for robust futexes: the kernel cleans up held futexes at
   * thread exit time.
@@ -1896,7 +2475,7 @@ long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout,
                 fshared = 1;
  
         clockrt = op & FUTEX_CLOCK_REALTIME;
-       if (clockrt && cmd != FUTEX_WAIT_BITSET)
+       if (clockrt && cmd != FUTEX_WAIT_BITSET && cmd != FUTEX_WAIT_REQUEUE_PI)
                 return -ENOSYS;
  
         switch (cmd) {
@@ -1911,10 +2490,11 @@ long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout,
                 ret = futex_wake(uaddr, fshared, val, val3);
                 break;
         case FUTEX_REQUEUE:
-               ret = futex_requeue(uaddr, fshared, uaddr2, val, val2, NULL);
+               ret = futex_requeue(uaddr, fshared, uaddr2, val, val2, NULL, 0);
                 break;
         case FUTEX_CMP_REQUEUE:
-               ret = futex_requeue(uaddr, fshared, uaddr2, val, val2, &val3);
+               ret = futex_requeue(uaddr, fshared, uaddr2, val, val2, &val3,
+                                   0);
                 break;
         case FUTEX_WAKE_OP:
                 ret = futex_wake_op(uaddr, fshared, uaddr2, val, val2, val3);
@@ -1931,6 +2511,15 @@ long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout,
                 if (futex_cmpxchg_enabled)
                         ret = futex_lock_pi(uaddr, fshared, 0, timeout, 1);
                 break;
+       case FUTEX_WAIT_REQUEUE_PI:
+               val3 = FUTEX_BITSET_MATCH_ANY;
+               ret = futex_wait_requeue_pi(uaddr, fshared, val, timeout, val3,
+                                           clockrt, uaddr2);
+               break;
+       case FUTEX_CMP_REQUEUE_PI:
+               ret = futex_requeue(uaddr, fshared, uaddr2, val, val2, &val3,
+                                   1);
+               break;
         default:
                 ret = -ENOSYS;
         }
@@ -1948,7 +2537,8 @@ SYSCALL_DEFINE6(futex, u32 __user *, uaddr, int, op, u32, val,
         int cmd = op & FUTEX_CMD_MASK;
  
         if (utime && (cmd == FUTEX_WAIT || cmd == FUTEX_LOCK_PI ||
-                     cmd == FUTEX_WAIT_BITSET)) {
+                     cmd == FUTEX_WAIT_BITSET ||
+                     cmd == FUTEX_WAIT_REQUEUE_PI)) {
                 if (copy_from_user(&ts, utime, sizeof(ts)) != 0)
                         return -EFAULT;
                 if (!timespec_valid(&ts))
@@ -1960,11 +2550,11 @@ SYSCALL_DEFINE6(futex, u32 __user *, uaddr, int, op, u32, val,
                 tp = &t;
         }
         /*
-        * requeue parameter in 'utime' if cmd == FUTEX_REQUEUE.
+        * requeue parameter in 'utime' if cmd == FUTEX_*_REQUEUE_*.
          * number of waiters to wake in 'utime' if cmd == FUTEX_WAKE_OP.
          */
         if (cmd == FUTEX_REQUEUE || cmd == FUTEX_CMP_REQUEUE ||
-           cmd == FUTEX_WAKE_OP)
+           cmd == FUTEX_CMP_REQUEUE_PI || cmd == FUTEX_WAKE_OP)
                 val2 = (u32) (unsigned long) utime;
  
         return do_futex(uaddr, op, val, tp, uaddr2, val2, val3);
diff --git a/kernel/irq/Makefile b/kernel/irq/Makefile

index 3394f8f52964b3d9b4433a3560ff94eb32b20cb8..7d047808419da88e273fba8cd1efd88d9aaa5bcd 100644 (file)
--- a/kernel/irq/Makefile
+++ b/kernel/irq/Makefile
@@ -3,5 +3,5 @@ obj-y := handle.o manage.o spurious.o resend.o chip.o devres.o
  obj-$(CONFIG_GENERIC_IRQ_PROBE) += autoprobe.o
  obj-$(CONFIG_PROC_FS) += proc.o
  obj-$(CONFIG_GENERIC_PENDING_IRQ) += migration.o
-obj-$(CONFIG_NUMA_MIGRATE_IRQ_DESC) += numa_migrate.o
+obj-$(CONFIG_NUMA_IRQ_DESC) += numa_migrate.o
  obj-$(CONFIG_PM_SLEEP) += pm.o
diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c

index c687ba4363f2b4a95a5c3988998286a1e8ab699b..13c68e71b726c674619633de2f6e200b00adc039 100644 (file)
--- a/kernel/irq/chip.c
+++ b/kernel/irq/chip.c
@@ -359,7 +359,6 @@ handle_level_irq(unsigned int irq, struct irq_desc *desc)
  
         spin_lock(&desc->lock);
         mask_ack_irq(desc, irq);
-       desc = irq_remap_to_desc(irq, desc);
  
         if (unlikely(desc->status & IRQ_INPROGRESS))
                 goto out_unlock;
@@ -438,7 +437,6 @@ handle_fasteoi_irq(unsigned int irq, struct irq_desc *desc)
         desc->status &= ~IRQ_INPROGRESS;
  out:
         desc->chip->eoi(irq);
-       desc = irq_remap_to_desc(irq, desc);
  
         spin_unlock(&desc->lock);
  }
@@ -475,7 +473,6 @@ handle_edge_irq(unsigned int irq, struct irq_desc *desc)
                     !desc->action)) {
                 desc->status |= (IRQ_PENDING | IRQ_MASKED);
                 mask_ack_irq(desc, irq);
-               desc = irq_remap_to_desc(irq, desc);
                 goto out_unlock;
         }
         kstat_incr_irqs_this_cpu(irq, desc);
@@ -483,7 +480,6 @@ handle_edge_irq(unsigned int irq, struct irq_desc *desc)
         /* Start handling the irq */
         if (desc->chip->ack)
                 desc->chip->ack(irq);
-       desc = irq_remap_to_desc(irq, desc);
  
         /* Mark the IRQ currently in progress.*/
         desc->status |= IRQ_INPROGRESS;
@@ -544,10 +540,8 @@ handle_percpu_irq(unsigned int irq, struct irq_desc *desc)
         if (!noirqdebug)
                 note_interrupt(irq, desc, action_ret);
  
-       if (desc->chip->eoi) {
+       if (desc->chip->eoi)
                 desc->chip->eoi(irq);
-               desc = irq_remap_to_desc(irq, desc);
-       }
  }
  
  void
@@ -582,10 +576,8 @@ __set_irq_handler(unsigned int irq, irq_flow_handler_t handle, int is_chained,
  
         /* Uninstall? */
         if (handle == handle_bad_irq) {
-               if (desc->chip != &no_irq_chip) {
+               if (desc->chip != &no_irq_chip)
                         mask_ack_irq(desc, irq);
-                       desc = irq_remap_to_desc(irq, desc);
-               }
                 desc->status |= IRQ_DISABLED;
                 desc->depth = 1;
         }
diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c

index 26e08754744fa13dd9da3e919be6adb3976b2b04..a60018402f4228b46d5870ee93f9e86ae417137a 100644 (file)
--- a/kernel/irq/handle.c
+++ b/kernel/irq/handle.c
@@ -11,14 +11,15 @@
   */
  
  #include <linux/irq.h>
+#include <linux/slab.h>
  #include <linux/module.h>
  #include <linux/random.h>
  #include <linux/interrupt.h>
  #include <linux/kernel_stat.h>
  #include <linux/rculist.h>
  #include <linux/hash.h>
-#include <trace/irq.h>
  #include <linux/bootmem.h>
+#include <trace/events/irq.h>
  
  #include "internals.h"
  
@@ -81,45 +82,48 @@ static struct irq_desc irq_desc_init = {
         .lock       = __SPIN_LOCK_UNLOCKED(irq_desc_init.lock),
  };
  
-void init_kstat_irqs(struct irq_desc *desc, int cpu, int nr)
+void __ref init_kstat_irqs(struct irq_desc *desc, int node, int nr)
  {
-       int node;
         void *ptr;
  
-       node = cpu_to_node(cpu);
-       ptr = kzalloc_node(nr * sizeof(*desc->kstat_irqs), GFP_ATOMIC, node);
+       if (slab_is_available())
+               ptr = kzalloc_node(nr * sizeof(*desc->kstat_irqs),
+                                  GFP_ATOMIC, node);
+       else
+               ptr = alloc_bootmem_node(NODE_DATA(node),
+                               nr * sizeof(*desc->kstat_irqs));
  
         /*
          * don't overwite if can not get new one
          * init_copy_kstat_irqs() could still use old one
          */
         if (ptr) {
-               printk(KERN_DEBUG "  alloc kstat_irqs on cpu %d node %d\n",
-                        cpu, node);
+               printk(KERN_DEBUG "  alloc kstat_irqs on node %d\n", node);
                 desc->kstat_irqs = ptr;
         }
  }
  
-static void init_one_irq_desc(int irq, struct irq_desc *desc, int cpu)
+static void init_one_irq_desc(int irq, struct irq_desc *desc, int node)
  {
         memcpy(desc, &irq_desc_init, sizeof(struct irq_desc));
  
         spin_lock_init(&desc->lock);
         desc->irq = irq;
  #ifdef CONFIG_SMP
-       desc->cpu = cpu;
+       desc->node = node;
  #endif
         lockdep_set_class(&desc->lock, &irq_desc_lock_class);
-       init_kstat_irqs(desc, cpu, nr_cpu_ids);
+       init_kstat_irqs(desc, node, nr_cpu_ids);
         if (!desc->kstat_irqs) {
                 printk(KERN_ERR "can not alloc kstat_irqs\n");
                 BUG_ON(1);
         }
-       if (!init_alloc_desc_masks(desc, cpu, false)) {
+       if (!alloc_desc_masks(desc, node, false)) {
                 printk(KERN_ERR "can not alloc irq_desc cpumasks\n");
                 BUG_ON(1);
         }
-       arch_init_chip_data(desc, cpu);
+       init_desc_masks(desc);
+       arch_init_chip_data(desc, node);
  }
  
  /*
@@ -169,7 +173,8 @@ int __init early_irq_init(void)
                 desc[i].irq = i;
                 desc[i].kstat_irqs = kstat_irqs_legacy + i * nr_cpu_ids;
                 lockdep_set_class(&desc[i].lock, &irq_desc_lock_class);
-               init_alloc_desc_masks(&desc[i], 0, true);
+               alloc_desc_masks(&desc[i], 0, true);
+               init_desc_masks(&desc[i]);
                 irq_desc_ptrs[i] = desc + i;
         }
  
@@ -187,11 +192,10 @@ struct irq_desc *irq_to_desc(unsigned int irq)
         return NULL;
  }
  
-struct irq_desc *irq_to_desc_alloc_cpu(unsigned int irq, int cpu)
+struct irq_desc * __ref irq_to_desc_alloc_node(unsigned int irq, int node)
  {
         struct irq_desc *desc;
         unsigned long flags;
-       int node;
  
         if (irq >= nr_irqs) {
                 WARN(1, "irq (%d) >= nr_irqs (%d) in irq_to_desc_alloc\n",
@@ -210,15 +214,17 @@ struct irq_desc *irq_to_desc_alloc_cpu(unsigned int irq, int cpu)
         if (desc)
                 goto out_unlock;
  
-       node = cpu_to_node(cpu);
-       desc = kzalloc_node(sizeof(*desc), GFP_ATOMIC, node);
-       printk(KERN_DEBUG "  alloc irq_desc for %d on cpu %d node %d\n",
-                irq, cpu, node);
+       if (slab_is_available())
+               desc = kzalloc_node(sizeof(*desc), GFP_ATOMIC, node);
+       else
+               desc = alloc_bootmem_node(NODE_DATA(node), sizeof(*desc));
+
+       printk(KERN_DEBUG "  alloc irq_desc for %d on node %d\n", irq, node);
         if (!desc) {
                 printk(KERN_ERR "can not alloc irq_desc\n");
                 BUG_ON(1);
         }
-       init_one_irq_desc(irq, desc, cpu);
+       init_one_irq_desc(irq, desc, node);
  
         irq_desc_ptrs[irq] = desc;
  
@@ -256,7 +262,8 @@ int __init early_irq_init(void)
  
         for (i = 0; i < count; i++) {
                 desc[i].irq = i;
-               init_alloc_desc_masks(&desc[i], 0, true);
+               alloc_desc_masks(&desc[i], 0, true);
+               init_desc_masks(&desc[i]);
                 desc[i].kstat_irqs = kstat_irqs_all[i];
         }
         return arch_early_irq_init();
@@ -267,7 +274,7 @@ struct irq_desc *irq_to_desc(unsigned int irq)
         return (irq < NR_IRQS) ? irq_desc + irq : NULL;
  }
  
-struct irq_desc *irq_to_desc_alloc_cpu(unsigned int irq, int cpu)
+struct irq_desc *irq_to_desc_alloc_node(unsigned int irq, int node)
  {
         return irq_to_desc(irq);
  }
@@ -348,9 +355,6 @@ static void warn_no_thread(unsigned int irq, struct irqaction *action)
                "but no thread function available.", irq, action->name);
  }
  
-DEFINE_TRACE(irq_handler_entry);
-DEFINE_TRACE(irq_handler_exit);
-
  /**
   * handle_IRQ_event - irq action chain handler
   * @irq:       the interrupt number
@@ -453,11 +457,8 @@ unsigned int __do_IRQ(unsigned int irq)
                 /*
                  * No locking required for CPU-local interrupts:
                  */
-               if (desc->chip->ack) {
+               if (desc->chip->ack)
                         desc->chip->ack(irq);
-                       /* get new one */
-                       desc = irq_remap_to_desc(irq, desc);
-               }
                 if (likely(!(desc->status & IRQ_DISABLED))) {
                         action_ret = handle_IRQ_event(irq, desc->action);
                         if (!noirqdebug)
@@ -468,10 +469,8 @@ unsigned int __do_IRQ(unsigned int irq)
         }
  
         spin_lock(&desc->lock);
-       if (desc->chip->ack) {
+       if (desc->chip->ack)
                 desc->chip->ack(irq);
-               desc = irq_remap_to_desc(irq, desc);
-       }
         /*
          * REPLAY is when Linux resends an IRQ that was dropped earlier
          * WAITING is used by probe to mark irqs that are being tested
diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h

index 01ce20eab38fed96a7b5a861940ba58d47c18fbf..73468253143ba55883072d1020791c11b1e30210 100644 (file)
--- a/kernel/irq/internals.h
+++ b/kernel/irq/internals.h
@@ -16,7 +16,7 @@ extern void __disable_irq(struct irq_desc *desc, unsigned int irq, bool susp);
  extern void __enable_irq(struct irq_desc *desc, unsigned int irq, bool resume);
  
  extern struct lock_class_key irq_desc_lock_class;
-extern void init_kstat_irqs(struct irq_desc *desc, int cpu, int nr);
+extern void init_kstat_irqs(struct irq_desc *desc, int node, int nr);
  extern void clear_kstat_irqs(struct irq_desc *desc);
  extern spinlock_t sparse_irq_lock;
  
@@ -42,6 +42,9 @@ static inline void unregister_handler_proc(unsigned int irq,
  
  extern int irq_select_affinity_usr(unsigned int irq);
  
+extern void
+irq_set_thread_affinity(struct irq_desc *desc, const struct cpumask *cpumask);
+
  /*
   * Debugging printout:
   */
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c

index 2734eca59243bd8b0783b3553622363b2ce8129e..aaf5c9d05770378b42acc2d41905ceb543f1f280 100644 (file)
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -80,7 +80,7 @@ int irq_can_set_affinity(unsigned int irq)
         return 1;
  }
  
-static void
+void
  irq_set_thread_affinity(struct irq_desc *desc, const struct cpumask *cpumask)
  {
         struct irqaction *action = desc->action;
@@ -109,17 +109,22 @@ int irq_set_affinity(unsigned int irq, const struct cpumask *cpumask)
         spin_lock_irqsave(&desc->lock, flags);
  
  #ifdef CONFIG_GENERIC_PENDING_IRQ
-       if (desc->status & IRQ_MOVE_PCNTXT)
-               desc->chip->set_affinity(irq, cpumask);
+       if (desc->status & IRQ_MOVE_PCNTXT) {
+               if (!desc->chip->set_affinity(irq, cpumask)) {
+                       cpumask_copy(desc->affinity, cpumask);
+                       irq_set_thread_affinity(desc, cpumask);
+               }
+       }
         else {
                 desc->status |= IRQ_MOVE_PENDING;
                 cpumask_copy(desc->pending_mask, cpumask);
         }
  #else
-       cpumask_copy(desc->affinity, cpumask);
-       desc->chip->set_affinity(irq, cpumask);
+       if (!desc->chip->set_affinity(irq, cpumask)) {
+               cpumask_copy(desc->affinity, cpumask);
+               irq_set_thread_affinity(desc, cpumask);
+       }
  #endif
-       irq_set_thread_affinity(desc, cpumask);
         desc->status |= IRQ_AFFINITY_SET;
         spin_unlock_irqrestore(&desc->lock, flags);
         return 0;
diff --git a/kernel/irq/migration.c b/kernel/irq/migration.c

index e05ad9be43b7a5fa7e6d6a24f94e49828a73572a..cfe767ca154501460f9f7e757c08d881611e9c95 100644 (file)
--- a/kernel/irq/migration.c
+++ b/kernel/irq/migration.c
@@ -1,5 +1,8 @@
  
  #include <linux/irq.h>
+#include <linux/interrupt.h>
+
+#include "internals.h"
  
  void move_masked_irq(int irq)
  {
@@ -39,11 +42,12 @@ void move_masked_irq(int irq)
          * masking the irqs.
          */
         if (likely(cpumask_any_and(desc->pending_mask, cpu_online_mask)
-                  < nr_cpu_ids)) {
-               cpumask_and(desc->affinity,
-                           desc->pending_mask, cpu_online_mask);
-               desc->chip->set_affinity(irq, desc->affinity);
-       }
+                  < nr_cpu_ids))
+               if (!desc->chip->set_affinity(irq, desc->pending_mask)) {
+                       cpumask_copy(desc->affinity, desc->pending_mask);
+                       irq_set_thread_affinity(desc, desc->pending_mask);
+               }
+
         cpumask_clear(desc->pending_mask);
  }
  
diff --git a/kernel/irq/numa_migrate.c b/kernel/irq/numa_migrate.c

index 44bbdcbaf8d2c8193f2d631b54061047546d0065..2f69bee57bf21ca5a42027834ddccee96670d989 100644 (file)
--- a/kernel/irq/numa_migrate.c
+++ b/kernel/irq/numa_migrate.c
@@ -15,9 +15,9 @@
  
  static void init_copy_kstat_irqs(struct irq_desc *old_desc,
                                  struct irq_desc *desc,
-                                int cpu, int nr)
+                                int node, int nr)
  {
-       init_kstat_irqs(desc, cpu, nr);
+       init_kstat_irqs(desc, node, nr);
  
         if (desc->kstat_irqs != old_desc->kstat_irqs)
                 memcpy(desc->kstat_irqs, old_desc->kstat_irqs,
@@ -34,20 +34,20 @@ static void free_kstat_irqs(struct irq_desc *old_desc, struct irq_desc *desc)
  }
  
  static bool init_copy_one_irq_desc(int irq, struct irq_desc *old_desc,
-                struct irq_desc *desc, int cpu)
+                struct irq_desc *desc, int node)
  {
         memcpy(desc, old_desc, sizeof(struct irq_desc));
-       if (!init_alloc_desc_masks(desc, cpu, false)) {
+       if (!alloc_desc_masks(desc, node, false)) {
                 printk(KERN_ERR "irq %d: can not get new irq_desc cpumask "
                                 "for migration.\n", irq);
                 return false;
         }
         spin_lock_init(&desc->lock);
-       desc->cpu = cpu;
+       desc->node = node;
         lockdep_set_class(&desc->lock, &irq_desc_lock_class);
-       init_copy_kstat_irqs(old_desc, desc, cpu, nr_cpu_ids);
+       init_copy_kstat_irqs(old_desc, desc, node, nr_cpu_ids);
         init_copy_desc_masks(old_desc, desc);
-       arch_init_copy_chip_data(old_desc, desc, cpu);
+       arch_init_copy_chip_data(old_desc, desc, node);
         return true;
  }
  
@@ -59,12 +59,11 @@ static void free_one_irq_desc(struct irq_desc *old_desc, struct irq_desc *desc)
  }
  
  static struct irq_desc *__real_move_irq_desc(struct irq_desc *old_desc,
-                                               int cpu)
+                                               int node)
  {
         struct irq_desc *desc;
         unsigned int irq;
         unsigned long flags;
-       int node;
  
         irq = old_desc->irq;
  
@@ -76,7 +75,6 @@ static struct irq_desc *__real_move_irq_desc(struct irq_desc *old_desc,
         if (desc && old_desc != desc)
                 goto out_unlock;
  
-       node = cpu_to_node(cpu);
         desc = kzalloc_node(sizeof(*desc), GFP_ATOMIC, node);
         if (!desc) {
                 printk(KERN_ERR "irq %d: can not get new irq_desc "
@@ -85,7 +83,7 @@ static struct irq_desc *__real_move_irq_desc(struct irq_desc *old_desc,
                 desc = old_desc;
                 goto out_unlock;
         }
-       if (!init_copy_one_irq_desc(irq, old_desc, desc, cpu)) {
+       if (!init_copy_one_irq_desc(irq, old_desc, desc, node)) {
                 /* still use old one */
                 kfree(desc);
                 desc = old_desc;
@@ -97,9 +95,7 @@ static struct irq_desc *__real_move_irq_desc(struct irq_desc *old_desc,
  
         /* free the old one */
         free_one_irq_desc(old_desc, desc);
-       spin_unlock(&old_desc->lock);
         kfree(old_desc);
-       spin_lock(&desc->lock);
  
         return desc;
  
@@ -109,24 +105,14 @@ out_unlock:
         return desc;
  }
  
-struct irq_desc *move_irq_desc(struct irq_desc *desc, int cpu)
+struct irq_desc *move_irq_desc(struct irq_desc *desc, int node)
  {
-       int old_cpu;
-       int node, old_node;
-
         /* those all static, do move them */
         if (desc->irq < NR_IRQS_LEGACY)
                 return desc;
  
-       old_cpu = desc->cpu;
-       if (old_cpu != cpu) {
-               node = cpu_to_node(cpu);
-               old_node = cpu_to_node(old_cpu);
-               if (old_node != node)
-                       desc = __real_move_irq_desc(desc, cpu);
-               else
-                       desc->cpu = cpu;
-       }
+       if (desc->node != node)
+               desc = __real_move_irq_desc(desc, node);
  
         return desc;
  }
diff --git a/kernel/kthread.c b/kernel/kthread.c

index 4ebaf8519abf64fb0cc4e93ef42d7ad0cf286c9f..41c88fe40500399c76f0382d51a3fb05aef03211 100644 (file)
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -13,7 +13,7 @@
  #include <linux/file.h>
  #include <linux/module.h>
  #include <linux/mutex.h>
-#include <trace/sched.h>
+#include <trace/events/sched.h>
  
  #define KTHREAD_NICE_LEVEL (-5)
  
@@ -21,9 +21,6 @@ static DEFINE_SPINLOCK(kthread_create_lock);
  static LIST_HEAD(kthread_create_list);
  struct task_struct *kthreadd_task;
  
-DEFINE_TRACE(sched_kthread_stop);
-DEFINE_TRACE(sched_kthread_stop_ret);
-
  struct kthread_create_info
  {
         /* Information passed to kthread() from kthreadd. */
diff --git a/kernel/lockdep.c b/kernel/lockdep.c

index accb40cdb12a4cd0c3d5fd880c706d986568557d..8bbeef996c76598b7e45e3b48186fc18c03b5be0 100644 (file)
--- a/kernel/lockdep.c
+++ b/kernel/lockdep.c
@@ -42,12 +42,14 @@
  #include <linux/hash.h>
  #include <linux/ftrace.h>
  #include <linux/stringify.h>
-#include <trace/lockdep.h>
  
  #include <asm/sections.h>
  
  #include "lockdep_internals.h"
  
+#define CREATE_TRACE_POINTS
+#include <trace/events/lockdep.h>
+
  #ifdef CONFIG_PROVE_LOCKING
  int prove_locking = 1;
  module_param(prove_locking, int, 0644);
@@ -2935,8 +2937,6 @@ void lock_set_class(struct lockdep_map *lock, const char *name,
  }
  EXPORT_SYMBOL_GPL(lock_set_class);
  
-DEFINE_TRACE(lock_acquire);
-
  /*
   * We are not always called with irqs disabled - do that here,
   * and also avoid lockdep recursion:
@@ -2963,8 +2963,6 @@ void lock_acquire(struct lockdep_map *lock, unsigned int subclass,
  }
  EXPORT_SYMBOL_GPL(lock_acquire);
  
-DEFINE_TRACE(lock_release);
-
  void lock_release(struct lockdep_map *lock, int nested,
                           unsigned long ip)
  {
@@ -3105,6 +3103,8 @@ found_it:
                 hlock->holdtime_stamp = now;
         }
  
+       trace_lock_acquired(lock, ip, waittime);
+
         stats = get_lock_stats(hlock_class(hlock));
         if (waittime) {
                 if (hlock->read)
@@ -3120,8 +3120,6 @@ found_it:
         lock->ip = ip;
  }
  
-DEFINE_TRACE(lock_contended);
-
  void lock_contended(struct lockdep_map *lock, unsigned long ip)
  {
         unsigned long flags;
@@ -3143,14 +3141,10 @@ void lock_contended(struct lockdep_map *lock, unsigned long ip)
  }
  EXPORT_SYMBOL_GPL(lock_contended);
  
-DEFINE_TRACE(lock_acquired);
-
  void lock_acquired(struct lockdep_map *lock, unsigned long ip)
  {
         unsigned long flags;
  
-       trace_lock_acquired(lock, ip);
-
         if (unlikely(!lock_stat))
                 return;
  
diff --git a/kernel/module.c b/kernel/module.c

index e797812a4d95f164bb377447a62de3089c0ba182..2383e60fcf3fcb8363d378902169f264454f2914 100644 (file)
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -18,6 +18,7 @@
  */
  #include <linux/module.h>
  #include <linux/moduleloader.h>
+#include <linux/ftrace_event.h>
  #include <linux/init.h>
  #include <linux/kallsyms.h>
  #include <linux/fs.h>
@@ -1489,9 +1490,6 @@ static void free_module(struct module *mod)
         /* Free any allocated parameters. */
         destroy_params(mod->kp, mod->num_kp);
  
-       /* release any pointers to mcount in this module */
-       ftrace_release(mod->module_core, mod->core_size);
-
         /* This may be NULL, but that's OK */
         module_free(mod, mod->module_init);
         kfree(mod->args);
@@ -1892,11 +1890,9 @@ static noinline struct module *load_module(void __user *umod,
         unsigned int symindex = 0;
         unsigned int strindex = 0;
         unsigned int modindex, versindex, infoindex, pcpuindex;
-       unsigned int num_mcount;
         struct module *mod;
         long err = 0;
         void *percpu = NULL, *ptr = NULL; /* Stops spurious gcc warning */
-       unsigned long *mseg;
         mm_segment_t old_fs;
  
         DEBUGP("load_module: umod=%p, len=%lu, uargs=%p\n",
@@ -2172,7 +2168,19 @@ static noinline struct module *load_module(void __user *umod,
                                         sizeof(*mod->tracepoints),
                                         &mod->num_tracepoints);
  #endif
-
+#ifdef CONFIG_EVENT_TRACING
+       mod->trace_events = section_objs(hdr, sechdrs, secstrings,
+                                        "_ftrace_events",
+                                        sizeof(*mod->trace_events),
+                                        &mod->num_trace_events);
+#endif
+#ifdef CONFIG_FTRACE_MCOUNT_RECORD
+       /* sechdrs[0].sh_size is always zero */
+       mod->ftrace_callsites = section_objs(hdr, sechdrs, secstrings,
+                                            "__mcount_loc",
+                                            sizeof(*mod->ftrace_callsites),
+                                            &mod->num_ftrace_callsites);
+#endif
  #ifdef CONFIG_MODVERSIONS
         if ((mod->num_syms && !mod->crcs)
             || (mod->num_gpl_syms && !mod->gpl_crcs)
@@ -2237,11 +2245,6 @@ static noinline struct module *load_module(void __user *umod,
                         dynamic_debug_setup(debug, num_debug);
         }
  
-       /* sechdrs[0].sh_size is always zero */
-       mseg = section_objs(hdr, sechdrs, secstrings, "__mcount_loc",
-                           sizeof(*mseg), &num_mcount);
-       ftrace_init_module(mod, mseg, mseg + num_mcount);
-
         err = module_finalize(hdr, sechdrs, mod);
         if (err < 0)
                 goto cleanup;
@@ -2302,7 +2305,6 @@ static noinline struct module *load_module(void __user *umod,
   cleanup:
         kobject_del(&mod->mkobj.kobj);
         kobject_put(&mod->mkobj.kobj);
-       ftrace_release(mod->module_core, mod->core_size);
   free_unload:
         module_unload_free(mod);
  #if defined(CONFIG_MODULE_UNLOAD) && defined(CONFIG_SMP)
diff --git a/kernel/mutex.c b/kernel/mutex.c

index 507cf2b5e9f1e6328b2e335a3a05c0ead6a0ff3e..947b3ad551f8a925c39ef6f53f4f37c16f8a3ea3 100644 (file)
--- a/kernel/mutex.c
+++ b/kernel/mutex.c
@@ -89,7 +89,7 @@ __mutex_lock_slowpath(atomic_t *lock_count);
   *
   * This function is similar to (but not equivalent to) down().
   */
-void inline __sched mutex_lock(struct mutex *lock)
+void __sched mutex_lock(struct mutex *lock)
  {
         might_sleep();
         /*
@@ -249,7 +249,9 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass,
  
                 /* didnt get the lock, go to sleep: */
                 spin_unlock_mutex(&lock->wait_lock, flags);
-               __schedule();
+               preempt_enable_no_resched();
+               schedule();
+               preempt_disable();
                 spin_lock_mutex(&lock->wait_lock, flags);
         }
  
@@ -471,5 +473,28 @@ int __sched mutex_trylock(struct mutex *lock)
  
         return ret;
  }
-
  EXPORT_SYMBOL(mutex_trylock);
+
+/**
+ * atomic_dec_and_mutex_lock - return holding mutex if we dec to 0
+ * @cnt: the atomic which we are to dec
+ * @lock: the mutex to return holding if we dec to 0
+ *
+ * return true and hold lock if we dec to 0, return false otherwise
+ */
+int atomic_dec_and_mutex_lock(atomic_t *cnt, struct mutex *lock)
+{
+       /* dec if we can't possibly hit 0 */
+       if (atomic_add_unless(cnt, -1, 1))
+               return 0;
+       /* we might hit 0, so take the lock */
+       mutex_lock(lock);
+       if (!atomic_dec_and_test(cnt)) {
+               /* when we actually did the dec, we didn't hit 0 */
+               mutex_unlock(lock);
+               return 0;
+       }
+       /* we hit 0, and we hold the lock */
+       return 1;
+}
+EXPORT_SYMBOL(atomic_dec_and_mutex_lock);
diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c

new file mode 100644 (file)

index 0000000..ef5d8a5
--- /dev/null
+++ b/kernel/perf_counter.c
@@ -0,0 +1,4260 @@
+/*
+ * Performance counter core code
+ *
+ *  Copyright (C) 2008 Thomas Gleixner <tglx@linutronix.de>
+ *  Copyright (C) 2008-2009 Red Hat, Inc., Ingo Molnar
+ *  Copyright (C) 2008-2009 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com>
+ *  Copyright  ©  2009 Paul Mackerras, IBM Corp. <paulus@au1.ibm.com>
+ *
+ *  For licensing details see kernel-base/COPYING
+ */
+
+#include <linux/fs.h>
+#include <linux/mm.h>
+#include <linux/cpu.h>
+#include <linux/smp.h>
+#include <linux/file.h>
+#include <linux/poll.h>
+#include <linux/sysfs.h>
+#include <linux/dcache.h>
+#include <linux/percpu.h>
+#include <linux/ptrace.h>
+#include <linux/vmstat.h>
+#include <linux/hardirq.h>
+#include <linux/rculist.h>
+#include <linux/uaccess.h>
+#include <linux/syscalls.h>
+#include <linux/anon_inodes.h>
+#include <linux/kernel_stat.h>
+#include <linux/perf_counter.h>
+
+#include <asm/irq_regs.h>
+
+/*
+ * Each CPU has a list of per CPU counters:
+ */
+DEFINE_PER_CPU(struct perf_cpu_context, perf_cpu_context);
+
+int perf_max_counters __read_mostly = 1;
+static int perf_reserved_percpu __read_mostly;
+static int perf_overcommit __read_mostly = 1;
+
+static atomic_t nr_counters __read_mostly;
+static atomic_t nr_mmap_counters __read_mostly;
+static atomic_t nr_comm_counters __read_mostly;
+
+/*
+ * perf counter paranoia level:
+ *  0 - not paranoid
+ *  1 - disallow cpu counters to unpriv
+ *  2 - disallow kernel profiling to unpriv
+ */
+int sysctl_perf_counter_paranoid __read_mostly;
+
+static inline bool perf_paranoid_cpu(void)
+{
+       return sysctl_perf_counter_paranoid > 0;
+}
+
+static inline bool perf_paranoid_kernel(void)
+{
+       return sysctl_perf_counter_paranoid > 1;
+}
+
+int sysctl_perf_counter_mlock __read_mostly = 512; /* 'free' kb per user */
+
+/*
+ * max perf counter sample rate
+ */
+int sysctl_perf_counter_sample_rate __read_mostly = 100000;
+
+static atomic64_t perf_counter_id;
+
+/*
+ * Lock for (sysadmin-configurable) counter reservations:
+ */
+static DEFINE_SPINLOCK(perf_resource_lock);
+
+/*
+ * Architecture provided APIs - weak aliases:
+ */
+extern __weak const struct pmu *hw_perf_counter_init(struct perf_counter *counter)
+{
+       return NULL;
+}
+
+void __weak hw_perf_disable(void)              { barrier(); }
+void __weak hw_perf_enable(void)               { barrier(); }
+
+void __weak hw_perf_counter_setup(int cpu)     { barrier(); }
+
+int __weak
+hw_perf_group_sched_in(struct perf_counter *group_leader,
+              struct perf_cpu_context *cpuctx,
+              struct perf_counter_context *ctx, int cpu)
+{
+       return 0;
+}
+
+void __weak perf_counter_print_debug(void)     { }
+
+static DEFINE_PER_CPU(int, disable_count);
+
+void __perf_disable(void)
+{
+       __get_cpu_var(disable_count)++;
+}
+
+bool __perf_enable(void)
+{
+       return !--__get_cpu_var(disable_count);
+}
+
+void perf_disable(void)
+{
+       __perf_disable();
+       hw_perf_disable();
+}
+
+void perf_enable(void)
+{
+       if (__perf_enable())
+               hw_perf_enable();
+}
+
+static void get_ctx(struct perf_counter_context *ctx)
+{
+       atomic_inc(&ctx->refcount);
+}
+
+static void free_ctx(struct rcu_head *head)
+{
+       struct perf_counter_context *ctx;
+
+       ctx = container_of(head, struct perf_counter_context, rcu_head);
+       kfree(ctx);
+}
+
+static void put_ctx(struct perf_counter_context *ctx)
+{
+       if (atomic_dec_and_test(&ctx->refcount)) {
+               if (ctx->parent_ctx)
+                       put_ctx(ctx->parent_ctx);
+               if (ctx->task)
+                       put_task_struct(ctx->task);
+               call_rcu(&ctx->rcu_head, free_ctx);
+       }
+}
+
+/*
+ * Get the perf_counter_context for a task and lock it.
+ * This has to cope with with the fact that until it is locked,
+ * the context could get moved to another task.
+ */
+static struct perf_counter_context *
+perf_lock_task_context(struct task_struct *task, unsigned long *flags)
+{
+       struct perf_counter_context *ctx;
+
+       rcu_read_lock();
+ retry:
+       ctx = rcu_dereference(task->perf_counter_ctxp);
+       if (ctx) {
+               /*
+                * If this context is a clone of another, it might
+                * get swapped for another underneath us by
+                * perf_counter_task_sched_out, though the
+                * rcu_read_lock() protects us from any context
+                * getting freed.  Lock the context and check if it
+                * got swapped before we could get the lock, and retry
+                * if so.  If we locked the right context, then it
+                * can't get swapped on us any more.
+                */
+               spin_lock_irqsave(&ctx->lock, *flags);
+               if (ctx != rcu_dereference(task->perf_counter_ctxp)) {
+                       spin_unlock_irqrestore(&ctx->lock, *flags);
+                       goto retry;
+               }
+       }
+       rcu_read_unlock();
+       return ctx;
+}
+
+/*
+ * Get the context for a task and increment its pin_count so it
+ * can't get swapped to another task.  This also increments its
+ * reference count so that the context can't get freed.
+ */
+static struct perf_counter_context *perf_pin_task_context(struct task_struct *task)
+{
+       struct perf_counter_context *ctx;
+       unsigned long flags;
+
+       ctx = perf_lock_task_context(task, &flags);
+       if (ctx) {
+               ++ctx->pin_count;
+               get_ctx(ctx);
+               spin_unlock_irqrestore(&ctx->lock, flags);
+       }
+       return ctx;
+}
+
+static void perf_unpin_context(struct perf_counter_context *ctx)
+{
+       unsigned long flags;
+
+       spin_lock_irqsave(&ctx->lock, flags);
+       --ctx->pin_count;
+       spin_unlock_irqrestore(&ctx->lock, flags);
+       put_ctx(ctx);
+}
+
+/*
+ * Add a counter from the lists for its context.
+ * Must be called with ctx->mutex and ctx->lock held.
+ */
+static void
+list_add_counter(struct perf_counter *counter, struct perf_counter_context *ctx)
+{
+       struct perf_counter *group_leader = counter->group_leader;
+
+       /*
+        * Depending on whether it is a standalone or sibling counter,
+        * add it straight to the context's counter list, or to the group
+        * leader's sibling list:
+        */
+       if (group_leader == counter)
+               list_add_tail(&counter->list_entry, &ctx->counter_list);
+       else {
+               list_add_tail(&counter->list_entry, &group_leader->sibling_list);
+               group_leader->nr_siblings++;
+       }
+
+       list_add_rcu(&counter->event_entry, &ctx->event_list);
+       ctx->nr_counters++;
+}
+
+/*
+ * Remove a counter from the lists for its context.
+ * Must be called with ctx->mutex and ctx->lock held.
+ */
+static void
+list_del_counter(struct perf_counter *counter, struct perf_counter_context *ctx)
+{
+       struct perf_counter *sibling, *tmp;
+
+       if (list_empty(&counter->list_entry))
+               return;
+       ctx->nr_counters--;
+
+       list_del_init(&counter->list_entry);
+       list_del_rcu(&counter->event_entry);
+
+       if (counter->group_leader != counter)
+               counter->group_leader->nr_siblings--;
+
+       /*
+        * If this was a group counter with sibling counters then
+        * upgrade the siblings to singleton counters by adding them
+        * to the context list directly:
+        */
+       list_for_each_entry_safe(sibling, tmp,
+                                &counter->sibling_list, list_entry) {
+
+               list_move_tail(&sibling->list_entry, &ctx->counter_list);
+               sibling->group_leader = sibling;
+       }
+}
+
+static void
+counter_sched_out(struct perf_counter *counter,
+                 struct perf_cpu_context *cpuctx,
+                 struct perf_counter_context *ctx)
+{
+       if (counter->state != PERF_COUNTER_STATE_ACTIVE)
+               return;
+
+       counter->state = PERF_COUNTER_STATE_INACTIVE;
+       counter->tstamp_stopped = ctx->time;
+       counter->pmu->disable(counter);
+       counter->oncpu = -1;
+
+       if (!is_software_counter(counter))
+               cpuctx->active_oncpu--;
+       ctx->nr_active--;
+       if (counter->attr.exclusive || !cpuctx->active_oncpu)
+               cpuctx->exclusive = 0;
+}
+
+static void
+group_sched_out(struct perf_counter *group_counter,
+               struct perf_cpu_context *cpuctx,
+               struct perf_counter_context *ctx)
+{
+       struct perf_counter *counter;
+
+       if (group_counter->state != PERF_COUNTER_STATE_ACTIVE)
+               return;
+
+       counter_sched_out(group_counter, cpuctx, ctx);
+
+       /*
+        * Schedule out siblings (if any):
+        */
+       list_for_each_entry(counter, &group_counter->sibling_list, list_entry)
+               counter_sched_out(counter, cpuctx, ctx);
+
+       if (group_counter->attr.exclusive)
+               cpuctx->exclusive = 0;
+}
+
+/*
+ * Cross CPU call to remove a performance counter
+ *
+ * We disable the counter on the hardware level first. After that we
+ * remove it from the context list.
+ */
+static void __perf_counter_remove_from_context(void *info)
+{
+       struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
+       struct perf_counter *counter = info;
+       struct perf_counter_context *ctx = counter->ctx;
+
+       /*
+        * If this is a task context, we need to check whether it is
+        * the current task context of this cpu. If not it has been
+        * scheduled out before the smp call arrived.
+        */
+       if (ctx->task && cpuctx->task_ctx != ctx)
+               return;
+
+       spin_lock(&ctx->lock);
+       /*
+        * Protect the list operation against NMI by disabling the
+        * counters on a global level.
+        */
+       perf_disable();
+
+       counter_sched_out(counter, cpuctx, ctx);
+
+       list_del_counter(counter, ctx);
+
+       if (!ctx->task) {
+               /*
+                * Allow more per task counters with respect to the
+                * reservation:
+                */
+               cpuctx->max_pertask =
+                       min(perf_max_counters - ctx->nr_counters,
+                           perf_max_counters - perf_reserved_percpu);
+       }
+
+       perf_enable();
+       spin_unlock(&ctx->lock);
+}
+
+
+/*
+ * Remove the counter from a task's (or a CPU's) list of counters.
+ *
+ * Must be called with ctx->mutex held.
+ *
+ * CPU counters are removed with a smp call. For task counters we only
+ * call when the task is on a CPU.
+ *
+ * If counter->ctx is a cloned context, callers must make sure that
+ * every task struct that counter->ctx->task could possibly point to
+ * remains valid.  This is OK when called from perf_release since
+ * that only calls us on the top-level context, which can't be a clone.
+ * When called from perf_counter_exit_task, it's OK because the
+ * context has been detached from its task.
+ */
+static void perf_counter_remove_from_context(struct perf_counter *counter)
+{
+       struct perf_counter_context *ctx = counter->ctx;
+       struct task_struct *task = ctx->task;
+
+       if (!task) {
+               /*
+                * Per cpu counters are removed via an smp call and
+                * the removal is always sucessful.
+                */
+               smp_call_function_single(counter->cpu,
+                                        __perf_counter_remove_from_context,
+                                        counter, 1);
+               return;
+       }
+
+retry:
+       task_oncpu_function_call(task, __perf_counter_remove_from_context,
+                                counter);
+
+       spin_lock_irq(&ctx->lock);
+       /*
+        * If the context is active we need to retry the smp call.
+        */
+       if (ctx->nr_active && !list_empty(&counter->list_entry)) {
+               spin_unlock_irq(&ctx->lock);
+               goto retry;
+       }
+
+       /*
+        * The lock prevents that this context is scheduled in so we
+        * can remove the counter safely, if the call above did not
+        * succeed.
+        */
+       if (!list_empty(&counter->list_entry)) {
+               list_del_counter(counter, ctx);
+       }
+       spin_unlock_irq(&ctx->lock);
+}
+
+static inline u64 perf_clock(void)
+{
+       return cpu_clock(smp_processor_id());
+}
+
+/*
+ * Update the record of the current time in a context.
+ */
+static void update_context_time(struct perf_counter_context *ctx)
+{
+       u64 now = perf_clock();
+
+       ctx->time += now - ctx->timestamp;
+       ctx->timestamp = now;
+}
+
+/*
+ * Update the total_time_enabled and total_time_running fields for a counter.
+ */
+static void update_counter_times(struct perf_counter *counter)
+{
+       struct perf_counter_context *ctx = counter->ctx;
+       u64 run_end;
+
+       if (counter->state < PERF_COUNTER_STATE_INACTIVE)
+               return;
+
+       counter->total_time_enabled = ctx->time - counter->tstamp_enabled;
+
+       if (counter->state == PERF_COUNTER_STATE_INACTIVE)
+               run_end = counter->tstamp_stopped;
+       else
+               run_end = ctx->time;
+
+       counter->total_time_running = run_end - counter->tstamp_running;
+}
+
+/*
+ * Update total_time_enabled and total_time_running for all counters in a group.
+ */
+static void update_group_times(struct perf_counter *leader)
+{
+       struct perf_counter *counter;
+
+       update_counter_times(leader);
+       list_for_each_entry(counter, &leader->sibling_list, list_entry)
+               update_counter_times(counter);
+}
+
+/*
+ * Cross CPU call to disable a performance counter
+ */
+static void __perf_counter_disable(void *info)
+{
+       struct perf_counter *counter = info;
+       struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
+       struct perf_counter_context *ctx = counter->ctx;
+
+       /*
+        * If this is a per-task counter, need to check whether this
+        * counter's task is the current task on this cpu.
+        */
+       if (ctx->task && cpuctx->task_ctx != ctx)
+               return;
+
+       spin_lock(&ctx->lock);
+
+       /*
+        * If the counter is on, turn it off.
+        * If it is in error state, leave it in error state.
+        */
+       if (counter->state >= PERF_COUNTER_STATE_INACTIVE) {
+               update_context_time(ctx);
+               update_counter_times(counter);
+               if (counter == counter->group_leader)
+                       group_sched_out(counter, cpuctx, ctx);
+               else
+                       counter_sched_out(counter, cpuctx, ctx);
+               counter->state = PERF_COUNTER_STATE_OFF;
+       }
+
+       spin_unlock(&ctx->lock);
+}
+
+/*
+ * Disable a counter.
+ *
+ * If counter->ctx is a cloned context, callers must make sure that
+ * every task struct that counter->ctx->task could possibly point to
+ * remains valid.  This condition is satisifed when called through
+ * perf_counter_for_each_child or perf_counter_for_each because they
+ * hold the top-level counter's child_mutex, so any descendant that
+ * goes to exit will block in sync_child_counter.
+ * When called from perf_pending_counter it's OK because counter->ctx
+ * is the current context on this CPU and preemption is disabled,
+ * hence we can't get into perf_counter_task_sched_out for this context.
+ */
+static void perf_counter_disable(struct perf_counter *counter)
+{
+       struct perf_counter_context *ctx = counter->ctx;
+       struct task_struct *task = ctx->task;
+
+       if (!task) {
+               /*
+                * Disable the counter on the cpu that it's on
+                */
+               smp_call_function_single(counter->cpu, __perf_counter_disable,
+                                        counter, 1);
+               return;
+       }
+
+ retry:
+       task_oncpu_function_call(task, __perf_counter_disable, counter);
+
+       spin_lock_irq(&ctx->lock);
+       /*
+        * If the counter is still active, we need to retry the cross-call.
+        */
+       if (counter->state == PERF_COUNTER_STATE_ACTIVE) {
+               spin_unlock_irq(&ctx->lock);
+               goto retry;
+       }
+
+       /*
+        * Since we have the lock this context can't be scheduled
+        * in, so we can change the state safely.
+        */
+       if (counter->state == PERF_COUNTER_STATE_INACTIVE) {
+               update_counter_times(counter);
+               counter->state = PERF_COUNTER_STATE_OFF;
+       }
+
+       spin_unlock_irq(&ctx->lock);
+}
+
+static int
+counter_sched_in(struct perf_counter *counter,
+                struct perf_cpu_context *cpuctx,
+                struct perf_counter_context *ctx,
+                int cpu)
+{
+       if (counter->state <= PERF_COUNTER_STATE_OFF)
+               return 0;
+
+       counter->state = PERF_COUNTER_STATE_ACTIVE;
+       counter->oncpu = cpu;   /* TODO: put 'cpu' into cpuctx->cpu */
+       /*
+        * The new state must be visible before we turn it on in the hardware:
+        */
+       smp_wmb();
+
+       if (counter->pmu->enable(counter)) {
+               counter->state = PERF_COUNTER_STATE_INACTIVE;
+               counter->oncpu = -1;
+               return -EAGAIN;
+       }
+
+       counter->tstamp_running += ctx->time - counter->tstamp_stopped;
+
+       if (!is_software_counter(counter))
+               cpuctx->active_oncpu++;
+       ctx->nr_active++;
+
+       if (counter->attr.exclusive)
+               cpuctx->exclusive = 1;
+
+       return 0;
+}
+
+static int
+group_sched_in(struct perf_counter *group_counter,
+              struct perf_cpu_context *cpuctx,
+              struct perf_counter_context *ctx,
+              int cpu)
+{
+       struct perf_counter *counter, *partial_group;
+       int ret;
+
+       if (group_counter->state == PERF_COUNTER_STATE_OFF)
+               return 0;
+
+       ret = hw_perf_group_sched_in(group_counter, cpuctx, ctx, cpu);
+       if (ret)
+               return ret < 0 ? ret : 0;
+
+       if (counter_sched_in(group_counter, cpuctx, ctx, cpu))
+               return -EAGAIN;
+
+       /*
+        * Schedule in siblings as one group (if any):
+        */
+       list_for_each_entry(counter, &group_counter->sibling_list, list_entry) {
+               if (counter_sched_in(counter, cpuctx, ctx, cpu)) {
+                       partial_group = counter;
+                       goto group_error;
+               }
+       }
+
+       return 0;
+
+group_error:
+       /*
+        * Groups can be scheduled in as one unit only, so undo any
+        * partial group before returning:
+        */
+       list_for_each_entry(counter, &group_counter->sibling_list, list_entry) {
+               if (counter == partial_group)
+                       break;
+               counter_sched_out(counter, cpuctx, ctx);
+       }
+       counter_sched_out(group_counter, cpuctx, ctx);
+
+       return -EAGAIN;
+}
+
+/*
+ * Return 1 for a group consisting entirely of software counters,
+ * 0 if the group contains any hardware counters.
+ */
+static int is_software_only_group(struct perf_counter *leader)
+{
+       struct perf_counter *counter;
+
+       if (!is_software_counter(leader))
+               return 0;
+
+       list_for_each_entry(counter, &leader->sibling_list, list_entry)
+               if (!is_software_counter(counter))
+                       return 0;
+
+       return 1;
+}
+
+/*
+ * Work out whether we can put this counter group on the CPU now.
+ */
+static int group_can_go_on(struct perf_counter *counter,
+                          struct perf_cpu_context *cpuctx,
+                          int can_add_hw)
+{
+       /*
+        * Groups consisting entirely of software counters can always go on.
+        */
+       if (is_software_only_group(counter))
+               return 1;
+       /*
+        * If an exclusive group is already on, no other hardware
+        * counters can go on.
+        */
+       if (cpuctx->exclusive)
+               return 0;
+       /*
+        * If this group is exclusive and there are already
+        * counters on the CPU, it can't go on.
+        */
+       if (counter->attr.exclusive && cpuctx->active_oncpu)
+               return 0;
+       /*
+        * Otherwise, try to add it if all previous groups were able
+        * to go on.
+        */
+       return can_add_hw;
+}
+
+static void add_counter_to_ctx(struct perf_counter *counter,
+                              struct perf_counter_context *ctx)
+{
+       list_add_counter(counter, ctx);
+       counter->tstamp_enabled = ctx->time;
+       counter->tstamp_running = ctx->time;
+       counter->tstamp_stopped = ctx->time;
+}
+
+/*
+ * Cross CPU call to install and enable a performance counter
+ *
+ * Must be called with ctx->mutex held
+ */
+static void __perf_install_in_context(void *info)
+{
+       struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
+       struct perf_counter *counter = info;
+       struct perf_counter_context *ctx = counter->ctx;
+       struct perf_counter *leader = counter->group_leader;
+       int cpu = smp_processor_id();
+       int err;
+
+       /*
+        * If this is a task context, we need to check whether it is
+        * the current task context of this cpu. If not it has been
+        * scheduled out before the smp call arrived.
+        * Or possibly this is the right context but it isn't
+        * on this cpu because it had no counters.
+        */
+       if (ctx->task && cpuctx->task_ctx != ctx) {
+               if (cpuctx->task_ctx || ctx->task != current)
+                       return;
+               cpuctx->task_ctx = ctx;
+       }
+
+       spin_lock(&ctx->lock);
+       ctx->is_active = 1;
+       update_context_time(ctx);
+
+       /*
+        * Protect the list operation against NMI by disabling the
+        * counters on a global level. NOP for non NMI based counters.
+        */
+       perf_disable();
+
+       add_counter_to_ctx(counter, ctx);
+
+       /*
+        * Don't put the counter on if it is disabled or if
+        * it is in a group and the group isn't on.
+        */
+       if (counter->state != PERF_COUNTER_STATE_INACTIVE ||
+           (leader != counter && leader->state != PERF_COUNTER_STATE_ACTIVE))
+               goto unlock;
+
+       /*
+        * An exclusive counter can't go on if there are already active
+        * hardware counters, and no hardware counter can go on if there
+        * is already an exclusive counter on.
+        */
+       if (!group_can_go_on(counter, cpuctx, 1))
+               err = -EEXIST;
+       else
+               err = counter_sched_in(counter, cpuctx, ctx, cpu);
+
+       if (err) {
+               /*
+                * This counter couldn't go on.  If it is in a group
+                * then we have to pull the whole group off.
+                * If the counter group is pinned then put it in error state.
+                */
+               if (leader != counter)
+                       group_sched_out(leader, cpuctx, ctx);
+               if (leader->attr.pinned) {
+                       update_group_times(leader);
+                       leader->state = PERF_COUNTER_STATE_ERROR;
+               }
+       }
+
+       if (!err && !ctx->task && cpuctx->max_pertask)
+               cpuctx->max_pertask--;
+
+ unlock:
+       perf_enable();
+
+       spin_unlock(&ctx->lock);
+}
+
+/*
+ * Attach a performance counter to a context
+ *
+ * First we add the counter to the list with the hardware enable bit
+ * in counter->hw_config cleared.
+ *
+ * If the counter is attached to a task which is on a CPU we use a smp
+ * call to enable it in the task context. The task might have been
+ * scheduled away, but we check this in the smp call again.
+ *
+ * Must be called with ctx->mutex held.
+ */
+static void
+perf_install_in_context(struct perf_counter_context *ctx,
+                       struct perf_counter *counter,
+                       int cpu)
+{
+       struct task_struct *task = ctx->task;
+
+       if (!task) {
+               /*
+                * Per cpu counters are installed via an smp call and
+                * the install is always sucessful.
+                */
+               smp_call_function_single(cpu, __perf_install_in_context,
+                                        counter, 1);
+               return;
+       }
+
+retry:
+       task_oncpu_function_call(task, __perf_install_in_context,
+                                counter);
+
+       spin_lock_irq(&ctx->lock);
+       /*
+        * we need to retry the smp call.
+        */
+       if (ctx->is_active && list_empty(&counter->list_entry)) {
+               spin_unlock_irq(&ctx->lock);
+               goto retry;
+       }
+
+       /*
+        * The lock prevents that this context is scheduled in so we
+        * can add the counter safely, if it the call above did not
+        * succeed.
+        */
+       if (list_empty(&counter->list_entry))
+               add_counter_to_ctx(counter, ctx);
+       spin_unlock_irq(&ctx->lock);
+}
+
+/*
+ * Cross CPU call to enable a performance counter
+ */
+static void __perf_counter_enable(void *info)
+{
+       struct perf_counter *counter = info;
+       struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
+       struct perf_counter_context *ctx = counter->ctx;
+       struct perf_counter *leader = counter->group_leader;
+       int err;
+
+       /*
+        * If this is a per-task counter, need to check whether this
+        * counter's task is the current task on this cpu.
+        */
+       if (ctx->task && cpuctx->task_ctx != ctx) {
+               if (cpuctx->task_ctx || ctx->task != current)
+                       return;
+               cpuctx->task_ctx = ctx;
+       }
+
+       spin_lock(&ctx->lock);
+       ctx->is_active = 1;
+       update_context_time(ctx);
+
+       if (counter->state >= PERF_COUNTER_STATE_INACTIVE)
+               goto unlock;
+       counter->state = PERF_COUNTER_STATE_INACTIVE;
+       counter->tstamp_enabled = ctx->time - counter->total_time_enabled;
+
+       /*
+        * If the counter is in a group and isn't the group leader,
+        * then don't put it on unless the group is on.
+        */
+       if (leader != counter && leader->state != PERF_COUNTER_STATE_ACTIVE)
+               goto unlock;
+
+       if (!group_can_go_on(counter, cpuctx, 1)) {
+               err = -EEXIST;
+       } else {
+               perf_disable();
+               if (counter == leader)
+                       err = group_sched_in(counter, cpuctx, ctx,
+                                            smp_processor_id());
+               else
+                       err = counter_sched_in(counter, cpuctx, ctx,
+                                              smp_processor_id());
+               perf_enable();
+       }
+
+       if (err) {
+               /*
+                * If this counter can't go on and it's part of a
+                * group, then the whole group has to come off.
+                */
+               if (leader != counter)
+                       group_sched_out(leader, cpuctx, ctx);
+               if (leader->attr.pinned) {
+                       update_group_times(leader);
+                       leader->state = PERF_COUNTER_STATE_ERROR;
+               }
+       }
+
+ unlock:
+       spin_unlock(&ctx->lock);
+}
+
+/*
+ * Enable a counter.
+ *
+ * If counter->ctx is a cloned context, callers must make sure that
+ * every task struct that counter->ctx->task could possibly point to
+ * remains valid.  This condition is satisfied when called through
+ * perf_counter_for_each_child or perf_counter_for_each as described
+ * for perf_counter_disable.
+ */
+static void perf_counter_enable(struct perf_counter *counter)
+{
+       struct perf_counter_context *ctx = counter->ctx;
+       struct task_struct *task = ctx->task;
+
+       if (!task) {
+               /*
+                * Enable the counter on the cpu that it's on
+                */
+               smp_call_function_single(counter->cpu, __perf_counter_enable,
+                                        counter, 1);
+               return;
+       }
+
+       spin_lock_irq(&ctx->lock);
+       if (counter->state >= PERF_COUNTER_STATE_INACTIVE)
+               goto out;
+
+       /*
+        * If the counter is in error state, clear that first.
+        * That way, if we see the counter in error state below, we
+        * know that it has gone back into error state, as distinct
+        * from the task having been scheduled away before the
+        * cross-call arrived.
+        */
+       if (counter->state == PERF_COUNTER_STATE_ERROR)
+               counter->state = PERF_COUNTER_STATE_OFF;
+
+ retry:
+       spin_unlock_irq(&ctx->lock);
+       task_oncpu_function_call(task, __perf_counter_enable, counter);
+
+       spin_lock_irq(&ctx->lock);
+
+       /*
+        * If the context is active and the counter is still off,
+        * we need to retry the cross-call.
+        */
+       if (ctx->is_active && counter->state == PERF_COUNTER_STATE_OFF)
+               goto retry;
+
+       /*
+        * Since we have the lock this context can't be scheduled
+        * in, so we can change the state safely.
+        */
+       if (counter->state == PERF_COUNTER_STATE_OFF) {
+               counter->state = PERF_COUNTER_STATE_INACTIVE;
+               counter->tstamp_enabled =
+                       ctx->time - counter->total_time_enabled;
+       }
+ out:
+       spin_unlock_irq(&ctx->lock);
+}
+
+static int perf_counter_refresh(struct perf_counter *counter, int refresh)
+{
+       /*
+        * not supported on inherited counters
+        */
+       if (counter->attr.inherit)
+               return -EINVAL;
+
+       atomic_add(refresh, &counter->event_limit);
+       perf_counter_enable(counter);
+
+       return 0;
+}
+
+void __perf_counter_sched_out(struct perf_counter_context *ctx,
+                             struct perf_cpu_context *cpuctx)
+{
+       struct perf_counter *counter;
+
+       spin_lock(&ctx->lock);
+       ctx->is_active = 0;
+       if (likely(!ctx->nr_counters))
+               goto out;
+       update_context_time(ctx);
+
+       perf_disable();
+       if (ctx->nr_active) {
+               list_for_each_entry(counter, &ctx->counter_list, list_entry) {
+                       if (counter != counter->group_leader)
+                               counter_sched_out(counter, cpuctx, ctx);
+                       else
+                               group_sched_out(counter, cpuctx, ctx);
+               }
+       }
+       perf_enable();
+ out:
+       spin_unlock(&ctx->lock);
+}
+
+/*
+ * Test whether two contexts are equivalent, i.e. whether they
+ * have both been cloned from the same version of the same context
+ * and they both have the same number of enabled counters.
+ * If the number of enabled counters is the same, then the set
+ * of enabled counters should be the same, because these are both
+ * inherited contexts, therefore we can't access individual counters
+ * in them directly with an fd; we can only enable/disable all
+ * counters via prctl, or enable/disable all counters in a family
+ * via ioctl, which will have the same effect on both contexts.
+ */
+static int context_equiv(struct perf_counter_context *ctx1,
+                        struct perf_counter_context *ctx2)
+{
+       return ctx1->parent_ctx && ctx1->parent_ctx == ctx2->parent_ctx
+               && ctx1->parent_gen == ctx2->parent_gen
+               && !ctx1->pin_count && !ctx2->pin_count;
+}
+
+/*
+ * Called from scheduler to remove the counters of the current task,
+ * with interrupts disabled.
+ *
+ * We stop each counter and update the counter value in counter->count.
+ *
+ * This does not protect us against NMI, but disable()
+ * sets the disabled bit in the control field of counter _before_
+ * accessing the counter control register. If a NMI hits, then it will
+ * not restart the counter.
+ */
+void perf_counter_task_sched_out(struct task_struct *task,
+                                struct task_struct *next, int cpu)
+{
+       struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu);
+       struct perf_counter_context *ctx = task->perf_counter_ctxp;
+       struct perf_counter_context *next_ctx;
+       struct perf_counter_context *parent;
+       struct pt_regs *regs;
+       int do_switch = 1;
+
+       regs = task_pt_regs(task);
+       perf_swcounter_event(PERF_COUNT_SW_CONTEXT_SWITCHES, 1, 1, regs, 0);
+
+       if (likely(!ctx || !cpuctx->task_ctx))
+               return;
+
+       update_context_time(ctx);
+
+       rcu_read_lock();
+       parent = rcu_dereference(ctx->parent_ctx);
+       next_ctx = next->perf_counter_ctxp;
+       if (parent && next_ctx &&
+           rcu_dereference(next_ctx->parent_ctx) == parent) {
+               /*
+                * Looks like the two contexts are clones, so we might be
+                * able to optimize the context switch.  We lock both
+                * contexts and check that they are clones under the
+                * lock (including re-checking that neither has been
+                * uncloned in the meantime).  It doesn't matter which
+                * order we take the locks because no other cpu could
+                * be trying to lock both of these tasks.
+                */
+               spin_lock(&ctx->lock);
+               spin_lock_nested(&next_ctx->lock, SINGLE_DEPTH_NESTING);
+               if (context_equiv(ctx, next_ctx)) {
+                       /*
+                        * XXX do we need a memory barrier of sorts
+                        * wrt to rcu_dereference() of perf_counter_ctxp
+                        */
+                       task->perf_counter_ctxp = next_ctx;
+                       next->perf_counter_ctxp = ctx;
+                       ctx->task = next;
+                       next_ctx->task = task;
+                       do_switch = 0;
+               }
+               spin_unlock(&next_ctx->lock);
+               spin_unlock(&ctx->lock);
+       }
+       rcu_read_unlock();
+
+       if (do_switch) {
+               __perf_counter_sched_out(ctx, cpuctx);
+               cpuctx->task_ctx = NULL;
+       }
+}
+
+/*
+ * Called with IRQs disabled
+ */
+static void __perf_counter_task_sched_out(struct perf_counter_context *ctx)
+{
+       struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
+
+       if (!cpuctx->task_ctx)
+               return;
+
+       if (WARN_ON_ONCE(ctx != cpuctx->task_ctx))
+               return;
+
+       __perf_counter_sched_out(ctx, cpuctx);
+       cpuctx->task_ctx = NULL;
+}
+
+/*
+ * Called with IRQs disabled
+ */
+static void perf_counter_cpu_sched_out(struct perf_cpu_context *cpuctx)
+{
+       __perf_counter_sched_out(&cpuctx->ctx, cpuctx);
+}
+
+static void
+__perf_counter_sched_in(struct perf_counter_context *ctx,
+                       struct perf_cpu_context *cpuctx, int cpu)
+{
+       struct perf_counter *counter;
+       int can_add_hw = 1;
+
+       spin_lock(&ctx->lock);
+       ctx->is_active = 1;
+       if (likely(!ctx->nr_counters))
+               goto out;
+
+       ctx->timestamp = perf_clock();
+
+       perf_disable();
+
+       /*
+        * First go through the list and put on any pinned groups
+        * in order to give them the best chance of going on.
+        */
+       list_for_each_entry(counter, &ctx->counter_list, list_entry) {
+               if (counter->state <= PERF_COUNTER_STATE_OFF ||
+                   !counter->attr.pinned)
+                       continue;
+               if (counter->cpu != -1 && counter->cpu != cpu)
+                       continue;
+
+               if (counter != counter->group_leader)
+                       counter_sched_in(counter, cpuctx, ctx, cpu);
+               else {
+                       if (group_can_go_on(counter, cpuctx, 1))
+                               group_sched_in(counter, cpuctx, ctx, cpu);
+               }
+
+               /*
+                * If this pinned group hasn't been scheduled,
+                * put it in error state.
+                */
+               if (counter->state == PERF_COUNTER_STATE_INACTIVE) {
+                       update_group_times(counter);
+                       counter->state = PERF_COUNTER_STATE_ERROR;
+               }
+       }
+
+       list_for_each_entry(counter, &ctx->counter_list, list_entry) {
+               /*
+                * Ignore counters in OFF or ERROR state, and
+                * ignore pinned counters since we did them already.
+                */
+               if (counter->state <= PERF_COUNTER_STATE_OFF ||
+                   counter->attr.pinned)
+                       continue;
+
+               /*
+                * Listen to the 'cpu' scheduling filter constraint
+                * of counters:
+                */
+               if (counter->cpu != -1 && counter->cpu != cpu)
+                       continue;
+
+               if (counter != counter->group_leader) {
+                       if (counter_sched_in(counter, cpuctx, ctx, cpu))
+                               can_add_hw = 0;
+               } else {
+                       if (group_can_go_on(counter, cpuctx, can_add_hw)) {
+                               if (group_sched_in(counter, cpuctx, ctx, cpu))
+                                       can_add_hw = 0;
+                       }
+               }
+       }
+       perf_enable();
+ out:
+       spin_unlock(&ctx->lock);
+}
+
+/*
+ * Called from scheduler to add the counters of the current task
+ * with interrupts disabled.
+ *
+ * We restore the counter value and then enable it.
+ *
+ * This does not protect us against NMI, but enable()
+ * sets the enabled bit in the control field of counter _before_
+ * accessing the counter control register. If a NMI hits, then it will
+ * keep the counter running.
+ */
+void perf_counter_task_sched_in(struct task_struct *task, int cpu)
+{
+       struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu);
+       struct perf_counter_context *ctx = task->perf_counter_ctxp;
+
+       if (likely(!ctx))
+               return;
+       if (cpuctx->task_ctx == ctx)
+               return;
+       __perf_counter_sched_in(ctx, cpuctx, cpu);
+       cpuctx->task_ctx = ctx;
+}
+
+static void perf_counter_cpu_sched_in(struct perf_cpu_context *cpuctx, int cpu)
+{
+       struct perf_counter_context *ctx = &cpuctx->ctx;
+
+       __perf_counter_sched_in(ctx, cpuctx, cpu);
+}
+
+#define MAX_INTERRUPTS (~0ULL)
+
+static void perf_log_throttle(struct perf_counter *counter, int enable);
+static void perf_log_period(struct perf_counter *counter, u64 period);
+
+static void perf_adjust_period(struct perf_counter *counter, u64 events)
+{
+       struct hw_perf_counter *hwc = &counter->hw;
+       u64 period, sample_period;
+       s64 delta;
+
+       events *= hwc->sample_period;
+       period = div64_u64(events, counter->attr.sample_freq);
+
+       delta = (s64)(period - hwc->sample_period);
+       delta = (delta + 7) / 8; /* low pass filter */
+
+       sample_period = hwc->sample_period + delta;
+
+       if (!sample_period)
+               sample_period = 1;
+
+       perf_log_period(counter, sample_period);
+
+       hwc->sample_period = sample_period;
+}
+
+static void perf_ctx_adjust_freq(struct perf_counter_context *ctx)
+{
+       struct perf_counter *counter;
+       struct hw_perf_counter *hwc;
+       u64 interrupts, freq;
+
+       spin_lock(&ctx->lock);
+       list_for_each_entry(counter, &ctx->counter_list, list_entry) {
+               if (counter->state != PERF_COUNTER_STATE_ACTIVE)
+                       continue;
+
+               hwc = &counter->hw;
+
+               interrupts = hwc->interrupts;
+               hwc->interrupts = 0;
+
+               /*
+                * unthrottle counters on the tick
+                */
+               if (interrupts == MAX_INTERRUPTS) {
+                       perf_log_throttle(counter, 1);
+                       counter->pmu->unthrottle(counter);
+                       interrupts = 2*sysctl_perf_counter_sample_rate/HZ;
+               }
+
+               if (!counter->attr.freq || !counter->attr.sample_freq)
+                       continue;
+
+               /*
+                * if the specified freq < HZ then we need to skip ticks
+                */
+               if (counter->attr.sample_freq < HZ) {
+                       freq = counter->attr.sample_freq;
+
+                       hwc->freq_count += freq;
+                       hwc->freq_interrupts += interrupts;
+
+                       if (hwc->freq_count < HZ)
+                               continue;
+
+                       interrupts = hwc->freq_interrupts;
+                       hwc->freq_interrupts = 0;
+                       hwc->freq_count -= HZ;
+               } else
+                       freq = HZ;
+
+               perf_adjust_period(counter, freq * interrupts);
+
+               /*
+                * In order to avoid being stalled by an (accidental) huge
+                * sample period, force reset the sample period if we didn't
+                * get any events in this freq period.
+                */
+               if (!interrupts) {
+                       perf_disable();
+                       counter->pmu->disable(counter);
+                       atomic_set(&hwc->period_left, 0);
+                       counter->pmu->enable(counter);
+                       perf_enable();
+               }
+       }
+       spin_unlock(&ctx->lock);
+}
+
+/*
+ * Round-robin a context's counters:
+ */
+static void rotate_ctx(struct perf_counter_context *ctx)
+{
+       struct perf_counter *counter;
+
+       if (!ctx->nr_counters)
+               return;
+
+       spin_lock(&ctx->lock);
+       /*
+        * Rotate the first entry last (works just fine for group counters too):
+        */
+       perf_disable();
+       list_for_each_entry(counter, &ctx->counter_list, list_entry) {
+               list_move_tail(&counter->list_entry, &ctx->counter_list);
+               break;
+       }
+       perf_enable();
+
+       spin_unlock(&ctx->lock);
+}
+
+void perf_counter_task_tick(struct task_struct *curr, int cpu)
+{
+       struct perf_cpu_context *cpuctx;
+       struct perf_counter_context *ctx;
+
+       if (!atomic_read(&nr_counters))
+               return;
+
+       cpuctx = &per_cpu(perf_cpu_context, cpu);
+       ctx = curr->perf_counter_ctxp;
+
+       perf_ctx_adjust_freq(&cpuctx->ctx);
+       if (ctx)
+               perf_ctx_adjust_freq(ctx);
+
+       perf_counter_cpu_sched_out(cpuctx);
+       if (ctx)
+               __perf_counter_task_sched_out(ctx);
+
+       rotate_ctx(&cpuctx->ctx);
+       if (ctx)
+               rotate_ctx(ctx);
+
+       perf_counter_cpu_sched_in(cpuctx, cpu);
+       if (ctx)
+               perf_counter_task_sched_in(curr, cpu);
+}
+
+/*
+ * Cross CPU call to read the hardware counter
+ */
+static void __read(void *info)
+{
+       struct perf_counter *counter = info;
+       struct perf_counter_context *ctx = counter->ctx;
+       unsigned long flags;
+
+       local_irq_save(flags);
+       if (ctx->is_active)
+               update_context_time(ctx);
+       counter->pmu->read(counter);
+       update_counter_times(counter);
+       local_irq_restore(flags);
+}
+
+static u64 perf_counter_read(struct perf_counter *counter)
+{
+       /*
+        * If counter is enabled and currently active on a CPU, update the
+        * value in the counter structure:
+        */
+       if (counter->state == PERF_COUNTER_STATE_ACTIVE) {
+               smp_call_function_single(counter->oncpu,
+                                        __read, counter, 1);
+       } else if (counter->state == PERF_COUNTER_STATE_INACTIVE) {
+               update_counter_times(counter);
+       }
+
+       return atomic64_read(&counter->count);
+}
+
+/*
+ * Initialize the perf_counter context in a task_struct:
+ */
+static void
+__perf_counter_init_context(struct perf_counter_context *ctx,
+                           struct task_struct *task)
+{
+       memset(ctx, 0, sizeof(*ctx));
+       spin_lock_init(&ctx->lock);
+       mutex_init(&ctx->mutex);
+       INIT_LIST_HEAD(&ctx->counter_list);
+       INIT_LIST_HEAD(&ctx->event_list);
+       atomic_set(&ctx->refcount, 1);
+       ctx->task = task;
+}
+
+static struct perf_counter_context *find_get_context(pid_t pid, int cpu)
+{
+       struct perf_counter_context *parent_ctx;
+       struct perf_counter_context *ctx;
+       struct perf_cpu_context *cpuctx;
+       struct task_struct *task;
+       unsigned long flags;
+       int err;
+
+       /*
+        * If cpu is not a wildcard then this is a percpu counter:
+        */
+       if (cpu != -1) {
+               /* Must be root to operate on a CPU counter: */
+               if (perf_paranoid_cpu() && !capable(CAP_SYS_ADMIN))
+                       return ERR_PTR(-EACCES);
+
+               if (cpu < 0 || cpu > num_possible_cpus())
+                       return ERR_PTR(-EINVAL);
+
+               /*
+                * We could be clever and allow to attach a counter to an
+                * offline CPU and activate it when the CPU comes up, but
+                * that's for later.
+                */
+               if (!cpu_isset(cpu, cpu_online_map))
+                       return ERR_PTR(-ENODEV);
+
+               cpuctx = &per_cpu(perf_cpu_context, cpu);
+               ctx = &cpuctx->ctx;
+               get_ctx(ctx);
+
+               return ctx;
+       }
+
+       rcu_read_lock();
+       if (!pid)
+               task = current;
+       else
+               task = find_task_by_vpid(pid);
+       if (task)
+               get_task_struct(task);
+       rcu_read_unlock();
+
+       if (!task)
+               return ERR_PTR(-ESRCH);
+
+       /*
+        * Can't attach counters to a dying task.
+        */
+       err = -ESRCH;
+       if (task->flags & PF_EXITING)
+               goto errout;
+
+       /* Reuse ptrace permission checks for now. */
+       err = -EACCES;
+       if (!ptrace_may_access(task, PTRACE_MODE_READ))
+               goto errout;
+
+ retry:
+       ctx = perf_lock_task_context(task, &flags);
+       if (ctx) {
+               parent_ctx = ctx->parent_ctx;
+               if (parent_ctx) {
+                       put_ctx(parent_ctx);
+                       ctx->parent_ctx = NULL;         /* no longer a clone */
+               }
+               /*
+                * Get an extra reference before dropping the lock so that
+                * this context won't get freed if the task exits.
+                */
+               get_ctx(ctx);
+               spin_unlock_irqrestore(&ctx->lock, flags);
+       }
+
+       if (!ctx) {
+               ctx = kmalloc(sizeof(struct perf_counter_context), GFP_KERNEL);
+               err = -ENOMEM;
+               if (!ctx)
+                       goto errout;
+               __perf_counter_init_context(ctx, task);
+               get_ctx(ctx);
+               if (cmpxchg(&task->perf_counter_ctxp, NULL, ctx)) {
+                       /*
+                        * We raced with some other task; use
+                        * the context they set.
+                        */
+                       kfree(ctx);
+                       goto retry;
+               }
+               get_task_struct(task);
+       }
+
+       put_task_struct(task);
+       return ctx;
+
+ errout:
+       put_task_struct(task);
+       return ERR_PTR(err);
+}
+
+static void free_counter_rcu(struct rcu_head *head)
+{
+       struct perf_counter *counter;
+
+       counter = container_of(head, struct perf_counter, rcu_head);
+       if (counter->ns)
+               put_pid_ns(counter->ns);
+       kfree(counter);
+}
+
+static void perf_pending_sync(struct perf_counter *counter);
+
+static void free_counter(struct perf_counter *counter)
+{
+       perf_pending_sync(counter);
+
+       atomic_dec(&nr_counters);
+       if (counter->attr.mmap)
+               atomic_dec(&nr_mmap_counters);
+       if (counter->attr.comm)
+               atomic_dec(&nr_comm_counters);
+
+       if (counter->destroy)
+               counter->destroy(counter);
+
+       put_ctx(counter->ctx);
+       call_rcu(&counter->rcu_head, free_counter_rcu);
+}
+
+/*
+ * Called when the last reference to the file is gone.
+ */
+static int perf_release(struct inode *inode, struct file *file)
+{
+       struct perf_counter *counter = file->private_data;
+       struct perf_counter_context *ctx = counter->ctx;
+
+       file->private_data = NULL;
+
+       WARN_ON_ONCE(ctx->parent_ctx);
+       mutex_lock(&ctx->mutex);
+       perf_counter_remove_from_context(counter);
+       mutex_unlock(&ctx->mutex);
+
+       mutex_lock(&counter->owner->perf_counter_mutex);
+       list_del_init(&counter->owner_entry);
+       mutex_unlock(&counter->owner->perf_counter_mutex);
+       put_task_struct(counter->owner);
+
+       free_counter(counter);
+
+       return 0;
+}
+
+/*
+ * Read the performance counter - simple non blocking version for now
+ */
+static ssize_t
+perf_read_hw(struct perf_counter *counter, char __user *buf, size_t count)
+{
+       u64 values[3];
+       int n;
+
+       /*
+        * Return end-of-file for a read on a counter that is in
+        * error state (i.e. because it was pinned but it couldn't be
+        * scheduled on to the CPU at some point).
+        */
+       if (counter->state == PERF_COUNTER_STATE_ERROR)
+               return 0;
+
+       WARN_ON_ONCE(counter->ctx->parent_ctx);
+       mutex_lock(&counter->child_mutex);
+       values[0] = perf_counter_read(counter);
+       n = 1;
+       if (counter->attr.read_format & PERF_FORMAT_TOTAL_TIME_ENABLED)
+               values[n++] = counter->total_time_enabled +
+                       atomic64_read(&counter->child_total_time_enabled);
+       if (counter->attr.read_format & PERF_FORMAT_TOTAL_TIME_RUNNING)
+               values[n++] = counter->total_time_running +
+                       atomic64_read(&counter->child_total_time_running);
+       if (counter->attr.read_format & PERF_FORMAT_ID)
+               values[n++] = counter->id;
+       mutex_unlock(&counter->child_mutex);
+
+       if (count < n * sizeof(u64))
+               return -EINVAL;
+       count = n * sizeof(u64);
+
+       if (copy_to_user(buf, values, count))
+               return -EFAULT;
+
+       return count;
+}
+
+static ssize_t
+perf_read(struct file *file, char __user *buf, size_t count, loff_t *ppos)
+{
+       struct perf_counter *counter = file->private_data;
+
+       return perf_read_hw(counter, buf, count);
+}
+
+static unsigned int perf_poll(struct file *file, poll_table *wait)
+{
+       struct perf_counter *counter = file->private_data;
+       struct perf_mmap_data *data;
+       unsigned int events = POLL_HUP;
+
+       rcu_read_lock();
+       data = rcu_dereference(counter->data);
+       if (data)
+               events = atomic_xchg(&data->poll, 0);
+       rcu_read_unlock();
+
+       poll_wait(file, &counter->waitq, wait);
+
+       return events;
+}
+
+static void perf_counter_reset(struct perf_counter *counter)
+{
+       (void)perf_counter_read(counter);
+       atomic64_set(&counter->count, 0);
+       perf_counter_update_userpage(counter);
+}
+
+static void perf_counter_for_each_sibling(struct perf_counter *counter,
+                                         void (*func)(struct perf_counter *))
+{
+       struct perf_counter_context *ctx = counter->ctx;
+       struct perf_counter *sibling;
+
+       WARN_ON_ONCE(ctx->parent_ctx);
+       mutex_lock(&ctx->mutex);
+       counter = counter->group_leader;
+
+       func(counter);
+       list_for_each_entry(sibling, &counter->sibling_list, list_entry)
+               func(sibling);
+       mutex_unlock(&ctx->mutex);
+}
+
+/*
+ * Holding the top-level counter's child_mutex means that any
+ * descendant process that has inherited this counter will block
+ * in sync_child_counter if it goes to exit, thus satisfying the
+ * task existence requirements of perf_counter_enable/disable.
+ */
+static void perf_counter_for_each_child(struct perf_counter *counter,
+                                       void (*func)(struct perf_counter *))
+{
+       struct perf_counter *child;
+
+       WARN_ON_ONCE(counter->ctx->parent_ctx);
+       mutex_lock(&counter->child_mutex);
+       func(counter);
+       list_for_each_entry(child, &counter->child_list, child_list)
+               func(child);
+       mutex_unlock(&counter->child_mutex);
+}
+
+static void perf_counter_for_each(struct perf_counter *counter,
+                                 void (*func)(struct perf_counter *))
+{
+       struct perf_counter *child;
+
+       WARN_ON_ONCE(counter->ctx->parent_ctx);
+       mutex_lock(&counter->child_mutex);
+       perf_counter_for_each_sibling(counter, func);
+       list_for_each_entry(child, &counter->child_list, child_list)
+               perf_counter_for_each_sibling(child, func);
+       mutex_unlock(&counter->child_mutex);
+}
+
+static int perf_counter_period(struct perf_counter *counter, u64 __user *arg)
+{
+       struct perf_counter_context *ctx = counter->ctx;
+       unsigned long size;
+       int ret = 0;
+       u64 value;
+
+       if (!counter->attr.sample_period)
+               return -EINVAL;
+
+       size = copy_from_user(&value, arg, sizeof(value));
+       if (size != sizeof(value))
+               return -EFAULT;
+
+       if (!value)
+               return -EINVAL;
+
+       spin_lock_irq(&ctx->lock);
+       if (counter->attr.freq) {
+               if (value > sysctl_perf_counter_sample_rate) {
+                       ret = -EINVAL;
+                       goto unlock;
+               }
+
+               counter->attr.sample_freq = value;
+       } else {
+               perf_log_period(counter, value);
+
+               counter->attr.sample_period = value;
+               counter->hw.sample_period = value;
+       }
+unlock:
+       spin_unlock_irq(&ctx->lock);
+
+       return ret;
+}
+
+static long perf_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
+{
+       struct perf_counter *counter = file->private_data;
+       void (*func)(struct perf_counter *);
+       u32 flags = arg;
+
+       switch (cmd) {
+       case PERF_COUNTER_IOC_ENABLE:
+               func = perf_counter_enable;
+               break;
+       case PERF_COUNTER_IOC_DISABLE:
+               func = perf_counter_disable;
+               break;
+       case PERF_COUNTER_IOC_RESET:
+               func = perf_counter_reset;
+               break;
+
+       case PERF_COUNTER_IOC_REFRESH:
+               return perf_counter_refresh(counter, arg);
+
+       case PERF_COUNTER_IOC_PERIOD:
+               return perf_counter_period(counter, (u64 __user *)arg);
+
+       default:
+               return -ENOTTY;
+       }
+
+       if (flags & PERF_IOC_FLAG_GROUP)
+               perf_counter_for_each(counter, func);
+       else
+               perf_counter_for_each_child(counter, func);
+
+       return 0;
+}
+
+int perf_counter_task_enable(void)
+{
+       struct perf_counter *counter;
+
+       mutex_lock(&current->perf_counter_mutex);
+       list_for_each_entry(counter, &current->perf_counter_list, owner_entry)
+               perf_counter_for_each_child(counter, perf_counter_enable);
+       mutex_unlock(&current->perf_counter_mutex);
+
+       return 0;
+}
+
+int perf_counter_task_disable(void)
+{
+       struct perf_counter *counter;
+
+       mutex_lock(&current->perf_counter_mutex);
+       list_for_each_entry(counter, &current->perf_counter_list, owner_entry)
+               perf_counter_for_each_child(counter, perf_counter_disable);
+       mutex_unlock(&current->perf_counter_mutex);
+
+       return 0;
+}
+
+/*
+ * Callers need to ensure there can be no nesting of this function, otherwise
+ * the seqlock logic goes bad. We can not serialize this because the arch
+ * code calls this from NMI context.
+ */
+void perf_counter_update_userpage(struct perf_counter *counter)
+{
+       struct perf_counter_mmap_page *userpg;
+       struct perf_mmap_data *data;
+
+       rcu_read_lock();
+       data = rcu_dereference(counter->data);
+       if (!data)
+               goto unlock;
+
+       userpg = data->user_page;
+
+       /*
+        * Disable preemption so as to not let the corresponding user-space
+        * spin too long if we get preempted.
+        */
+       preempt_disable();
+       ++userpg->lock;
+       barrier();
+       userpg->index = counter->hw.idx;
+       userpg->offset = atomic64_read(&counter->count);
+       if (counter->state == PERF_COUNTER_STATE_ACTIVE)
+               userpg->offset -= atomic64_read(&counter->hw.prev_count);
+
+       barrier();
+       ++userpg->lock;
+       preempt_enable();
+unlock:
+       rcu_read_unlock();
+}
+
+static int perf_mmap_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
+{
+       struct perf_counter *counter = vma->vm_file->private_data;
+       struct perf_mmap_data *data;
+       int ret = VM_FAULT_SIGBUS;
+
+       rcu_read_lock();
+       data = rcu_dereference(counter->data);
+       if (!data)
+               goto unlock;
+
+       if (vmf->pgoff == 0) {
+               vmf->page = virt_to_page(data->user_page);
+       } else {
+               int nr = vmf->pgoff - 1;
+
+               if ((unsigned)nr > data->nr_pages)
+                       goto unlock;
+
+               vmf->page = virt_to_page(data->data_pages[nr]);
+       }
+       get_page(vmf->page);
+       ret = 0;
+unlock:
+       rcu_read_unlock();
+
+       return ret;
+}
+
+static int perf_mmap_data_alloc(struct perf_counter *counter, int nr_pages)
+{
+       struct perf_mmap_data *data;
+       unsigned long size;
+       int i;
+
+       WARN_ON(atomic_read(&counter->mmap_count));
+
+       size = sizeof(struct perf_mmap_data);
+       size += nr_pages * sizeof(void *);
+
+       data = kzalloc(size, GFP_KERNEL);
+       if (!data)
+               goto fail;
+
+       data->user_page = (void *)get_zeroed_page(GFP_KERNEL);
+       if (!data->user_page)
+               goto fail_user_page;
+
+       for (i = 0; i < nr_pages; i++) {
+               data->data_pages[i] = (void *)get_zeroed_page(GFP_KERNEL);
+               if (!data->data_pages[i])
+                       goto fail_data_pages;
+       }
+
+       data->nr_pages = nr_pages;
+       atomic_set(&data->lock, -1);
+
+       rcu_assign_pointer(counter->data, data);
+
+       return 0;
+
+fail_data_pages:
+       for (i--; i >= 0; i--)
+               free_page((unsigned long)data->data_pages[i]);
+
+       free_page((unsigned long)data->user_page);
+
+fail_user_page:
+       kfree(data);
+
+fail:
+       return -ENOMEM;
+}
+
+static void __perf_mmap_data_free(struct rcu_head *rcu_head)
+{
+       struct perf_mmap_data *data;
+       int i;
+
+       data = container_of(rcu_head, struct perf_mmap_data, rcu_head);
+
+       free_page((unsigned long)data->user_page);
+       for (i = 0; i < data->nr_pages; i++)
+               free_page((unsigned long)data->data_pages[i]);
+       kfree(data);
+}
+
+static void perf_mmap_data_free(struct perf_counter *counter)
+{
+       struct perf_mmap_data *data = counter->data;
+
+       WARN_ON(atomic_read(&counter->mmap_count));
+
+       rcu_assign_pointer(counter->data, NULL);
+       call_rcu(&data->rcu_head, __perf_mmap_data_free);
+}
+
+static void perf_mmap_open(struct vm_area_struct *vma)
+{
+       struct perf_counter *counter = vma->vm_file->private_data;
+
+       atomic_inc(&counter->mmap_count);
+}
+
+static void perf_mmap_close(struct vm_area_struct *vma)
+{
+       struct perf_counter *counter = vma->vm_file->private_data;
+
+       WARN_ON_ONCE(counter->ctx->parent_ctx);
+       if (atomic_dec_and_mutex_lock(&counter->mmap_count, &counter->mmap_mutex)) {
+               struct user_struct *user = current_user();
+
+               atomic_long_sub(counter->data->nr_pages + 1, &user->locked_vm);
+               vma->vm_mm->locked_vm -= counter->data->nr_locked;
+               perf_mmap_data_free(counter);
+               mutex_unlock(&counter->mmap_mutex);
+       }
+}
+
+static struct vm_operations_struct perf_mmap_vmops = {
+       .open  = perf_mmap_open,
+       .close = perf_mmap_close,
+       .fault = perf_mmap_fault,
+};
+
+static int perf_mmap(struct file *file, struct vm_area_struct *vma)
+{
+       struct perf_counter *counter = file->private_data;
+       unsigned long user_locked, user_lock_limit;
+       struct user_struct *user = current_user();
+       unsigned long locked, lock_limit;
+       unsigned long vma_size;
+       unsigned long nr_pages;
+       long user_extra, extra;
+       int ret = 0;
+
+       if (!(vma->vm_flags & VM_SHARED) || (vma->vm_flags & VM_WRITE))
+               return -EINVAL;
+
+       vma_size = vma->vm_end - vma->vm_start;
+       nr_pages = (vma_size / PAGE_SIZE) - 1;
+
+       /*
+        * If we have data pages ensure they're a power-of-two number, so we
+        * can do bitmasks instead of modulo.
+        */
+       if (nr_pages != 0 && !is_power_of_2(nr_pages))
+               return -EINVAL;
+
+       if (vma_size != PAGE_SIZE * (1 + nr_pages))
+               return -EINVAL;
+
+       if (vma->vm_pgoff != 0)
+               return -EINVAL;
+
+       WARN_ON_ONCE(counter->ctx->parent_ctx);
+       mutex_lock(&counter->mmap_mutex);
+       if (atomic_inc_not_zero(&counter->mmap_count)) {
+               if (nr_pages != counter->data->nr_pages)
+                       ret = -EINVAL;
+               goto unlock;
+       }
+
+       user_extra = nr_pages + 1;
+       user_lock_limit = sysctl_perf_counter_mlock >> (PAGE_SHIFT - 10);
+
+       /*
+        * Increase the limit linearly with more CPUs:
+        */
+       user_lock_limit *= num_online_cpus();
+
+       user_locked = atomic_long_read(&user->locked_vm) + user_extra;
+
+       extra = 0;
+       if (user_locked > user_lock_limit)
+               extra = user_locked - user_lock_limit;
+
+       lock_limit = current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur;
+       lock_limit >>= PAGE_SHIFT;
+       locked = vma->vm_mm->locked_vm + extra;
+
+       if ((locked > lock_limit) && !capable(CAP_IPC_LOCK)) {
+               ret = -EPERM;
+               goto unlock;
+       }
+
+       WARN_ON(counter->data);
+       ret = perf_mmap_data_alloc(counter, nr_pages);
+       if (ret)
+               goto unlock;
+
+       atomic_set(&counter->mmap_count, 1);
+       atomic_long_add(user_extra, &user->locked_vm);
+       vma->vm_mm->locked_vm += extra;
+       counter->data->nr_locked = extra;
+unlock:
+       mutex_unlock(&counter->mmap_mutex);
+
+       vma->vm_flags &= ~VM_MAYWRITE;
+       vma->vm_flags |= VM_RESERVED;
+       vma->vm_ops = &perf_mmap_vmops;
+
+       return ret;
+}
+
+static int perf_fasync(int fd, struct file *filp, int on)
+{
+       struct inode *inode = filp->f_path.dentry->d_inode;
+       struct perf_counter *counter = filp->private_data;
+       int retval;
+
+       mutex_lock(&inode->i_mutex);
+       retval = fasync_helper(fd, filp, on, &counter->fasync);
+       mutex_unlock(&inode->i_mutex);
+
+       if (retval < 0)
+               return retval;
+
+       return 0;
+}
+
+static const struct file_operations perf_fops = {
+       .release                = perf_release,
+       .read                   = perf_read,
+       .poll                   = perf_poll,
+       .unlocked_ioctl         = perf_ioctl,
+       .compat_ioctl           = perf_ioctl,
+       .mmap                   = perf_mmap,
+       .fasync                 = perf_fasync,
+};
+
+/*
+ * Perf counter wakeup
+ *
+ * If there's data, ensure we set the poll() state and publish everything
+ * to user-space before waking everybody up.
+ */
+
+void perf_counter_wakeup(struct perf_counter *counter)
+{
+       wake_up_all(&counter->waitq);
+
+       if (counter->pending_kill) {
+               kill_fasync(&counter->fasync, SIGIO, counter->pending_kill);
+               counter->pending_kill = 0;
+       }
+}
+
+/*
+ * Pending wakeups
+ *
+ * Handle the case where we need to wakeup up from NMI (or rq->lock) context.
+ *
+ * The NMI bit means we cannot possibly take locks. Therefore, maintain a
+ * single linked list and use cmpxchg() to add entries lockless.
+ */
+
+static void perf_pending_counter(struct perf_pending_entry *entry)
+{
+       struct perf_counter *counter = container_of(entry,
+                       struct perf_counter, pending);
+
+       if (counter->pending_disable) {
+               counter->pending_disable = 0;
+               perf_counter_disable(counter);
+       }
+
+       if (counter->pending_wakeup) {
+               counter->pending_wakeup = 0;
+               perf_counter_wakeup(counter);
+       }
+}
+
+#define PENDING_TAIL ((struct perf_pending_entry *)-1UL)
+
+static DEFINE_PER_CPU(struct perf_pending_entry *, perf_pending_head) = {
+       PENDING_TAIL,
+};
+
+static void perf_pending_queue(struct perf_pending_entry *entry,
+                              void (*func)(struct perf_pending_entry *))
+{
+       struct perf_pending_entry **head;
+
+       if (cmpxchg(&entry->next, NULL, PENDING_TAIL) != NULL)
+               return;
+
+       entry->func = func;
+
+       head = &get_cpu_var(perf_pending_head);
+
+       do {
+               entry->next = *head;
+       } while (cmpxchg(head, entry->next, entry) != entry->next);
+
+       set_perf_counter_pending();
+
+       put_cpu_var(perf_pending_head);
+}
+
+static int __perf_pending_run(void)
+{
+       struct perf_pending_entry *list;
+       int nr = 0;
+
+       list = xchg(&__get_cpu_var(perf_pending_head), PENDING_TAIL);
+       while (list != PENDING_TAIL) {
+               void (*func)(struct perf_pending_entry *);
+               struct perf_pending_entry *entry = list;
+
+               list = list->next;
+
+               func = entry->func;
+               entry->next = NULL;
+               /*
+                * Ensure we observe the unqueue before we issue the wakeup,
+                * so that we won't be waiting forever.
+                * -- see perf_not_pending().
+                */
+               smp_wmb();
+
+               func(entry);
+               nr++;
+       }
+
+       return nr;
+}
+
+static inline int perf_not_pending(struct perf_counter *counter)
+{
+       /*
+        * If we flush on whatever cpu we run, there is a chance we don't
+        * need to wait.
+        */
+       get_cpu();
+       __perf_pending_run();
+       put_cpu();
+
+       /*
+        * Ensure we see the proper queue state before going to sleep
+        * so that we do not miss the wakeup. -- see perf_pending_handle()
+        */
+       smp_rmb();
+       return counter->pending.next == NULL;
+}
+
+static void perf_pending_sync(struct perf_counter *counter)
+{
+       wait_event(counter->waitq, perf_not_pending(counter));
+}
+
+void perf_counter_do_pending(void)
+{
+       __perf_pending_run();
+}
+
+/*
+ * Callchain support -- arch specific
+ */
+
+__weak struct perf_callchain_entry *perf_callchain(struct pt_regs *regs)
+{
+       return NULL;
+}
+
+/*
+ * Output
+ */
+
+struct perf_output_handle {
+       struct perf_counter     *counter;
+       struct perf_mmap_data   *data;
+       unsigned long           head;
+       unsigned long           offset;
+       int                     nmi;
+       int                     overflow;
+       int                     locked;
+       unsigned long           flags;
+};
+
+static void perf_output_wakeup(struct perf_output_handle *handle)
+{
+       atomic_set(&handle->data->poll, POLL_IN);
+
+       if (handle->nmi) {
+               handle->counter->pending_wakeup = 1;
+               perf_pending_queue(&handle->counter->pending,
+                                  perf_pending_counter);
+       } else
+               perf_counter_wakeup(handle->counter);
+}
+
+/*
+ * Curious locking construct.
+ *
+ * We need to ensure a later event doesn't publish a head when a former
+ * event isn't done writing. However since we need to deal with NMIs we
+ * cannot fully serialize things.
+ *
+ * What we do is serialize between CPUs so we only have to deal with NMI
+ * nesting on a single CPU.
+ *
+ * We only publish the head (and generate a wakeup) when the outer-most
+ * event completes.
+ */
+static void perf_output_lock(struct perf_output_handle *handle)
+{
+       struct perf_mmap_data *data = handle->data;
+       int cpu;
+
+       handle->locked = 0;
+
+       local_irq_save(handle->flags);
+       cpu = smp_processor_id();
+
+       if (in_nmi() && atomic_read(&data->lock) == cpu)
+               return;
+
+       while (atomic_cmpxchg(&data->lock, -1, cpu) != -1)
+               cpu_relax();
+
+       handle->locked = 1;
+}
+
+static void perf_output_unlock(struct perf_output_handle *handle)
+{
+       struct perf_mmap_data *data = handle->data;
+       unsigned long head;
+       int cpu;
+
+       data->done_head = data->head;
+
+       if (!handle->locked)
+               goto out;
+
+again:
+       /*
+        * The xchg implies a full barrier that ensures all writes are done
+        * before we publish the new head, matched by a rmb() in userspace when
+        * reading this position.
+        */
+       while ((head = atomic_long_xchg(&data->done_head, 0)))
+               data->user_page->data_head = head;
+
+       /*
+        * NMI can happen here, which means we can miss a done_head update.
+        */
+
+       cpu = atomic_xchg(&data->lock, -1);
+       WARN_ON_ONCE(cpu != smp_processor_id());
+
+       /*
+        * Therefore we have to validate we did not indeed do so.
+        */
+       if (unlikely(atomic_long_read(&data->done_head))) {
+               /*
+                * Since we had it locked, we can lock it again.
+                */
+               while (atomic_cmpxchg(&data->lock, -1, cpu) != -1)
+                       cpu_relax();
+
+               goto again;
+       }
+
+       if (atomic_xchg(&data->wakeup, 0))
+               perf_output_wakeup(handle);
+out:
+       local_irq_restore(handle->flags);
+}
+
+static int perf_output_begin(struct perf_output_handle *handle,
+                            struct perf_counter *counter, unsigned int size,
+                            int nmi, int overflow)
+{
+       struct perf_mmap_data *data;
+       unsigned int offset, head;
+
+       /*
+        * For inherited counters we send all the output towards the parent.
+        */
+       if (counter->parent)
+               counter = counter->parent;
+
+       rcu_read_lock();
+       data = rcu_dereference(counter->data);
+       if (!data)
+               goto out;
+
+       handle->data     = data;
+       handle->counter  = counter;
+       handle->nmi      = nmi;
+       handle->overflow = overflow;
+
+       if (!data->nr_pages)
+               goto fail;
+
+       perf_output_lock(handle);
+
+       do {
+               offset = head = atomic_long_read(&data->head);
+               head += size;
+       } while (atomic_long_cmpxchg(&data->head, offset, head) != offset);
+
+       handle->offset  = offset;
+       handle->head    = head;
+
+       if ((offset >> PAGE_SHIFT) != (head >> PAGE_SHIFT))
+               atomic_set(&data->wakeup, 1);
+
+       return 0;
+
+fail:
+       perf_output_wakeup(handle);
+out:
+       rcu_read_unlock();
+
+       return -ENOSPC;
+}
+
+static void perf_output_copy(struct perf_output_handle *handle,
+                            const void *buf, unsigned int len)
+{
+       unsigned int pages_mask;
+       unsigned int offset;
+       unsigned int size;
+       void **pages;
+
+       offset          = handle->offset;
+       pages_mask      = handle->data->nr_pages - 1;
+       pages           = handle->data->data_pages;
+
+       do {
+               unsigned int page_offset;
+               int nr;
+
+               nr          = (offset >> PAGE_SHIFT) & pages_mask;
+               page_offset = offset & (PAGE_SIZE - 1);
+               size        = min_t(unsigned int, PAGE_SIZE - page_offset, len);
+
+               memcpy(pages[nr] + page_offset, buf, size);
+
+               len         -= size;
+               buf         += size;
+               offset      += size;
+       } while (len);
+
+       handle->offset = offset;
+
+       /*
+        * Check we didn't copy past our reservation window, taking the
+        * possible unsigned int wrap into account.
+        */
+       WARN_ON_ONCE(((long)(handle->head - handle->offset)) < 0);
+}
+
+#define perf_output_put(handle, x) \
+       perf_output_copy((handle), &(x), sizeof(x))
+
+static void perf_output_end(struct perf_output_handle *handle)
+{
+       struct perf_counter *counter = handle->counter;
+       struct perf_mmap_data *data = handle->data;
+
+       int wakeup_events = counter->attr.wakeup_events;
+
+       if (handle->overflow && wakeup_events) {
+               int events = atomic_inc_return(&data->events);
+               if (events >= wakeup_events) {
+                       atomic_sub(wakeup_events, &data->events);
+                       atomic_set(&data->wakeup, 1);
+               }
+       }
+
+       perf_output_unlock(handle);
+       rcu_read_unlock();
+}
+
+static u32 perf_counter_pid(struct perf_counter *counter, struct task_struct *p)
+{
+       /*
+        * only top level counters have the pid namespace they were created in
+        */
+       if (counter->parent)
+               counter = counter->parent;
+
+       return task_tgid_nr_ns(p, counter->ns);
+}
+
+static u32 perf_counter_tid(struct perf_counter *counter, struct task_struct *p)
+{
+       /*
+        * only top level counters have the pid namespace they were created in
+        */
+       if (counter->parent)
+               counter = counter->parent;
+
+       return task_pid_nr_ns(p, counter->ns);
+}
+
+static void perf_counter_output(struct perf_counter *counter, int nmi,
+                               struct perf_sample_data *data)
+{
+       int ret;
+       u64 sample_type = counter->attr.sample_type;
+       struct perf_output_handle handle;
+       struct perf_event_header header;
+       u64 ip;
+       struct {
+               u32 pid, tid;
+       } tid_entry;
+       struct {
+               u64 id;
+               u64 counter;
+       } group_entry;
+       struct perf_callchain_entry *callchain = NULL;
+       int callchain_size = 0;
+       u64 time;
+       struct {
+               u32 cpu, reserved;
+       } cpu_entry;
+
+       header.type = 0;
+       header.size = sizeof(header);
+
+       header.misc = PERF_EVENT_MISC_OVERFLOW;
+       header.misc |= perf_misc_flags(data->regs);
+
+       if (sample_type & PERF_SAMPLE_IP) {
+               ip = perf_instruction_pointer(data->regs);
+               header.type |= PERF_SAMPLE_IP;
+               header.size += sizeof(ip);
+       }
+
+       if (sample_type & PERF_SAMPLE_TID) {
+               /* namespace issues */
+               tid_entry.pid = perf_counter_pid(counter, current);
+               tid_entry.tid = perf_counter_tid(counter, current);
+
+               header.type |= PERF_SAMPLE_TID;
+               header.size += sizeof(tid_entry);
+       }
+
+       if (sample_type & PERF_SAMPLE_TIME) {
+               /*
+                * Maybe do better on x86 and provide cpu_clock_nmi()
+                */
+               time = sched_clock();
+
+               header.type |= PERF_SAMPLE_TIME;
+               header.size += sizeof(u64);
+       }
+
+       if (sample_type & PERF_SAMPLE_ADDR) {
+               header.type |= PERF_SAMPLE_ADDR;
+               header.size += sizeof(u64);
+       }
+
+       if (sample_type & PERF_SAMPLE_ID) {
+               header.type |= PERF_SAMPLE_ID;
+               header.size += sizeof(u64);
+       }
+
+       if (sample_type & PERF_SAMPLE_CPU) {
+               header.type |= PERF_SAMPLE_CPU;
+               header.size += sizeof(cpu_entry);
+
+               cpu_entry.cpu = raw_smp_processor_id();
+       }
+
+       if (sample_type & PERF_SAMPLE_PERIOD) {
+               header.type |= PERF_SAMPLE_PERIOD;
+               header.size += sizeof(u64);
+       }
+
+       if (sample_type & PERF_SAMPLE_GROUP) {
+               header.type |= PERF_SAMPLE_GROUP;
+               header.size += sizeof(u64) +
+                       counter->nr_siblings * sizeof(group_entry);
+       }
+
+       if (sample_type & PERF_SAMPLE_CALLCHAIN) {
+               callchain = perf_callchain(data->regs);
+
+               if (callchain) {
+                       callchain_size = (1 + callchain->nr) * sizeof(u64);
+
+                       header.type |= PERF_SAMPLE_CALLCHAIN;
+                       header.size += callchain_size;
+               }
+       }
+
+       ret = perf_output_begin(&handle, counter, header.size, nmi, 1);
+       if (ret)
+               return;
+
+       perf_output_put(&handle, header);
+
+       if (sample_type & PERF_SAMPLE_IP)
+               perf_output_put(&handle, ip);
+
+       if (sample_type & PERF_SAMPLE_TID)
+               perf_output_put(&handle, tid_entry);
+
+       if (sample_type & PERF_SAMPLE_TIME)
+               perf_output_put(&handle, time);
+
+       if (sample_type & PERF_SAMPLE_ADDR)
+               perf_output_put(&handle, data->addr);
+
+       if (sample_type & PERF_SAMPLE_ID)
+               perf_output_put(&handle, counter->id);
+
+       if (sample_type & PERF_SAMPLE_CPU)
+               perf_output_put(&handle, cpu_entry);
+
+       if (sample_type & PERF_SAMPLE_PERIOD)
+               perf_output_put(&handle, data->period);
+
+       /*
+        * XXX PERF_SAMPLE_GROUP vs inherited counters seems difficult.
+        */
+       if (sample_type & PERF_SAMPLE_GROUP) {
+               struct perf_counter *leader, *sub;
+               u64 nr = counter->nr_siblings;
+
+               perf_output_put(&handle, nr);
+
+               leader = counter->group_leader;
+               list_for_each_entry(sub, &leader->sibling_list, list_entry) {
+                       if (sub != counter)
+                               sub->pmu->read(sub);
+
+                       group_entry.id = sub->id;
+                       group_entry.counter = atomic64_read(&sub->count);
+
+                       perf_output_put(&handle, group_entry);
+               }
+       }
+
+       if (callchain)
+               perf_output_copy(&handle, callchain, callchain_size);
+
+       perf_output_end(&handle);
+}
+
+/*
+ * fork tracking
+ */
+
+struct perf_fork_event {
+       struct task_struct      *task;
+
+       struct {
+               struct perf_event_header        header;
+
+               u32                             pid;
+               u32                             ppid;
+       } event;
+};
+
+static void perf_counter_fork_output(struct perf_counter *counter,
+                                    struct perf_fork_event *fork_event)
+{
+       struct perf_output_handle handle;
+       int size = fork_event->event.header.size;
+       struct task_struct *task = fork_event->task;
+       int ret = perf_output_begin(&handle, counter, size, 0, 0);
+
+       if (ret)
+               return;
+
+       fork_event->event.pid = perf_counter_pid(counter, task);
+       fork_event->event.ppid = perf_counter_pid(counter, task->real_parent);
+
+       perf_output_put(&handle, fork_event->event);
+       perf_output_end(&handle);
+}
+
+static int perf_counter_fork_match(struct perf_counter *counter)
+{
+       if (counter->attr.comm || counter->attr.mmap)
+               return 1;
+
+       return 0;
+}
+
+static void perf_counter_fork_ctx(struct perf_counter_context *ctx,
+                                 struct perf_fork_event *fork_event)
+{
+       struct perf_counter *counter;
+
+       if (system_state != SYSTEM_RUNNING || list_empty(&ctx->event_list))
+               return;
+
+       rcu_read_lock();
+       list_for_each_entry_rcu(counter, &ctx->event_list, event_entry) {
+               if (perf_counter_fork_match(counter))
+                       perf_counter_fork_output(counter, fork_event);
+       }
+       rcu_read_unlock();
+}
+
+static void perf_counter_fork_event(struct perf_fork_event *fork_event)
+{
+       struct perf_cpu_context *cpuctx;
+       struct perf_counter_context *ctx;
+
+       cpuctx = &get_cpu_var(perf_cpu_context);
+       perf_counter_fork_ctx(&cpuctx->ctx, fork_event);
+       put_cpu_var(perf_cpu_context);
+
+       rcu_read_lock();
+       /*
+        * doesn't really matter which of the child contexts the
+        * events ends up in.
+        */
+       ctx = rcu_dereference(current->perf_counter_ctxp);
+       if (ctx)
+               perf_counter_fork_ctx(ctx, fork_event);
+       rcu_read_unlock();
+}
+
+void perf_counter_fork(struct task_struct *task)
+{
+       struct perf_fork_event fork_event;
+
+       if (!atomic_read(&nr_comm_counters) &&
+           !atomic_read(&nr_mmap_counters))
+               return;
+
+       fork_event = (struct perf_fork_event){
+               .task   = task,
+               .event  = {
+                       .header = {
+                               .type = PERF_EVENT_FORK,
+                               .size = sizeof(fork_event.event),
+                       },
+               },
+       };
+
+       perf_counter_fork_event(&fork_event);
+}
+
+/*
+ * comm tracking
+ */
+
+struct perf_comm_event {
+       struct task_struct      *task;
+       char                    *comm;
+       int                     comm_size;
+
+       struct {
+               struct perf_event_header        header;
+
+               u32                             pid;
+               u32                             tid;
+       } event;
+};
+
+static void perf_counter_comm_output(struct perf_counter *counter,
+                                    struct perf_comm_event *comm_event)
+{
+       struct perf_output_handle handle;
+       int size = comm_event->event.header.size;
+       int ret = perf_output_begin(&handle, counter, size, 0, 0);
+
+       if (ret)
+               return;
+
+       comm_event->event.pid = perf_counter_pid(counter, comm_event->task);
+       comm_event->event.tid = perf_counter_tid(counter, comm_event->task);
+
+       perf_output_put(&handle, comm_event->event);
+       perf_output_copy(&handle, comm_event->comm,
+                                  comm_event->comm_size);
+       perf_output_end(&handle);
+}
+
+static int perf_counter_comm_match(struct perf_counter *counter)
+{
+       if (counter->attr.comm)
+               return 1;
+
+       return 0;
+}
+
+static void perf_counter_comm_ctx(struct perf_counter_context *ctx,
+                                 struct perf_comm_event *comm_event)
+{
+       struct perf_counter *counter;
+
+       if (system_state != SYSTEM_RUNNING || list_empty(&ctx->event_list))
+               return;
+
+       rcu_read_lock();
+       list_for_each_entry_rcu(counter, &ctx->event_list, event_entry) {
+               if (perf_counter_comm_match(counter))
+                       perf_counter_comm_output(counter, comm_event);
+       }
+       rcu_read_unlock();
+}
+
+static void perf_counter_comm_event(struct perf_comm_event *comm_event)
+{
+       struct perf_cpu_context *cpuctx;
+       struct perf_counter_context *ctx;
+       unsigned int size;
+       char *comm = comm_event->task->comm;
+
+       size = ALIGN(strlen(comm)+1, sizeof(u64));
+
+       comm_event->comm = comm;
+       comm_event->comm_size = size;
+
+       comm_event->event.header.size = sizeof(comm_event->event) + size;
+
+       cpuctx = &get_cpu_var(perf_cpu_context);
+       perf_counter_comm_ctx(&cpuctx->ctx, comm_event);
+       put_cpu_var(perf_cpu_context);
+
+       rcu_read_lock();
+       /*
+        * doesn't really matter which of the child contexts the
+        * events ends up in.
+        */
+       ctx = rcu_dereference(current->perf_counter_ctxp);
+       if (ctx)
+               perf_counter_comm_ctx(ctx, comm_event);
+       rcu_read_unlock();
+}
+
+void perf_counter_comm(struct task_struct *task)
+{
+       struct perf_comm_event comm_event;
+
+       if (!atomic_read(&nr_comm_counters))
+               return;
+
+       comm_event = (struct perf_comm_event){
+               .task   = task,
+               .event  = {
+                       .header = { .type = PERF_EVENT_COMM, },
+               },
+       };
+
+       perf_counter_comm_event(&comm_event);
+}
+
+/*
+ * mmap tracking
+ */
+
+struct perf_mmap_event {
+       struct vm_area_struct   *vma;
+
+       const char              *file_name;
+       int                     file_size;
+
+       struct {
+               struct perf_event_header        header;
+
+               u32                             pid;
+               u32                             tid;
+               u64                             start;
+               u64                             len;
+               u64                             pgoff;
+       } event;
+};
+
+static void perf_counter_mmap_output(struct perf_counter *counter,
+                                    struct perf_mmap_event *mmap_event)
+{
+       struct perf_output_handle handle;
+       int size = mmap_event->event.header.size;
+       int ret = perf_output_begin(&handle, counter, size, 0, 0);
+
+       if (ret)
+               return;
+
+       mmap_event->event.pid = perf_counter_pid(counter, current);
+       mmap_event->event.tid = perf_counter_tid(counter, current);
+
+       perf_output_put(&handle, mmap_event->event);
+       perf_output_copy(&handle, mmap_event->file_name,
+                                  mmap_event->file_size);
+       perf_output_end(&handle);
+}
+
+static int perf_counter_mmap_match(struct perf_counter *counter,
+                                  struct perf_mmap_event *mmap_event)
+{
+       if (counter->attr.mmap)
+               return 1;
+
+       return 0;
+}
+
+static void perf_counter_mmap_ctx(struct perf_counter_context *ctx,
+                                 struct perf_mmap_event *mmap_event)
+{
+       struct perf_counter *counter;
+
+       if (system_state != SYSTEM_RUNNING || list_empty(&ctx->event_list))
+               return;
+
+       rcu_read_lock();
+       list_for_each_entry_rcu(counter, &ctx->event_list, event_entry) {
+               if (perf_counter_mmap_match(counter, mmap_event))
+                       perf_counter_mmap_output(counter, mmap_event);
+       }
+       rcu_read_unlock();
+}
+
+static void perf_counter_mmap_event(struct perf_mmap_event *mmap_event)
+{
+       struct perf_cpu_context *cpuctx;
+       struct perf_counter_context *ctx;
+       struct vm_area_struct *vma = mmap_event->vma;
+       struct file *file = vma->vm_file;
+       unsigned int size;
+       char tmp[16];
+       char *buf = NULL;
+       const char *name;
+
+       if (file) {
+               buf = kzalloc(PATH_MAX, GFP_KERNEL);
+               if (!buf) {
+                       name = strncpy(tmp, "//enomem", sizeof(tmp));
+                       goto got_name;
+               }
+               name = d_path(&file->f_path, buf, PATH_MAX);
+               if (IS_ERR(name)) {
+                       name = strncpy(tmp, "//toolong", sizeof(tmp));
+                       goto got_name;
+               }
+       } else {
+               name = arch_vma_name(mmap_event->vma);
+               if (name)
+                       goto got_name;
+
+               if (!vma->vm_mm) {
+                       name = strncpy(tmp, "[vdso]", sizeof(tmp));
+                       goto got_name;
+               }
+
+               name = strncpy(tmp, "//anon", sizeof(tmp));
+               goto got_name;
+       }
+
+got_name:
+       size = ALIGN(strlen(name)+1, sizeof(u64));
+
+       mmap_event->file_name = name;
+       mmap_event->file_size = size;
+
+       mmap_event->event.header.size = sizeof(mmap_event->event) + size;
+
+       cpuctx = &get_cpu_var(perf_cpu_context);
+       perf_counter_mmap_ctx(&cpuctx->ctx, mmap_event);
+       put_cpu_var(perf_cpu_context);
+
+       rcu_read_lock();
+       /*
+        * doesn't really matter which of the child contexts the
+        * events ends up in.
+        */
+       ctx = rcu_dereference(current->perf_counter_ctxp);
+       if (ctx)
+               perf_counter_mmap_ctx(ctx, mmap_event);
+       rcu_read_unlock();
+
+       kfree(buf);
+}
+
+void __perf_counter_mmap(struct vm_area_struct *vma)
+{
+       struct perf_mmap_event mmap_event;
+
+       if (!atomic_read(&nr_mmap_counters))
+               return;
+
+       mmap_event = (struct perf_mmap_event){
+               .vma    = vma,
+               .event  = {
+                       .header = { .type = PERF_EVENT_MMAP, },
+                       .start  = vma->vm_start,
+                       .len    = vma->vm_end - vma->vm_start,
+                       .pgoff  = vma->vm_pgoff,
+               },
+       };
+
+       perf_counter_mmap_event(&mmap_event);
+}
+
+/*
+ * Log sample_period changes so that analyzing tools can re-normalize the
+ * event flow.
+ */
+
+struct freq_event {
+       struct perf_event_header        header;
+       u64                             time;
+       u64                             id;
+       u64                             period;
+};
+
+static void perf_log_period(struct perf_counter *counter, u64 period)
+{
+       struct perf_output_handle handle;
+       struct freq_event event;
+       int ret;
+
+       if (counter->hw.sample_period == period)
+               return;
+
+       if (counter->attr.sample_type & PERF_SAMPLE_PERIOD)
+               return;
+
+       event = (struct freq_event) {
+               .header = {
+                       .type = PERF_EVENT_PERIOD,
+                       .misc = 0,
+                       .size = sizeof(event),
+               },
+               .time = sched_clock(),
+               .id = counter->id,
+               .period = period,
+       };
+
+       ret = perf_output_begin(&handle, counter, sizeof(event), 1, 0);
+       if (ret)
+               return;
+
+       perf_output_put(&handle, event);
+       perf_output_end(&handle);
+}
+
+/*
+ * IRQ throttle logging
+ */
+
+static void perf_log_throttle(struct perf_counter *counter, int enable)
+{
+       struct perf_output_handle handle;
+       int ret;
+
+       struct {
+               struct perf_event_header        header;
+               u64                             time;
+               u64                             id;
+       } throttle_event = {
+               .header = {
+                       .type = PERF_EVENT_THROTTLE + 1,
+                       .misc = 0,
+                       .size = sizeof(throttle_event),
+               },
+               .time   = sched_clock(),
+               .id     = counter->id,
+       };
+
+       ret = perf_output_begin(&handle, counter, sizeof(throttle_event), 1, 0);
+       if (ret)
+               return;
+
+       perf_output_put(&handle, throttle_event);
+       perf_output_end(&handle);
+}
+
+/*
+ * Generic counter overflow handling.
+ */
+
+int perf_counter_overflow(struct perf_counter *counter, int nmi,
+                         struct perf_sample_data *data)
+{
+       int events = atomic_read(&counter->event_limit);
+       int throttle = counter->pmu->unthrottle != NULL;
+       struct hw_perf_counter *hwc = &counter->hw;
+       int ret = 0;
+
+       if (!throttle) {
+               hwc->interrupts++;
+       } else {
+               if (hwc->interrupts != MAX_INTERRUPTS) {
+                       hwc->interrupts++;
+                       if (HZ * hwc->interrupts >
+                                       (u64)sysctl_perf_counter_sample_rate) {
+                               hwc->interrupts = MAX_INTERRUPTS;
+                               perf_log_throttle(counter, 0);
+                               ret = 1;
+                       }
+               } else {
+                       /*
+                        * Keep re-disabling counters even though on the previous
+                        * pass we disabled it - just in case we raced with a
+                        * sched-in and the counter got enabled again:
+                        */
+                       ret = 1;
+               }
+       }
+
+       if (counter->attr.freq) {
+               u64 now = sched_clock();
+               s64 delta = now - hwc->freq_stamp;
+
+               hwc->freq_stamp = now;
+
+               if (delta > 0 && delta < TICK_NSEC)
+                       perf_adjust_period(counter, NSEC_PER_SEC / (int)delta);
+       }
+
+       /*
+        * XXX event_limit might not quite work as expected on inherited
+        * counters
+        */
+
+       counter->pending_kill = POLL_IN;
+       if (events && atomic_dec_and_test(&counter->event_limit)) {
+               ret = 1;
+               counter->pending_kill = POLL_HUP;
+               if (nmi) {
+                       counter->pending_disable = 1;
+                       perf_pending_queue(&counter->pending,
+                                          perf_pending_counter);
+               } else
+                       perf_counter_disable(counter);
+       }
+
+       perf_counter_output(counter, nmi, data);
+       return ret;
+}
+
+/*
+ * Generic software counter infrastructure
+ */
+
+static void perf_swcounter_update(struct perf_counter *counter)
+{
+       struct hw_perf_counter *hwc = &counter->hw;
+       u64 prev, now;
+       s64 delta;
+
+again:
+       prev = atomic64_read(&hwc->prev_count);
+       now = atomic64_read(&hwc->count);
+       if (atomic64_cmpxchg(&hwc->prev_count, prev, now) != prev)
+               goto again;
+
+       delta = now - prev;
+
+       atomic64_add(delta, &counter->count);
+       atomic64_sub(delta, &hwc->period_left);
+}
+
+static void perf_swcounter_set_period(struct perf_counter *counter)
+{
+       struct hw_perf_counter *hwc = &counter->hw;
+       s64 left = atomic64_read(&hwc->period_left);
+       s64 period = hwc->sample_period;
+
+       if (unlikely(left <= -period)) {
+               left = period;
+               atomic64_set(&hwc->period_left, left);
+               hwc->last_period = period;
+       }
+
+       if (unlikely(left <= 0)) {
+               left += period;
+               atomic64_add(period, &hwc->period_left);
+               hwc->last_period = period;
+       }
+
+       atomic64_set(&hwc->prev_count, -left);
+       atomic64_set(&hwc->count, -left);
+}
+
+static enum hrtimer_restart perf_swcounter_hrtimer(struct hrtimer *hrtimer)
+{
+       enum hrtimer_restart ret = HRTIMER_RESTART;
+       struct perf_sample_data data;
+       struct perf_counter *counter;
+       u64 period;
+
+       counter = container_of(hrtimer, struct perf_counter, hw.hrtimer);
+       counter->pmu->read(counter);
+
+       data.addr = 0;
+       data.regs = get_irq_regs();
+       /*
+        * In case we exclude kernel IPs or are somehow not in interrupt
+        * context, provide the next best thing, the user IP.
+        */
+       if ((counter->attr.exclude_kernel || !data.regs) &&
+                       !counter->attr.exclude_user)
+               data.regs = task_pt_regs(current);
+
+       if (data.regs) {
+               if (perf_counter_overflow(counter, 0, &data))
+                       ret = HRTIMER_NORESTART;
+       }
+
+       period = max_t(u64, 10000, counter->hw.sample_period);
+       hrtimer_forward_now(hrtimer, ns_to_ktime(period));
+
+       return ret;
+}
+
+static void perf_swcounter_overflow(struct perf_counter *counter,
+                                   int nmi, struct pt_regs *regs, u64 addr)
+{
+       struct perf_sample_data data = {
+               .regs   = regs,
+               .addr   = addr,
+               .period = counter->hw.last_period,
+       };
+
+       perf_swcounter_update(counter);
+       perf_swcounter_set_period(counter);
+       if (perf_counter_overflow(counter, nmi, &data))
+               /* soft-disable the counter */
+               ;
+
+}
+
+static int perf_swcounter_is_counting(struct perf_counter *counter)
+{
+       struct perf_counter_context *ctx;
+       unsigned long flags;
+       int count;
+
+       if (counter->state == PERF_COUNTER_STATE_ACTIVE)
+               return 1;
+
+       if (counter->state != PERF_COUNTER_STATE_INACTIVE)
+               return 0;
+
+       /*
+        * If the counter is inactive, it could be just because
+        * its task is scheduled out, or because it's in a group
+        * which could not go on the PMU.  We want to count in
+        * the first case but not the second.  If the context is
+        * currently active then an inactive software counter must
+        * be the second case.  If it's not currently active then
+        * we need to know whether the counter was active when the
+        * context was last active, which we can determine by
+        * comparing counter->tstamp_stopped with ctx->time.
+        *
+        * We are within an RCU read-side critical section,
+        * which protects the existence of *ctx.
+        */
+       ctx = counter->ctx;
+       spin_lock_irqsave(&ctx->lock, flags);
+       count = 1;
+       /* Re-check state now we have the lock */
+       if (counter->state < PERF_COUNTER_STATE_INACTIVE ||
+           counter->ctx->is_active ||
+           counter->tstamp_stopped < ctx->time)
+               count = 0;
+       spin_unlock_irqrestore(&ctx->lock, flags);
+       return count;
+}
+
+static int perf_swcounter_match(struct perf_counter *counter,
+                               enum perf_type_id type,
+                               u32 event, struct pt_regs *regs)
+{
+       if (!perf_swcounter_is_counting(counter))
+               return 0;
+
+       if (counter->attr.type != type)
+               return 0;
+       if (counter->attr.config != event)
+               return 0;
+
+       if (regs) {
+               if (counter->attr.exclude_user && user_mode(regs))
+                       return 0;
+
+               if (counter->attr.exclude_kernel && !user_mode(regs))
+                       return 0;
+       }
+
+       return 1;
+}
+
+static void perf_swcounter_add(struct perf_counter *counter, u64 nr,
+                              int nmi, struct pt_regs *regs, u64 addr)
+{
+       int neg = atomic64_add_negative(nr, &counter->hw.count);
+
+       if (counter->hw.sample_period && !neg && regs)
+               perf_swcounter_overflow(counter, nmi, regs, addr);
+}
+
+static void perf_swcounter_ctx_event(struct perf_counter_context *ctx,
+                                    enum perf_type_id type, u32 event,
+                                    u64 nr, int nmi, struct pt_regs *regs,
+                                    u64 addr)
+{
+       struct perf_counter *counter;
+
+       if (system_state != SYSTEM_RUNNING || list_empty(&ctx->event_list))
+               return;
+
+       rcu_read_lock();
+       list_for_each_entry_rcu(counter, &ctx->event_list, event_entry) {
+               if (perf_swcounter_match(counter, type, event, regs))
+                       perf_swcounter_add(counter, nr, nmi, regs, addr);
+       }
+       rcu_read_unlock();
+}
+
+static int *perf_swcounter_recursion_context(struct perf_cpu_context *cpuctx)
+{
+       if (in_nmi())
+               return &cpuctx->recursion[3];
+
+       if (in_irq())
+               return &cpuctx->recursion[2];
+
+       if (in_softirq())
+               return &cpuctx->recursion[1];
+
+       return &cpuctx->recursion[0];
+}
+
+static void __perf_swcounter_event(enum perf_type_id type, u32 event,
+                                  u64 nr, int nmi, struct pt_regs *regs,
+                                  u64 addr)
+{
+       struct perf_cpu_context *cpuctx = &get_cpu_var(perf_cpu_context);
+       int *recursion = perf_swcounter_recursion_context(cpuctx);
+       struct perf_counter_context *ctx;
+
+       if (*recursion)
+               goto out;
+
+       (*recursion)++;
+       barrier();
+
+       perf_swcounter_ctx_event(&cpuctx->ctx, type, event,
+                                nr, nmi, regs, addr);
+       rcu_read_lock();
+       /*
+        * doesn't really matter which of the child contexts the
+        * events ends up in.
+        */
+       ctx = rcu_dereference(current->perf_counter_ctxp);
+       if (ctx)
+               perf_swcounter_ctx_event(ctx, type, event, nr, nmi, regs, addr);
+       rcu_read_unlock();
+
+       barrier();
+       (*recursion)--;
+
+out:
+       put_cpu_var(perf_cpu_context);
+}
+
+void
+perf_swcounter_event(u32 event, u64 nr, int nmi, struct pt_regs *regs, u64 addr)
+{
+       __perf_swcounter_event(PERF_TYPE_SOFTWARE, event, nr, nmi, regs, addr);
+}
+
+static void perf_swcounter_read(struct perf_counter *counter)
+{
+       perf_swcounter_update(counter);
+}
+
+static int perf_swcounter_enable(struct perf_counter *counter)
+{
+       perf_swcounter_set_period(counter);
+       return 0;
+}
+
+static void perf_swcounter_disable(struct perf_counter *counter)
+{
+       perf_swcounter_update(counter);
+}
+
+static const struct pmu perf_ops_generic = {
+       .enable         = perf_swcounter_enable,
+       .disable        = perf_swcounter_disable,
+       .read           = perf_swcounter_read,
+};
+
+/*
+ * Software counter: cpu wall time clock
+ */
+
+static void cpu_clock_perf_counter_update(struct perf_counter *counter)
+{
+       int cpu = raw_smp_processor_id();
+       s64 prev;
+       u64 now;
+
+       now = cpu_clock(cpu);
+       prev = atomic64_read(&counter->hw.prev_count);
+       atomic64_set(&counter->hw.prev_count, now);
+       atomic64_add(now - prev, &counter->count);
+}
+
+static int cpu_clock_perf_counter_enable(struct perf_counter *counter)
+{
+       struct hw_perf_counter *hwc = &counter->hw;
+       int cpu = raw_smp_processor_id();
+
+       atomic64_set(&hwc->prev_count, cpu_clock(cpu));
+       hrtimer_init(&hwc->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
+       hwc->hrtimer.function = perf_swcounter_hrtimer;
+       if (hwc->sample_period) {
+               u64 period = max_t(u64, 10000, hwc->sample_period);
+               __hrtimer_start_range_ns(&hwc->hrtimer,
+                               ns_to_ktime(period), 0,
+                               HRTIMER_MODE_REL, 0);
+       }
+
+       return 0;
+}
+
+static void cpu_clock_perf_counter_disable(struct perf_counter *counter)
+{
+       if (counter->hw.sample_period)
+               hrtimer_cancel(&counter->hw.hrtimer);
+       cpu_clock_perf_counter_update(counter);
+}
+
+static void cpu_clock_perf_counter_read(struct perf_counter *counter)
+{
+       cpu_clock_perf_counter_update(counter);
+}
+
+static const struct pmu perf_ops_cpu_clock = {
+       .enable         = cpu_clock_perf_counter_enable,
+       .disable        = cpu_clock_perf_counter_disable,
+       .read           = cpu_clock_perf_counter_read,
+};
+
+/*
+ * Software counter: task time clock
+ */
+
+static void task_clock_perf_counter_update(struct perf_counter *counter, u64 now)
+{
+       u64 prev;
+       s64 delta;
+
+       prev = atomic64_xchg(&counter->hw.prev_count, now);
+       delta = now - prev;
+       atomic64_add(delta, &counter->count);
+}
+
+static int task_clock_perf_counter_enable(struct perf_counter *counter)
+{
+       struct hw_perf_counter *hwc = &counter->hw;
+       u64 now;
+
+       now = counter->ctx->time;
+
+       atomic64_set(&hwc->prev_count, now);
+       hrtimer_init(&hwc->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
+       hwc->hrtimer.function = perf_swcounter_hrtimer;
+       if (hwc->sample_period) {
+               u64 period = max_t(u64, 10000, hwc->sample_period);
+               __hrtimer_start_range_ns(&hwc->hrtimer,
+                               ns_to_ktime(period), 0,
+                               HRTIMER_MODE_REL, 0);
+       }
+
+       return 0;
+}
+
+static void task_clock_perf_counter_disable(struct perf_counter *counter)
+{
+       if (counter->hw.sample_period)
+               hrtimer_cancel(&counter->hw.hrtimer);
+       task_clock_perf_counter_update(counter, counter->ctx->time);
+
+}
+
+static void task_clock_perf_counter_read(struct perf_counter *counter)
+{
+       u64 time;
+
+       if (!in_nmi()) {
+               update_context_time(counter->ctx);
+               time = counter->ctx->time;
+       } else {
+               u64 now = perf_clock();
+               u64 delta = now - counter->ctx->timestamp;
+               time = counter->ctx->time + delta;
+       }
+
+       task_clock_perf_counter_update(counter, time);
+}
+
+static const struct pmu perf_ops_task_clock = {
+       .enable         = task_clock_perf_counter_enable,
+       .disable        = task_clock_perf_counter_disable,
+       .read           = task_clock_perf_counter_read,
+};
+
+/*
+ * Software counter: cpu migrations
+ */
+void perf_counter_task_migration(struct task_struct *task, int cpu)
+{
+       struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu);
+       struct perf_counter_context *ctx;
+
+       perf_swcounter_ctx_event(&cpuctx->ctx, PERF_TYPE_SOFTWARE,
+                                PERF_COUNT_SW_CPU_MIGRATIONS,
+                                1, 1, NULL, 0);
+
+       ctx = perf_pin_task_context(task);
+       if (ctx) {
+               perf_swcounter_ctx_event(ctx, PERF_TYPE_SOFTWARE,
+                                        PERF_COUNT_SW_CPU_MIGRATIONS,
+                                        1, 1, NULL, 0);
+               perf_unpin_context(ctx);
+       }
+}
+
+#ifdef CONFIG_EVENT_PROFILE
+void perf_tpcounter_event(int event_id)
+{
+       struct pt_regs *regs = get_irq_regs();
+
+       if (!regs)
+               regs = task_pt_regs(current);
+
+       __perf_swcounter_event(PERF_TYPE_TRACEPOINT, event_id, 1, 1, regs, 0);
+}
+EXPORT_SYMBOL_GPL(perf_tpcounter_event);
+
+extern int ftrace_profile_enable(int);
+extern void ftrace_profile_disable(int);
+
+static void tp_perf_counter_destroy(struct perf_counter *counter)
+{
+       ftrace_profile_disable(perf_event_id(&counter->attr));
+}
+
+static const struct pmu *tp_perf_counter_init(struct perf_counter *counter)
+{
+       int event_id = perf_event_id(&counter->attr);
+       int ret;
+
+       ret = ftrace_profile_enable(event_id);
+       if (ret)
+               return NULL;
+
+       counter->destroy = tp_perf_counter_destroy;
+
+       return &perf_ops_generic;
+}
+#else
+static const struct pmu *tp_perf_counter_init(struct perf_counter *counter)
+{
+       return NULL;
+}
+#endif
+
+static const struct pmu *sw_perf_counter_init(struct perf_counter *counter)
+{
+       const struct pmu *pmu = NULL;
+
+       /*
+        * Software counters (currently) can't in general distinguish
+        * between user, kernel and hypervisor events.
+        * However, context switches and cpu migrations are considered
+        * to be kernel events, and page faults are never hypervisor
+        * events.
+        */
+       switch (counter->attr.config) {
+       case PERF_COUNT_SW_CPU_CLOCK:
+               pmu = &perf_ops_cpu_clock;
+
+               break;
+       case PERF_COUNT_SW_TASK_CLOCK:
+               /*
+                * If the user instantiates this as a per-cpu counter,
+                * use the cpu_clock counter instead.
+                */
+               if (counter->ctx->task)
+                       pmu = &perf_ops_task_clock;
+               else
+                       pmu = &perf_ops_cpu_clock;
+
+               break;
+       case PERF_COUNT_SW_PAGE_FAULTS:
+       case PERF_COUNT_SW_PAGE_FAULTS_MIN:
+       case PERF_COUNT_SW_PAGE_FAULTS_MAJ:
+       case PERF_COUNT_SW_CONTEXT_SWITCHES:
+       case PERF_COUNT_SW_CPU_MIGRATIONS:
+               pmu = &perf_ops_generic;
+               break;
+       }
+
+       return pmu;
+}
+
+/*
+ * Allocate and initialize a counter structure
+ */
+static struct perf_counter *
+perf_counter_alloc(struct perf_counter_attr *attr,
+                  int cpu,
+                  struct perf_counter_context *ctx,
+                  struct perf_counter *group_leader,
+                  gfp_t gfpflags)
+{
+       const struct pmu *pmu;
+       struct perf_counter *counter;
+       struct hw_perf_counter *hwc;
+       long err;
+
+       counter = kzalloc(sizeof(*counter), gfpflags);
+       if (!counter)
+               return ERR_PTR(-ENOMEM);
+
+       /*
+        * Single counters are their own group leaders, with an
+        * empty sibling list:
+        */
+       if (!group_leader)
+               group_leader = counter;
+
+       mutex_init(&counter->child_mutex);
+       INIT_LIST_HEAD(&counter->child_list);
+
+       INIT_LIST_HEAD(&counter->list_entry);
+       INIT_LIST_HEAD(&counter->event_entry);
+       INIT_LIST_HEAD(&counter->sibling_list);
+       init_waitqueue_head(&counter->waitq);
+
+       mutex_init(&counter->mmap_mutex);
+
+       counter->cpu            = cpu;
+       counter->attr           = *attr;
+       counter->group_leader   = group_leader;
+       counter->pmu            = NULL;
+       counter->ctx            = ctx;
+       counter->oncpu          = -1;
+
+       counter->ns             = get_pid_ns(current->nsproxy->pid_ns);
+       counter->id             = atomic64_inc_return(&perf_counter_id);
+
+       counter->state          = PERF_COUNTER_STATE_INACTIVE;
+
+       if (attr->disabled)
+               counter->state = PERF_COUNTER_STATE_OFF;
+
+       pmu = NULL;
+
+       hwc = &counter->hw;
+       hwc->sample_period = attr->sample_period;
+       if (attr->freq && attr->sample_freq)
+               hwc->sample_period = 1;
+
+       atomic64_set(&hwc->period_left, hwc->sample_period);
+
+       /*
+        * we currently do not support PERF_SAMPLE_GROUP on inherited counters
+        */
+       if (attr->inherit && (attr->sample_type & PERF_SAMPLE_GROUP))
+               goto done;
+
+       if (attr->type == PERF_TYPE_RAW) {
+               pmu = hw_perf_counter_init(counter);
+               goto done;
+       }
+
+       switch (attr->type) {
+       case PERF_TYPE_HARDWARE:
+       case PERF_TYPE_HW_CACHE:
+               pmu = hw_perf_counter_init(counter);
+               break;
+
+       case PERF_TYPE_SOFTWARE:
+               pmu = sw_perf_counter_init(counter);
+               break;
+
+       case PERF_TYPE_TRACEPOINT:
+               pmu = tp_perf_counter_init(counter);
+               break;
+       }
+done:
+       err = 0;
+       if (!pmu)
+               err = -EINVAL;
+       else if (IS_ERR(pmu))
+               err = PTR_ERR(pmu);
+
+       if (err) {
+               if (counter->ns)
+                       put_pid_ns(counter->ns);
+               kfree(counter);
+               return ERR_PTR(err);
+       }
+
+       counter->pmu = pmu;
+
+       atomic_inc(&nr_counters);
+       if (counter->attr.mmap)
+               atomic_inc(&nr_mmap_counters);
+       if (counter->attr.comm)
+               atomic_inc(&nr_comm_counters);
+
+       return counter;
+}
+
+/**
+ * sys_perf_counter_open - open a performance counter, associate it to a task/cpu
+ *
+ * @attr_uptr: event type attributes for monitoring/sampling
+ * @pid:               target pid
+ * @cpu:               target cpu
+ * @group_fd:          group leader counter fd
+ */
+SYSCALL_DEFINE5(perf_counter_open,
+               const struct perf_counter_attr __user *, attr_uptr,
+               pid_t, pid, int, cpu, int, group_fd, unsigned long, flags)
+{
+       struct perf_counter *counter, *group_leader;
+       struct perf_counter_attr attr;
+       struct perf_counter_context *ctx;
+       struct file *counter_file = NULL;
+       struct file *group_file = NULL;
+       int fput_needed = 0;
+       int fput_needed2 = 0;
+       int ret;
+
+       /* for future expandability... */
+       if (flags)
+               return -EINVAL;
+
+       if (copy_from_user(&attr, attr_uptr, sizeof(attr)) != 0)
+               return -EFAULT;
+
+       if (!attr.exclude_kernel) {
+               if (perf_paranoid_kernel() && !capable(CAP_SYS_ADMIN))
+                       return -EACCES;
+       }
+
+       if (attr.freq) {
+               if (attr.sample_freq > sysctl_perf_counter_sample_rate)
+                       return -EINVAL;
+       }
+
+       /*
+        * Get the target context (task or percpu):
+        */
+       ctx = find_get_context(pid, cpu);
+       if (IS_ERR(ctx))
+               return PTR_ERR(ctx);
+
+       /*
+        * Look up the group leader (we will attach this counter to it):
+        */
+       group_leader = NULL;
+       if (group_fd != -1) {
+               ret = -EINVAL;
+               group_file = fget_light(group_fd, &fput_needed);
+               if (!group_file)
+                       goto err_put_context;
+               if (group_file->f_op != &perf_fops)
+                       goto err_put_context;
+
+               group_leader = group_file->private_data;
+               /*
+                * Do not allow a recursive hierarchy (this new sibling
+                * becoming part of another group-sibling):
+                */
+               if (group_leader->group_leader != group_leader)
+                       goto err_put_context;
+               /*
+                * Do not allow to attach to a group in a different
+                * task or CPU context:
+                */
+               if (group_leader->ctx != ctx)
+                       goto err_put_context;
+               /*
+                * Only a group leader can be exclusive or pinned
+                */
+               if (attr.exclusive || attr.pinned)
+                       goto err_put_context;
+       }
+
+       counter = perf_counter_alloc(&attr, cpu, ctx, group_leader,
+                                    GFP_KERNEL);
+       ret = PTR_ERR(counter);
+       if (IS_ERR(counter))
+               goto err_put_context;
+
+       ret = anon_inode_getfd("[perf_counter]", &perf_fops, counter, 0);
+       if (ret < 0)
+               goto err_free_put_context;
+
+       counter_file = fget_light(ret, &fput_needed2);
+       if (!counter_file)
+               goto err_free_put_context;
+
+       counter->filp = counter_file;
+       WARN_ON_ONCE(ctx->parent_ctx);
+       mutex_lock(&ctx->mutex);
+       perf_install_in_context(ctx, counter, cpu);
+       ++ctx->generation;
+       mutex_unlock(&ctx->mutex);
+
+       counter->owner = current;
+       get_task_struct(current);
+       mutex_lock(&current->perf_counter_mutex);
+       list_add_tail(&counter->owner_entry, &current->perf_counter_list);
+       mutex_unlock(&current->perf_counter_mutex);
+
+       fput_light(counter_file, fput_needed2);
+
+out_fput:
+       fput_light(group_file, fput_needed);
+
+       return ret;
+
+err_free_put_context:
+       kfree(counter);
+
+err_put_context:
+       put_ctx(ctx);
+
+       goto out_fput;
+}
+
+/*
+ * inherit a counter from parent task to child task:
+ */
+static struct perf_counter *
+inherit_counter(struct perf_counter *parent_counter,
+             struct task_struct *parent,
+             struct perf_counter_context *parent_ctx,
+             struct task_struct *child,
+             struct perf_counter *group_leader,
+             struct perf_counter_context *child_ctx)
+{
+       struct perf_counter *child_counter;
+
+       /*
+        * Instead of creating recursive hierarchies of counters,
+        * we link inherited counters back to the original parent,
+        * which has a filp for sure, which we use as the reference
+        * count:
+        */
+       if (parent_counter->parent)
+               parent_counter = parent_counter->parent;
+
+       child_counter = perf_counter_alloc(&parent_counter->attr,
+                                          parent_counter->cpu, child_ctx,
+                                          group_leader, GFP_KERNEL);
+       if (IS_ERR(child_counter))
+               return child_counter;
+       get_ctx(child_ctx);
+
+       /*
+        * Make the child state follow the state of the parent counter,
+        * not its attr.disabled bit.  We hold the parent's mutex,
+        * so we won't race with perf_counter_{en, dis}able_family.
+        */
+       if (parent_counter->state >= PERF_COUNTER_STATE_INACTIVE)
+               child_counter->state = PERF_COUNTER_STATE_INACTIVE;
+       else
+               child_counter->state = PERF_COUNTER_STATE_OFF;
+
+       if (parent_counter->attr.freq)
+               child_counter->hw.sample_period = parent_counter->hw.sample_period;
+
+       /*
+        * Link it up in the child's context:
+        */
+       add_counter_to_ctx(child_counter, child_ctx);
+
+       child_counter->parent = parent_counter;
+       /*
+        * inherit into child's child as well:
+        */
+       child_counter->attr.inherit = 1;
+
+       /*
+        * Get a reference to the parent filp - we will fput it
+        * when the child counter exits. This is safe to do because
+        * we are in the parent and we know that the filp still
+        * exists and has a nonzero count:
+        */
+       atomic_long_inc(&parent_counter->filp->f_count);
+
+       /*
+        * Link this into the parent counter's child list
+        */
+       WARN_ON_ONCE(parent_counter->ctx->parent_ctx);
+       mutex_lock(&parent_counter->child_mutex);
+       list_add_tail(&child_counter->child_list, &parent_counter->child_list);
+       mutex_unlock(&parent_counter->child_mutex);
+
+       return child_counter;
+}
+
+static int inherit_group(struct perf_counter *parent_counter,
+             struct task_struct *parent,
+             struct perf_counter_context *parent_ctx,
+             struct task_struct *child,
+             struct perf_counter_context *child_ctx)
+{
+       struct perf_counter *leader;
+       struct perf_counter *sub;
+       struct perf_counter *child_ctr;
+
+       leader = inherit_counter(parent_counter, parent, parent_ctx,
+                                child, NULL, child_ctx);
+       if (IS_ERR(leader))
+               return PTR_ERR(leader);
+       list_for_each_entry(sub, &parent_counter->sibling_list, list_entry) {
+               child_ctr = inherit_counter(sub, parent, parent_ctx,
+                                           child, leader, child_ctx);
+               if (IS_ERR(child_ctr))
+                       return PTR_ERR(child_ctr);
+       }
+       return 0;
+}
+
+static void sync_child_counter(struct perf_counter *child_counter,
+                              struct perf_counter *parent_counter)
+{
+       u64 child_val;
+
+       child_val = atomic64_read(&child_counter->count);
+
+       /*
+        * Add back the child's count to the parent's count:
+        */
+       atomic64_add(child_val, &parent_counter->count);
+       atomic64_add(child_counter->total_time_enabled,
+                    &parent_counter->child_total_time_enabled);
+       atomic64_add(child_counter->total_time_running,
+                    &parent_counter->child_total_time_running);
+
+       /*
+        * Remove this counter from the parent's list
+        */
+       WARN_ON_ONCE(parent_counter->ctx->parent_ctx);
+       mutex_lock(&parent_counter->child_mutex);
+       list_del_init(&child_counter->child_list);
+       mutex_unlock(&parent_counter->child_mutex);
+
+       /*
+        * Release the parent counter, if this was the last
+        * reference to it.
+        */
+       fput(parent_counter->filp);
+}
+
+static void
+__perf_counter_exit_task(struct perf_counter *child_counter,
+                        struct perf_counter_context *child_ctx)
+{
+       struct perf_counter *parent_counter;
+
+       update_counter_times(child_counter);
+       perf_counter_remove_from_context(child_counter);
+
+       parent_counter = child_counter->parent;
+       /*
+        * It can happen that parent exits first, and has counters
+        * that are still around due to the child reference. These
+        * counters need to be zapped - but otherwise linger.
+        */
+       if (parent_counter) {
+               sync_child_counter(child_counter, parent_counter);
+               free_counter(child_counter);
+       }
+}
+
+/*
+ * When a child task exits, feed back counter values to parent counters.
+ */
+void perf_counter_exit_task(struct task_struct *child)
+{
+       struct perf_counter *child_counter, *tmp;
+       struct perf_counter_context *child_ctx;
+       unsigned long flags;
+
+       if (likely(!child->perf_counter_ctxp))
+               return;
+
+       local_irq_save(flags);
+       /*
+        * We can't reschedule here because interrupts are disabled,
+        * and either child is current or it is a task that can't be
+        * scheduled, so we are now safe from rescheduling changing
+        * our context.
+        */
+       child_ctx = child->perf_counter_ctxp;
+       __perf_counter_task_sched_out(child_ctx);
+
+       /*
+        * Take the context lock here so that if find_get_context is
+        * reading child->perf_counter_ctxp, we wait until it has
+        * incremented the context's refcount before we do put_ctx below.
+        */
+       spin_lock(&child_ctx->lock);
+       child->perf_counter_ctxp = NULL;
+       if (child_ctx->parent_ctx) {
+               /*
+                * This context is a clone; unclone it so it can't get
+                * swapped to another process while we're removing all
+                * the counters from it.
+                */
+               put_ctx(child_ctx->parent_ctx);
+               child_ctx->parent_ctx = NULL;
+       }
+       spin_unlock(&child_ctx->lock);
+       local_irq_restore(flags);
+
+       /*
+        * We can recurse on the same lock type through:
+        *
+        *   __perf_counter_exit_task()
+        *     sync_child_counter()
+        *       fput(parent_counter->filp)
+        *         perf_release()
+        *           mutex_lock(&ctx->mutex)
+        *
+        * But since its the parent context it won't be the same instance.
+        */
+       mutex_lock_nested(&child_ctx->mutex, SINGLE_DEPTH_NESTING);
+
+again:
+       list_for_each_entry_safe(child_counter, tmp, &child_ctx->counter_list,
+                                list_entry)
+               __perf_counter_exit_task(child_counter, child_ctx);
+
+       /*
+        * If the last counter was a group counter, it will have appended all
+        * its siblings to the list, but we obtained 'tmp' before that which
+        * will still point to the list head terminating the iteration.
+        */
+       if (!list_empty(&child_ctx->counter_list))
+               goto again;
+
+       mutex_unlock(&child_ctx->mutex);
+
+       put_ctx(child_ctx);
+}
+
+/*
+ * free an unexposed, unused context as created by inheritance by
+ * init_task below, used by fork() in case of fail.
+ */
+void perf_counter_free_task(struct task_struct *task)
+{
+       struct perf_counter_context *ctx = task->perf_counter_ctxp;
+       struct perf_counter *counter, *tmp;
+
+       if (!ctx)
+               return;
+
+       mutex_lock(&ctx->mutex);
+again:
+       list_for_each_entry_safe(counter, tmp, &ctx->counter_list, list_entry) {
+               struct perf_counter *parent = counter->parent;
+
+               if (WARN_ON_ONCE(!parent))
+                       continue;
+
+               mutex_lock(&parent->child_mutex);
+               list_del_init(&counter->child_list);
+               mutex_unlock(&parent->child_mutex);
+
+               fput(parent->filp);
+
+               list_del_counter(counter, ctx);
+               free_counter(counter);
+       }
+
+       if (!list_empty(&ctx->counter_list))
+               goto again;
+
+       mutex_unlock(&ctx->mutex);
+
+       put_ctx(ctx);
+}
+
+/*
+ * Initialize the perf_counter context in task_struct
+ */
+int perf_counter_init_task(struct task_struct *child)
+{
+       struct perf_counter_context *child_ctx, *parent_ctx;
+       struct perf_counter_context *cloned_ctx;
+       struct perf_counter *counter;
+       struct task_struct *parent = current;
+       int inherited_all = 1;
+       int ret = 0;
+
+       child->perf_counter_ctxp = NULL;
+
+       mutex_init(&child->perf_counter_mutex);
+       INIT_LIST_HEAD(&child->perf_counter_list);
+
+       if (likely(!parent->perf_counter_ctxp))
+               return 0;
+
+       /*
+        * This is executed from the parent task context, so inherit
+        * counters that have been marked for cloning.
+        * First allocate and initialize a context for the child.
+        */
+
+       child_ctx = kmalloc(sizeof(struct perf_counter_context), GFP_KERNEL);
+       if (!child_ctx)
+               return -ENOMEM;
+
+       __perf_counter_init_context(child_ctx, child);
+       child->perf_counter_ctxp = child_ctx;
+       get_task_struct(child);
+
+       /*
+        * If the parent's context is a clone, pin it so it won't get
+        * swapped under us.
+        */
+       parent_ctx = perf_pin_task_context(parent);
+
+       /*
+        * No need to check if parent_ctx != NULL here; since we saw
+        * it non-NULL earlier, the only reason for it to become NULL
+        * is if we exit, and since we're currently in the middle of
+        * a fork we can't be exiting at the same time.
+        */
+
+       /*
+        * Lock the parent list. No need to lock the child - not PID
+        * hashed yet and not running, so nobody can access it.
+        */
+       mutex_lock(&parent_ctx->mutex);
+
+       /*
+        * We dont have to disable NMIs - we are only looking at
+        * the list, not manipulating it:
+        */
+       list_for_each_entry_rcu(counter, &parent_ctx->event_list, event_entry) {
+               if (counter != counter->group_leader)
+                       continue;
+
+               if (!counter->attr.inherit) {
+                       inherited_all = 0;
+                       continue;
+               }
+
+               ret = inherit_group(counter, parent, parent_ctx,
+                                            child, child_ctx);
+               if (ret) {
+                       inherited_all = 0;
+                       break;
+               }
+       }
+
+       if (inherited_all) {
+               /*
+                * Mark the child context as a clone of the parent
+                * context, or of whatever the parent is a clone of.
+                * Note that if the parent is a clone, it could get
+                * uncloned at any point, but that doesn't matter
+                * because the list of counters and the generation
+                * count can't have changed since we took the mutex.
+                */
+               cloned_ctx = rcu_dereference(parent_ctx->parent_ctx);
+               if (cloned_ctx) {
+                       child_ctx->parent_ctx = cloned_ctx;
+                       child_ctx->parent_gen = parent_ctx->parent_gen;
+               } else {
+                       child_ctx->parent_ctx = parent_ctx;
+                       child_ctx->parent_gen = parent_ctx->generation;
+               }
+               get_ctx(child_ctx->parent_ctx);
+       }
+
+       mutex_unlock(&parent_ctx->mutex);
+
+       perf_unpin_context(parent_ctx);
+
+       return ret;
+}
+
+static void __cpuinit perf_counter_init_cpu(int cpu)
+{
+       struct perf_cpu_context *cpuctx;
+
+       cpuctx = &per_cpu(perf_cpu_context, cpu);
+       __perf_counter_init_context(&cpuctx->ctx, NULL);
+
+       spin_lock(&perf_resource_lock);
+       cpuctx->max_pertask = perf_max_counters - perf_reserved_percpu;
+       spin_unlock(&perf_resource_lock);
+
+       hw_perf_counter_setup(cpu);
+}
+
+#ifdef CONFIG_HOTPLUG_CPU
+static void __perf_counter_exit_cpu(void *info)
+{
+       struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
+       struct perf_counter_context *ctx = &cpuctx->ctx;
+       struct perf_counter *counter, *tmp;
+
+       list_for_each_entry_safe(counter, tmp, &ctx->counter_list, list_entry)
+               __perf_counter_remove_from_context(counter);
+}
+static void perf_counter_exit_cpu(int cpu)
+{
+       struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu);
+       struct perf_counter_context *ctx = &cpuctx->ctx;
+
+       mutex_lock(&ctx->mutex);
+       smp_call_function_single(cpu, __perf_counter_exit_cpu, NULL, 1);
+       mutex_unlock(&ctx->mutex);
+}
+#else
+static inline void perf_counter_exit_cpu(int cpu) { }
+#endif
+
+static int __cpuinit
+perf_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu)
+{
+       unsigned int cpu = (long)hcpu;
+
+       switch (action) {
+
+       case CPU_UP_PREPARE:
+       case CPU_UP_PREPARE_FROZEN:
+               perf_counter_init_cpu(cpu);
+               break;
+
+       case CPU_DOWN_PREPARE:
+       case CPU_DOWN_PREPARE_FROZEN:
+               perf_counter_exit_cpu(cpu);
+               break;
+
+       default:
+               break;
+       }
+
+       return NOTIFY_OK;
+}
+
+/*
+ * This has to have a higher priority than migration_notifier in sched.c.
+ */
+static struct notifier_block __cpuinitdata perf_cpu_nb = {
+       .notifier_call          = perf_cpu_notify,
+       .priority               = 20,
+};
+
+void __init perf_counter_init(void)
+{
+       perf_cpu_notify(&perf_cpu_nb, (unsigned long)CPU_UP_PREPARE,
+                       (void *)(long)smp_processor_id());
+       register_cpu_notifier(&perf_cpu_nb);
+}
+
+static ssize_t perf_show_reserve_percpu(struct sysdev_class *class, char *buf)
+{
+       return sprintf(buf, "%d\n", perf_reserved_percpu);
+}
+
+static ssize_t
+perf_set_reserve_percpu(struct sysdev_class *class,
+                       const char *buf,
+                       size_t count)
+{
+       struct perf_cpu_context *cpuctx;
+       unsigned long val;
+       int err, cpu, mpt;
+
+       err = strict_strtoul(buf, 10, &val);
+       if (err)
+               return err;
+       if (val > perf_max_counters)
+               return -EINVAL;
+
+       spin_lock(&perf_resource_lock);
+       perf_reserved_percpu = val;
+       for_each_online_cpu(cpu) {
+               cpuctx = &per_cpu(perf_cpu_context, cpu);
+               spin_lock_irq(&cpuctx->ctx.lock);
+               mpt = min(perf_max_counters - cpuctx->ctx.nr_counters,
+                         perf_max_counters - perf_reserved_percpu);
+               cpuctx->max_pertask = mpt;
+               spin_unlock_irq(&cpuctx->ctx.lock);
+       }
+       spin_unlock(&perf_resource_lock);
+
+       return count;
+}
+
+static ssize_t perf_show_overcommit(struct sysdev_class *class, char *buf)
+{
+       return sprintf(buf, "%d\n", perf_overcommit);
+}
+
+static ssize_t
+perf_set_overcommit(struct sysdev_class *class, const char *buf, size_t count)
+{
+       unsigned long val;
+       int err;
+
+       err = strict_strtoul(buf, 10, &val);
+       if (err)
+               return err;
+       if (val > 1)
+               return -EINVAL;
+
+       spin_lock(&perf_resource_lock);
+       perf_overcommit = val;
+       spin_unlock(&perf_resource_lock);
+
+       return count;
+}
+
+static SYSDEV_CLASS_ATTR(
+                               reserve_percpu,
+                               0644,
+                               perf_show_reserve_percpu,
+                               perf_set_reserve_percpu
+                       );
+
+static SYSDEV_CLASS_ATTR(
+                               overcommit,
+                               0644,
+                               perf_show_overcommit,
+                               perf_set_overcommit
+                       );
+
+static struct attribute *perfclass_attrs[] = {
+       &attr_reserve_percpu.attr,
+       &attr_overcommit.attr,
+       NULL
+};
+
+static struct attribute_group perfclass_attr_group = {
+       .attrs                  = perfclass_attrs,
+       .name                   = "perf_counters",
+};
+
+static int __init perf_counter_sysfs_init(void)
+{
+       return sysfs_create_group(&cpu_sysdev_class.kset.kobj,
+                                 &perfclass_attr_group);
+}
+device_initcall(perf_counter_sysfs_init);
diff --git a/kernel/ptrace.c b/kernel/ptrace.c

index 42c317874cfa8d2703c13f3c44757a6f397459ac..2442d140bd9a3eb3a6d9dcef46c406befb736115 100644 (file)
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -24,16 +24,6 @@
  #include <linux/uaccess.h>
  
  
-/*
- * Initialize a new task whose father had been ptraced.
- *
- * Called from copy_process().
- */
-void ptrace_fork(struct task_struct *child, unsigned long clone_flags)
-{
-       arch_ptrace_fork(child, clone_flags);
-}
-
  /*
   * ptrace a task: make the debugger its new parent and
   * move it to the ptrace list.
diff --git a/kernel/rcupreempt.c b/kernel/rcupreempt.c

index ce97a4df64d3539edc785ff9db119c3ba3edbe07..beb0e659adcc60be60bcc1efd2c6e9ca45367afe 100644 (file)
--- a/kernel/rcupreempt.c
+++ b/kernel/rcupreempt.c
@@ -1356,17 +1356,11 @@ static int rcu_sched_grace_period(void *arg)
  
                 rcu_ctrlblk.sched_sleep = rcu_sched_sleeping;
                 spin_unlock_irqrestore(&rcu_ctrlblk.schedlock, flags);
-               ret = 0;
+               ret = 0; /* unused */
                 __wait_event_interruptible(rcu_ctrlblk.sched_wq,
                         rcu_ctrlblk.sched_sleep != rcu_sched_sleeping,
                         ret);
  
-               /*
-                * Signals would prevent us from sleeping, and we cannot
-                * do much with them in any case.  So flush them.
-                */
-               if (ret)
-                       flush_signals(current);
                 couldsleepnext = 0;
  
         } while (!kthread_should_stop());
diff --git a/kernel/rcutree.c b/kernel/rcutree.c

index d2a372fb0b9b511cfe9ebdc1f5431f2ec5faafcd..0dccfbba6d267ad59b62314d665a3a91b3104252 100644 (file)
--- a/kernel/rcutree.c
+++ b/kernel/rcutree.c
@@ -1259,31 +1259,44 @@ static int __rcu_pending(struct rcu_state *rsp, struct rcu_data *rdp)
         check_cpu_stall(rsp, rdp);
  
         /* Is the RCU core waiting for a quiescent state from this CPU? */
-       if (rdp->qs_pending)
+       if (rdp->qs_pending) {
+               rdp->n_rp_qs_pending++;
                 return 1;
+       }
  
         /* Does this CPU have callbacks ready to invoke? */
-       if (cpu_has_callbacks_ready_to_invoke(rdp))
+       if (cpu_has_callbacks_ready_to_invoke(rdp)) {
+               rdp->n_rp_cb_ready++;
                 return 1;
+       }
  
         /* Has RCU gone idle with this CPU needing another grace period? */
-       if (cpu_needs_another_gp(rsp, rdp))
+       if (cpu_needs_another_gp(rsp, rdp)) {
+               rdp->n_rp_cpu_needs_gp++;
                 return 1;
+       }
  
         /* Has another RCU grace period completed?  */
-       if (ACCESS_ONCE(rsp->completed) != rdp->completed) /* outside of lock */
+       if (ACCESS_ONCE(rsp->completed) != rdp->completed) { /* outside lock */
+               rdp->n_rp_gp_completed++;
                 return 1;
+       }
  
         /* Has a new RCU grace period started? */
-       if (ACCESS_ONCE(rsp->gpnum) != rdp->gpnum) /* outside of lock */
+       if (ACCESS_ONCE(rsp->gpnum) != rdp->gpnum) { /* outside lock */
+               rdp->n_rp_gp_started++;
                 return 1;
+       }
  
         /* Has an RCU GP gone long enough to send resched IPIs &c? */
         if (ACCESS_ONCE(rsp->completed) != ACCESS_ONCE(rsp->gpnum) &&
-           ((long)(ACCESS_ONCE(rsp->jiffies_force_qs) - jiffies) < 0))
+           ((long)(ACCESS_ONCE(rsp->jiffies_force_qs) - jiffies) < 0)) {
+               rdp->n_rp_need_fqs++;
                 return 1;
+       }
  
         /* nothing to do */
+       rdp->n_rp_need_nothing++;
         return 0;
  }
  
diff --git a/kernel/rcutree_trace.c b/kernel/rcutree_trace.c

index 4b1875ba94044216250d938713ddf76ac2280f1c..fe1dcdbf1ca340558191a02ee408fea17e04fadd 100644 (file)
--- a/kernel/rcutree_trace.c
+++ b/kernel/rcutree_trace.c
@@ -213,7 +213,63 @@ static struct file_operations rcugp_fops = {
         .release = single_release,
  };
  
-static struct dentry *rcudir, *datadir, *datadir_csv, *hierdir, *gpdir;
+static void print_one_rcu_pending(struct seq_file *m, struct rcu_data *rdp)
+{
+       seq_printf(m, "%3d%cnp=%ld "
+                  "qsp=%ld cbr=%ld cng=%ld gpc=%ld gps=%ld nf=%ld nn=%ld\n",
+                  rdp->cpu,
+                  cpu_is_offline(rdp->cpu) ? '!' : ' ',
+                  rdp->n_rcu_pending,
+                  rdp->n_rp_qs_pending,
+                  rdp->n_rp_cb_ready,
+                  rdp->n_rp_cpu_needs_gp,
+                  rdp->n_rp_gp_completed,
+                  rdp->n_rp_gp_started,
+                  rdp->n_rp_need_fqs,
+                  rdp->n_rp_need_nothing);
+}
+
+static void print_rcu_pendings(struct seq_file *m, struct rcu_state *rsp)
+{
+       int cpu;
+       struct rcu_data *rdp;
+
+       for_each_possible_cpu(cpu) {
+               rdp = rsp->rda[cpu];
+               if (rdp->beenonline)
+                       print_one_rcu_pending(m, rdp);
+       }
+}
+
+static int show_rcu_pending(struct seq_file *m, void *unused)
+{
+       seq_puts(m, "rcu:\n");
+       print_rcu_pendings(m, &rcu_state);
+       seq_puts(m, "rcu_bh:\n");
+       print_rcu_pendings(m, &rcu_bh_state);
+       return 0;
+}
+
+static int rcu_pending_open(struct inode *inode, struct file *file)
+{
+       return single_open(file, show_rcu_pending, NULL);
+}
+
+static struct file_operations rcu_pending_fops = {
+       .owner = THIS_MODULE,
+       .open = rcu_pending_open,
+       .read = seq_read,
+       .llseek = seq_lseek,
+       .release = single_release,
+};
+
+static struct dentry *rcudir;
+static struct dentry *datadir;
+static struct dentry *datadir_csv;
+static struct dentry *gpdir;
+static struct dentry *hierdir;
+static struct dentry *rcu_pendingdir;
+
  static int __init rcuclassic_trace_init(void)
  {
         rcudir = debugfs_create_dir("rcu", NULL);
@@ -238,6 +294,11 @@ static int __init rcuclassic_trace_init(void)
                                                 NULL, &rcuhier_fops);
         if (!hierdir)
                 goto free_out;
+
+       rcu_pendingdir = debugfs_create_file("rcu_pending", 0444, rcudir,
+                                               NULL, &rcu_pending_fops);
+       if (!rcu_pendingdir)
+               goto free_out;
         return 0;
  free_out:
         if (datadir)
@@ -257,6 +318,7 @@ static void __exit rcuclassic_trace_cleanup(void)
         debugfs_remove(datadir_csv);
         debugfs_remove(gpdir);
         debugfs_remove(hierdir);
+       debugfs_remove(rcu_pendingdir);
         debugfs_remove(rcudir);
  }
  
diff --git a/kernel/rtmutex.c b/kernel/rtmutex.c

index 69d9cb921ffa657ef6939be6dcfb053570fb019f..820c5af44f3ec6472db64bf32daeccba458ab7d7 100644 (file)
--- a/kernel/rtmutex.c
+++ b/kernel/rtmutex.c
@@ -300,7 +300,8 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task,
   * assigned pending owner [which might not have taken the
   * lock yet]:
   */
-static inline int try_to_steal_lock(struct rt_mutex *lock)
+static inline int try_to_steal_lock(struct rt_mutex *lock,
+                                   struct task_struct *task)
  {
         struct task_struct *pendowner = rt_mutex_owner(lock);
         struct rt_mutex_waiter *next;
@@ -309,11 +310,11 @@ static inline int try_to_steal_lock(struct rt_mutex *lock)
         if (!rt_mutex_owner_pending(lock))
                 return 0;
  
-       if (pendowner == current)
+       if (pendowner == task)
                 return 1;
  
         spin_lock_irqsave(&pendowner->pi_lock, flags);
-       if (current->prio >= pendowner->prio) {
+       if (task->prio >= pendowner->prio) {
                 spin_unlock_irqrestore(&pendowner->pi_lock, flags);
                 return 0;
         }
@@ -338,21 +339,21 @@ static inline int try_to_steal_lock(struct rt_mutex *lock)
          * We are going to steal the lock and a waiter was
          * enqueued on the pending owners pi_waiters queue. So
          * we have to enqueue this waiter into
-        * current->pi_waiters list. This covers the case,
-        * where current is boosted because it holds another
+        * task->pi_waiters list. This covers the case,
+        * where task is boosted because it holds another
          * lock and gets unboosted because the booster is
          * interrupted, so we would delay a waiter with higher
-        * priority as current->normal_prio.
+        * priority as task->normal_prio.
          *
          * Note: in the rare case of a SCHED_OTHER task changing
          * its priority and thus stealing the lock, next->task
-        * might be current:
+        * might be task:
          */
-       if (likely(next->task != current)) {
-               spin_lock_irqsave(&current->pi_lock, flags);
-               plist_add(&next->pi_list_entry, &current->pi_waiters);
-               __rt_mutex_adjust_prio(current);
-               spin_unlock_irqrestore(&current->pi_lock, flags);
+       if (likely(next->task != task)) {
+               spin_lock_irqsave(&task->pi_lock, flags);
+               plist_add(&next->pi_list_entry, &task->pi_waiters);
+               __rt_mutex_adjust_prio(task);
+               spin_unlock_irqrestore(&task->pi_lock, flags);
         }
         return 1;
  }
@@ -389,7 +390,7 @@ static int try_to_take_rt_mutex(struct rt_mutex *lock)
          */
         mark_rt_mutex_waiters(lock);
  
-       if (rt_mutex_owner(lock) && !try_to_steal_lock(lock))
+       if (rt_mutex_owner(lock) && !try_to_steal_lock(lock, current))
                 return 0;
  
         /* We got the lock. */
@@ -411,6 +412,7 @@ static int try_to_take_rt_mutex(struct rt_mutex *lock)
   */
  static int task_blocks_on_rt_mutex(struct rt_mutex *lock,
                                    struct rt_mutex_waiter *waiter,
+                                  struct task_struct *task,
                                    int detect_deadlock)
  {
         struct task_struct *owner = rt_mutex_owner(lock);
@@ -418,21 +420,21 @@ static int task_blocks_on_rt_mutex(struct rt_mutex *lock,
         unsigned long flags;
         int chain_walk = 0, res;
  
-       spin_lock_irqsave(&current->pi_lock, flags);
-       __rt_mutex_adjust_prio(current);
-       waiter->task = current;
+       spin_lock_irqsave(&task->pi_lock, flags);
+       __rt_mutex_adjust_prio(task);
+       waiter->task = task;
         waiter->lock = lock;
-       plist_node_init(&waiter->list_entry, current->prio);
-       plist_node_init(&waiter->pi_list_entry, current->prio);
+       plist_node_init(&waiter->list_entry, task->prio);
+       plist_node_init(&waiter->pi_list_entry, task->prio);
  
         /* Get the top priority waiter on the lock */
         if (rt_mutex_has_waiters(lock))
                 top_waiter = rt_mutex_top_waiter(lock);
         plist_add(&waiter->list_entry, &lock->wait_list);
  
-       current->pi_blocked_on = waiter;
+       task->pi_blocked_on = waiter;
  
-       spin_unlock_irqrestore(&current->pi_lock, flags);
+       spin_unlock_irqrestore(&task->pi_lock, flags);
  
         if (waiter == rt_mutex_top_waiter(lock)) {
                 spin_lock_irqsave(&owner->pi_lock, flags);
@@ -460,7 +462,7 @@ static int task_blocks_on_rt_mutex(struct rt_mutex *lock,
         spin_unlock(&lock->wait_lock);
  
         res = rt_mutex_adjust_prio_chain(owner, detect_deadlock, lock, waiter,
-                                        current);
+                                        task);
  
         spin_lock(&lock->wait_lock);
  
@@ -605,37 +607,25 @@ void rt_mutex_adjust_pi(struct task_struct *task)
         rt_mutex_adjust_prio_chain(task, 0, NULL, NULL, task);
  }
  
-/*
- * Slow path lock function:
+/**
+ * __rt_mutex_slowlock() - Perform the wait-wake-try-to-take loop
+ * @lock:               the rt_mutex to take
+ * @state:              the state the task should block in (TASK_INTERRUPTIBLE
+ *                      or TASK_UNINTERRUPTIBLE)
+ * @timeout:            the pre-initialized and started timer, or NULL for none
+ * @waiter:             the pre-initialized rt_mutex_waiter
+ * @detect_deadlock:    passed to task_blocks_on_rt_mutex
+ *
+ * lock->wait_lock must be held by the caller.
   */
  static int __sched
-rt_mutex_slowlock(struct rt_mutex *lock, int state,
-                 struct hrtimer_sleeper *timeout,
-                 int detect_deadlock)
+__rt_mutex_slowlock(struct rt_mutex *lock, int state,
+                   struct hrtimer_sleeper *timeout,
+                   struct rt_mutex_waiter *waiter,
+                   int detect_deadlock)
  {
-       struct rt_mutex_waiter waiter;
         int ret = 0;
  
-       debug_rt_mutex_init_waiter(&waiter);
-       waiter.task = NULL;
-
-       spin_lock(&lock->wait_lock);
-
-       /* Try to acquire the lock again: */
-       if (try_to_take_rt_mutex(lock)) {
-               spin_unlock(&lock->wait_lock);
-               return 0;
-       }
-
-       set_current_state(state);
-
-       /* Setup the timer, when timeout != NULL */
-       if (unlikely(timeout)) {
-               hrtimer_start_expires(&timeout->timer, HRTIMER_MODE_ABS);
-               if (!hrtimer_active(&timeout->timer))
-                       timeout->task = NULL;
-       }
-
         for (;;) {
                 /* Try to acquire the lock: */
                 if (try_to_take_rt_mutex(lock))
@@ -656,19 +646,19 @@ rt_mutex_slowlock(struct rt_mutex *lock, int state,
                 }
  
                 /*
-                * waiter.task is NULL the first time we come here and
+                * waiter->task is NULL the first time we come here and
                  * when we have been woken up by the previous owner
                  * but the lock got stolen by a higher prio task.
                  */
-               if (!waiter.task) {
-                       ret = task_blocks_on_rt_mutex(lock, &waiter,
+               if (!waiter->task) {
+                       ret = task_blocks_on_rt_mutex(lock, waiter, current,
                                                       detect_deadlock);
                         /*
                          * If we got woken up by the owner then start loop
                          * all over without going into schedule to try
                          * to get the lock now:
                          */
-                       if (unlikely(!waiter.task)) {
+                       if (unlikely(!waiter->task)) {
                                 /*
                                  * Reset the return value. We might
                                  * have returned with -EDEADLK and the
@@ -684,15 +674,52 @@ rt_mutex_slowlock(struct rt_mutex *lock, int state,
  
                 spin_unlock(&lock->wait_lock);
  
-               debug_rt_mutex_print_deadlock(&waiter);
+               debug_rt_mutex_print_deadlock(waiter);
  
-               if (waiter.task)
+               if (waiter->task)
                         schedule_rt_mutex(lock);
  
                 spin_lock(&lock->wait_lock);
                 set_current_state(state);
         }
  
+       return ret;
+}
+
+/*
+ * Slow path lock function:
+ */
+static int __sched
+rt_mutex_slowlock(struct rt_mutex *lock, int state,
+                 struct hrtimer_sleeper *timeout,
+                 int detect_deadlock)
+{
+       struct rt_mutex_waiter waiter;
+       int ret = 0;
+
+       debug_rt_mutex_init_waiter(&waiter);
+       waiter.task = NULL;
+
+       spin_lock(&lock->wait_lock);
+
+       /* Try to acquire the lock again: */
+       if (try_to_take_rt_mutex(lock)) {
+               spin_unlock(&lock->wait_lock);
+               return 0;
+       }
+
+       set_current_state(state);
+
+       /* Setup the timer, when timeout != NULL */
+       if (unlikely(timeout)) {
+               hrtimer_start_expires(&timeout->timer, HRTIMER_MODE_ABS);
+               if (!hrtimer_active(&timeout->timer))
+                       timeout->task = NULL;
+       }
+
+       ret = __rt_mutex_slowlock(lock, state, timeout, &waiter,
+                                 detect_deadlock);
+
         set_current_state(TASK_RUNNING);
  
         if (unlikely(waiter.task))
@@ -864,9 +891,9 @@ int __sched rt_mutex_lock_interruptible(struct rt_mutex *lock,
  EXPORT_SYMBOL_GPL(rt_mutex_lock_interruptible);
  
  /**
- * rt_mutex_lock_interruptible_ktime - lock a rt_mutex interruptible
- *                                    the timeout structure is provided
- *                                    by the caller
+ * rt_mutex_timed_lock - lock a rt_mutex interruptible
+ *                     the timeout structure is provided
+ *                     by the caller
   *
   * @lock:              the rt_mutex to be locked
   * @timeout:           timeout structure or NULL (no timeout)
@@ -913,7 +940,7 @@ void __sched rt_mutex_unlock(struct rt_mutex *lock)
  }
  EXPORT_SYMBOL_GPL(rt_mutex_unlock);
  
-/***
+/**
   * rt_mutex_destroy - mark a mutex unusable
   * @lock: the mutex to be destroyed
   *
@@ -985,6 +1012,59 @@ void rt_mutex_proxy_unlock(struct rt_mutex *lock,
         rt_mutex_deadlock_account_unlock(proxy_owner);
  }
  
+/**
+ * rt_mutex_start_proxy_lock() - Start lock acquisition for another task
+ * @lock:              the rt_mutex to take
+ * @waiter:            the pre-initialized rt_mutex_waiter
+ * @task:              the task to prepare
+ * @detect_deadlock:   perform deadlock detection (1) or not (0)
+ *
+ * Returns:
+ *  0 - task blocked on lock
+ *  1 - acquired the lock for task, caller should wake it up
+ * <0 - error
+ *
+ * Special API call for FUTEX_REQUEUE_PI support.
+ */
+int rt_mutex_start_proxy_lock(struct rt_mutex *lock,
+                             struct rt_mutex_waiter *waiter,
+                             struct task_struct *task, int detect_deadlock)
+{
+       int ret;
+
+       spin_lock(&lock->wait_lock);
+
+       mark_rt_mutex_waiters(lock);
+
+       if (!rt_mutex_owner(lock) || try_to_steal_lock(lock, task)) {
+               /* We got the lock for task. */
+               debug_rt_mutex_lock(lock);
+
+               rt_mutex_set_owner(lock, task, 0);
+
+               rt_mutex_deadlock_account_lock(lock, task);
+               return 1;
+       }
+
+       ret = task_blocks_on_rt_mutex(lock, waiter, task, detect_deadlock);
+
+
+       if (ret && !waiter->task) {
+               /*
+                * Reset the return value. We might have
+                * returned with -EDEADLK and the owner
+                * released the lock while we were walking the
+                * pi chain.  Let the waiter sort it out.
+                */
+               ret = 0;
+       }
+       spin_unlock(&lock->wait_lock);
+
+       debug_rt_mutex_print_deadlock(waiter);
+
+       return ret;
+}
+
  /**
   * rt_mutex_next_owner - return the next owner of the lock
   *
@@ -1004,3 +1084,57 @@ struct task_struct *rt_mutex_next_owner(struct rt_mutex *lock)
  
         return rt_mutex_top_waiter(lock)->task;
  }
+
+/**
+ * rt_mutex_finish_proxy_lock() - Complete lock acquisition
+ * @lock:              the rt_mutex we were woken on
+ * @to:                        the timeout, null if none. hrtimer should already have
+ *                     been started.
+ * @waiter:            the pre-initialized rt_mutex_waiter
+ * @detect_deadlock:   perform deadlock detection (1) or not (0)
+ *
+ * Complete the lock acquisition started our behalf by another thread.
+ *
+ * Returns:
+ *  0 - success
+ * <0 - error, one of -EINTR, -ETIMEDOUT, or -EDEADLK
+ *
+ * Special API call for PI-futex requeue support
+ */
+int rt_mutex_finish_proxy_lock(struct rt_mutex *lock,
+                              struct hrtimer_sleeper *to,
+                              struct rt_mutex_waiter *waiter,
+                              int detect_deadlock)
+{
+       int ret;
+
+       spin_lock(&lock->wait_lock);
+
+       set_current_state(TASK_INTERRUPTIBLE);
+
+       ret = __rt_mutex_slowlock(lock, TASK_INTERRUPTIBLE, to, waiter,
+                                 detect_deadlock);
+
+       set_current_state(TASK_RUNNING);
+
+       if (unlikely(waiter->task))
+               remove_waiter(lock, waiter);
+
+       /*
+        * try_to_take_rt_mutex() sets the waiter bit unconditionally. We might
+        * have to fix that up.
+        */
+       fixup_rt_mutex_waiters(lock);
+
+       spin_unlock(&lock->wait_lock);
+
+       /*
+        * Readjust priority, when we did not get the lock. We might have been
+        * the pending owner and boosted. Since we did not take the lock, the
+        * PI boost has to go.
+        */
+       if (unlikely(ret))
+               rt_mutex_adjust_prio(current);
+
+       return ret;
+}
diff --git a/kernel/rtmutex_common.h b/kernel/rtmutex_common.h

index e124bf5800ea140ceeb936c7cba6429fea0e49b5..97a2f81866afdb6507607c4a30348f3f74564e0e 100644 (file)
--- a/kernel/rtmutex_common.h
+++ b/kernel/rtmutex_common.h
@@ -120,6 +120,14 @@ extern void rt_mutex_init_proxy_locked(struct rt_mutex *lock,
                                        struct task_struct *proxy_owner);
  extern void rt_mutex_proxy_unlock(struct rt_mutex *lock,
                                   struct task_struct *proxy_owner);
+extern int rt_mutex_start_proxy_lock(struct rt_mutex *lock,
+                                    struct rt_mutex_waiter *waiter,
+                                    struct task_struct *task,
+                                    int detect_deadlock);
+extern int rt_mutex_finish_proxy_lock(struct rt_mutex *lock,
+                                     struct hrtimer_sleeper *to,
+                                     struct rt_mutex_waiter *waiter,
+                                     int detect_deadlock);
  
  #ifdef CONFIG_DEBUG_RT_MUTEXES
  # include "rtmutex-debug.h"
diff --git a/kernel/sched.c b/kernel/sched.c

index 26efa475bdc143f6e4459067c18ce57e71608764..5b3f6ec1b0b32db29ec326d2bf97021baad963fc 100644 (file)
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -39,6 +39,7 @@
  #include <linux/completion.h>
  #include <linux/kernel_stat.h>
  #include <linux/debug_locks.h>
+#include <linux/perf_counter.h>
  #include <linux/security.h>
  #include <linux/notifier.h>
  #include <linux/profile.h>
@@ -72,13 +73,15 @@
  #include <linux/debugfs.h>
  #include <linux/ctype.h>
  #include <linux/ftrace.h>
-#include <trace/sched.h>
  
  #include <asm/tlb.h>
  #include <asm/irq_regs.h>
  
  #include "sched_cpupri.h"
  
+#define CREATE_TRACE_POINTS
+#include <trace/events/sched.h>
+
  /*
   * Convert user-nice values [ -20 ... 0 ... 19 ]
   * to static priority [ MAX_RT_PRIO..MAX_PRIO-1 ],
@@ -118,12 +121,6 @@
   */
  #define RUNTIME_INF    ((u64)~0ULL)
  
-DEFINE_TRACE(sched_wait_task);
-DEFINE_TRACE(sched_wakeup);
-DEFINE_TRACE(sched_wakeup_new);
-DEFINE_TRACE(sched_switch);
-DEFINE_TRACE(sched_migrate_task);
-
  #ifdef CONFIG_SMP
  
  static void double_rq_lock(struct rq *rq1, struct rq *rq2);
@@ -584,6 +581,7 @@ struct rq {
         struct load_weight load;
         unsigned long nr_load_updates;
         u64 nr_switches;
+       u64 nr_migrations_in;
  
         struct cfs_rq cfs;
         struct rt_rq rt;
@@ -630,6 +628,10 @@ struct rq {
         struct list_head migration_queue;
  #endif
  
+       /* calc_load related fields */
+       unsigned long calc_load_update;
+       long calc_load_active;
+
  #ifdef CONFIG_SCHED_HRTICK
  #ifdef CONFIG_SMP
         int hrtick_csd_pending;
@@ -692,7 +694,7 @@ static inline int cpu_of(struct rq *rq)
  #define task_rq(p)             cpu_rq(task_cpu(p))
  #define cpu_curr(cpu)          (cpu_rq(cpu)->curr)
  
-static inline void update_rq_clock(struct rq *rq)
+inline void update_rq_clock(struct rq *rq)
  {
         rq->clock = sched_clock_cpu(cpu_of(rq));
  }
@@ -1728,6 +1730,8 @@ static void cfs_rq_set_shares(struct cfs_rq *cfs_rq, unsigned long shares)
  }
  #endif
  
+static void calc_load_account_active(struct rq *this_rq);
+
  #include "sched_stats.h"
  #include "sched_idletask.c"
  #include "sched_fair.c"
@@ -1958,7 +1962,7 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
  
         clock_offset = old_rq->clock - new_rq->clock;
  
-       trace_sched_migrate_task(p, task_cpu(p), new_cpu);
+       trace_sched_migrate_task(p, new_cpu);
  
  #ifdef CONFIG_SCHEDSTATS
         if (p->se.wait_start)
@@ -1967,12 +1971,16 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
                 p->se.sleep_start -= clock_offset;
         if (p->se.block_start)
                 p->se.block_start -= clock_offset;
+#endif
         if (old_cpu != new_cpu) {
-               schedstat_inc(p, se.nr_migrations);
+               p->se.nr_migrations++;
+               new_rq->nr_migrations_in++;
+#ifdef CONFIG_SCHEDSTATS
                 if (task_hot(p, old_rq->clock, NULL))
                         schedstat_inc(p, se.nr_forced2_migrations);
-       }
  #endif
+               perf_counter_task_migration(p, new_cpu);
+       }
         p->se.vruntime -= old_cfsrq->min_vruntime -
                                          new_cfsrq->min_vruntime;
  
@@ -2014,6 +2022,49 @@ migrate_task(struct task_struct *p, int dest_cpu, struct migration_req *req)
         return 1;
  }
  
+/*
+ * wait_task_context_switch -  wait for a thread to complete at least one
+ *                             context switch.
+ *
+ * @p must not be current.
+ */
+void wait_task_context_switch(struct task_struct *p)
+{
+       unsigned long nvcsw, nivcsw, flags;
+       int running;
+       struct rq *rq;
+
+       nvcsw   = p->nvcsw;
+       nivcsw  = p->nivcsw;
+       for (;;) {
+               /*
+                * The runqueue is assigned before the actual context
+                * switch. We need to take the runqueue lock.
+                *
+                * We could check initially without the lock but it is
+                * very likely that we need to take the lock in every
+                * iteration.
+                */
+               rq = task_rq_lock(p, &flags);
+               running = task_running(rq, p);
+               task_rq_unlock(rq, &flags);
+
+               if (likely(!running))
+                       break;
+               /*
+                * The switch count is incremented before the actual
+                * context switch. We thus wait for two switches to be
+                * sure at least one completed.
+                */
+               if ((p->nvcsw - nvcsw) > 1)
+                       break;
+               if ((p->nivcsw - nivcsw) > 1)
+                       break;
+
+               cpu_relax();
+       }
+}
+
  /*
   * wait_task_inactive - wait for a thread to unschedule.
   *
@@ -2324,6 +2375,27 @@ static int sched_balance_self(int cpu, int flag)
  
  #endif /* CONFIG_SMP */
  
+/**
+ * task_oncpu_function_call - call a function on the cpu on which a task runs
+ * @p:         the task to evaluate
+ * @func:      the function to be called
+ * @info:      the function call argument
+ *
+ * Calls the function @func when the task is currently running. This might
+ * be on the current CPU, which just calls the function directly
+ */
+void task_oncpu_function_call(struct task_struct *p,
+                             void (*func) (void *info), void *info)
+{
+       int cpu;
+
+       preempt_disable();
+       cpu = task_cpu(p);
+       if (task_curr(p))
+               smp_call_function_single(cpu, func, info, 1);
+       preempt_enable();
+}
+
  /***
   * try_to_wake_up - wake up a thread
   * @p: the to-be-woken-up thread
@@ -2458,6 +2530,17 @@ out:
         return success;
  }
  
+/**
+ * wake_up_process - Wake up a specific process
+ * @p: The process to be woken up.
+ *
+ * Attempt to wake up the nominated process and move it to the set of runnable
+ * processes.  Returns 1 if the process was woken up, 0 if it was already
+ * running.
+ *
+ * It may be assumed that this function implies a write memory barrier before
+ * changing the task state if and only if any tasks are woken up.
+ */
  int wake_up_process(struct task_struct *p)
  {
         return try_to_wake_up(p, TASK_ALL, 0);
@@ -2480,6 +2563,7 @@ static void __sched_fork(struct task_struct *p)
         p->se.exec_start                = 0;
         p->se.sum_exec_runtime          = 0;
         p->se.prev_sum_exec_runtime     = 0;
+       p->se.nr_migrations             = 0;
         p->se.last_wakeup               = 0;
         p->se.avg_overlap               = 0;
         p->se.start_runtime             = 0;
@@ -2710,6 +2794,7 @@ static void finish_task_switch(struct rq *rq, struct task_struct *prev)
          */
         prev_state = prev->state;
         finish_arch_switch(prev);
+       perf_counter_task_sched_in(current, cpu_of(rq));
         finish_lock_switch(rq, prev);
  #ifdef CONFIG_SMP
         if (post_schedule)
@@ -2766,7 +2851,7 @@ context_switch(struct rq *rq, struct task_struct *prev,
          * combine the page table reload and the switch backend into
          * one hypercall.
          */
-       arch_enter_lazy_cpu_mode();
+       arch_start_context_switch(prev);
  
         if (unlikely(!mm)) {
                 next->active_mm = oldmm;
@@ -2856,19 +2941,81 @@ unsigned long nr_iowait(void)
         return sum;
  }
  
-unsigned long nr_active(void)
+/* Variables and functions for calc_load */
+static atomic_long_t calc_load_tasks;
+static unsigned long calc_load_update;
+unsigned long avenrun[3];
+EXPORT_SYMBOL(avenrun);
+
+/**
+ * get_avenrun - get the load average array
+ * @loads:     pointer to dest load array
+ * @offset:    offset to add
+ * @shift:     shift count to shift the result left
+ *
+ * These values are estimates at best, so no need for locking.
+ */
+void get_avenrun(unsigned long *loads, unsigned long offset, int shift)
  {
-       unsigned long i, running = 0, uninterruptible = 0;
+       loads[0] = (avenrun[0] + offset) << shift;
+       loads[1] = (avenrun[1] + offset) << shift;
+       loads[2] = (avenrun[2] + offset) << shift;
+}
  
-       for_each_online_cpu(i) {
-               running += cpu_rq(i)->nr_running;
-               uninterruptible += cpu_rq(i)->nr_uninterruptible;
-       }
+static unsigned long
+calc_load(unsigned long load, unsigned long exp, unsigned long active)
+{
+       load *= exp;
+       load += active * (FIXED_1 - exp);
+       return load >> FSHIFT;
+}
  
-       if (unlikely((long)uninterruptible < 0))
-               uninterruptible = 0;
+/*
+ * calc_load - update the avenrun load estimates 10 ticks after the
+ * CPUs have updated calc_load_tasks.
+ */
+void calc_global_load(void)
+{
+       unsigned long upd = calc_load_update + 10;
+       long active;
  
-       return running + uninterruptible;
+       if (time_before(jiffies, upd))
+               return;
+
+       active = atomic_long_read(&calc_load_tasks);
+       active = active > 0 ? active * FIXED_1 : 0;
+
+       avenrun[0] = calc_load(avenrun[0], EXP_1, active);
+       avenrun[1] = calc_load(avenrun[1], EXP_5, active);
+       avenrun[2] = calc_load(avenrun[2], EXP_15, active);
+
+       calc_load_update += LOAD_FREQ;
+}
+
+/*
+ * Either called from update_cpu_load() or from a cpu going idle
+ */
+static void calc_load_account_active(struct rq *this_rq)
+{
+       long nr_active, delta;
+
+       nr_active = this_rq->nr_running;
+       nr_active += (long) this_rq->nr_uninterruptible;
+
+       if (nr_active != this_rq->calc_load_active) {
+               delta = nr_active - this_rq->calc_load_active;
+               this_rq->calc_load_active = nr_active;
+               atomic_long_add(delta, &calc_load_tasks);
+       }
+}
+
+/*
+ * Externally visible per-cpu scheduler statistics:
+ * cpu_nr_migrations(cpu) - number of migrations into that cpu
+ */
+u64 cpu_nr_migrations(int cpu)
+{
+       return cpu_rq(cpu)->nr_migrations_in;
  }
  
  /*
@@ -2899,6 +3046,11 @@ static void update_cpu_load(struct rq *this_rq)
                         new_load += scale-1;
                 this_rq->cpu_load[i] = (old_load*(scale-1) + new_load) >> i;
         }
+
+       if (time_after_eq(jiffies, this_rq->calc_load_update)) {
+               this_rq->calc_load_update += LOAD_FREQ;
+               calc_load_account_active(this_rq);
+       }
  }
  
  #ifdef CONFIG_SMP
@@ -4240,10 +4392,126 @@ static void active_load_balance(struct rq *busiest_rq, int busiest_cpu)
  static struct {
         atomic_t load_balancer;
         cpumask_var_t cpu_mask;
+       cpumask_var_t ilb_grp_nohz_mask;
  } nohz ____cacheline_aligned = {
         .load_balancer = ATOMIC_INIT(-1),
  };
  
+#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
+/**
+ * lowest_flag_domain - Return lowest sched_domain containing flag.
+ * @cpu:       The cpu whose lowest level of sched domain is to
+ *             be returned.
+ * @flag:      The flag to check for the lowest sched_domain
+ *             for the given cpu.
+ *
+ * Returns the lowest sched_domain of a cpu which contains the given flag.
+ */
+static inline struct sched_domain *lowest_flag_domain(int cpu, int flag)
+{
+       struct sched_domain *sd;
+
+       for_each_domain(cpu, sd)
+               if (sd && (sd->flags & flag))
+                       break;
+
+       return sd;
+}
+
+/**
+ * for_each_flag_domain - Iterates over sched_domains containing the flag.
+ * @cpu:       The cpu whose domains we're iterating over.
+ * @sd:                variable holding the value of the power_savings_sd
+ *             for cpu.
+ * @flag:      The flag to filter the sched_domains to be iterated.
+ *
+ * Iterates over all the scheduler domains for a given cpu that has the 'flag'
+ * set, starting from the lowest sched_domain to the highest.
+ */
+#define for_each_flag_domain(cpu, sd, flag) \
+       for (sd = lowest_flag_domain(cpu, flag); \
+               (sd && (sd->flags & flag)); sd = sd->parent)
+
+/**
+ * is_semi_idle_group - Checks if the given sched_group is semi-idle.
+ * @ilb_group: group to be checked for semi-idleness
+ *
+ * Returns:    1 if the group is semi-idle. 0 otherwise.
+ *
+ * We define a sched_group to be semi idle if it has atleast one idle-CPU
+ * and atleast one non-idle CPU. This helper function checks if the given
+ * sched_group is semi-idle or not.
+ */
+static inline int is_semi_idle_group(struct sched_group *ilb_group)
+{
+       cpumask_and(nohz.ilb_grp_nohz_mask, nohz.cpu_mask,
+                                       sched_group_cpus(ilb_group));
+
+       /*
+        * A sched_group is semi-idle when it has atleast one busy cpu
+        * and atleast one idle cpu.
+        */
+       if (cpumask_empty(nohz.ilb_grp_nohz_mask))
+               return 0;
+
+       if (cpumask_equal(nohz.ilb_grp_nohz_mask, sched_group_cpus(ilb_group)))
+               return 0;
+
+       return 1;
+}
+/**
+ * find_new_ilb - Finds the optimum idle load balancer for nomination.
+ * @cpu:       The cpu which is nominating a new idle_load_balancer.
+ *
+ * Returns:    Returns the id of the idle load balancer if it exists,
+ *             Else, returns >= nr_cpu_ids.
+ *
+ * This algorithm picks the idle load balancer such that it belongs to a
+ * semi-idle powersavings sched_domain. The idea is to try and avoid
+ * completely idle packages/cores just for the purpose of idle load balancing
+ * when there are other idle cpu's which are better suited for that job.
+ */
+static int find_new_ilb(int cpu)
+{
+       struct sched_domain *sd;
+       struct sched_group *ilb_group;
+
+       /*
+        * Have idle load balancer selection from semi-idle packages only
+        * when power-aware load balancing is enabled
+        */
+       if (!(sched_smt_power_savings || sched_mc_power_savings))
+               goto out_done;
+
+       /*
+        * Optimize for the case when we have no idle CPUs or only one
+        * idle CPU. Don't walk the sched_domain hierarchy in such cases
+        */
+       if (cpumask_weight(nohz.cpu_mask) < 2)
+               goto out_done;
+
+       for_each_flag_domain(cpu, sd, SD_POWERSAVINGS_BALANCE) {
+               ilb_group = sd->groups;
+
+               do {
+                       if (is_semi_idle_group(ilb_group))
+                               return cpumask_first(nohz.ilb_grp_nohz_mask);
+
+                       ilb_group = ilb_group->next;
+
+               } while (ilb_group != sd->groups);
+       }
+
+out_done:
+       return cpumask_first(nohz.cpu_mask);
+}
+#else /*  (CONFIG_SCHED_MC || CONFIG_SCHED_SMT) */
+static inline int find_new_ilb(int call_cpu)
+{
+       return cpumask_first(nohz.cpu_mask);
+}
+#endif
+
  /*
   * This routine will try to nominate the ilb (idle load balancing)
   * owner among the cpus whose ticks are stopped. ilb owner will do the idle
@@ -4298,8 +4566,24 @@ int select_nohz_load_balancer(int stop_tick)
                         /* make me the ilb owner */
                         if (atomic_cmpxchg(&nohz.load_balancer, -1, cpu) == -1)
                                 return 1;
-               } else if (atomic_read(&nohz.load_balancer) == cpu)
+               } else if (atomic_read(&nohz.load_balancer) == cpu) {
+                       int new_ilb;
+
+                       if (!(sched_smt_power_savings ||
+                                               sched_mc_power_savings))
+                               return 1;
+                       /*
+                        * Check to see if there is a more power-efficient
+                        * ilb.
+                        */
+                       new_ilb = find_new_ilb(cpu);
+                       if (new_ilb < nr_cpu_ids && new_ilb != cpu) {
+                               atomic_set(&nohz.load_balancer, -1);
+                               resched_cpu(new_ilb);
+                               return 0;
+                       }
                         return 1;
+               }
         } else {
                 if (!cpumask_test_cpu(cpu, nohz.cpu_mask))
                         return 0;
@@ -4468,15 +4752,7 @@ static inline void trigger_load_balance(struct rq *rq, int cpu)
                 }
  
                 if (atomic_read(&nohz.load_balancer) == -1) {
-                       /*
-                        * simple selection for now: Nominate the
-                        * first cpu in the nohz list to be the next
-                        * ilb owner.
-                        *
-                        * TBD: Traverse the sched domains and nominate
-                        * the nearest cpu in the nohz.cpu_mask.
-                        */
-                       int ilb = cpumask_first(nohz.cpu_mask);
+                       int ilb = find_new_ilb(cpu);
  
                         if (ilb < nr_cpu_ids)
                                 resched_cpu(ilb);
@@ -4840,6 +5116,8 @@ void scheduler_tick(void)
         curr->sched_class->task_tick(rq, curr, 0);
         spin_unlock(&rq->lock);
  
+       perf_counter_task_tick(curr, cpu);
+
  #ifdef CONFIG_SMP
         rq->idle_at_tick = idle_cpu(cpu);
         trigger_load_balance(rq, cpu);
@@ -5007,13 +5285,15 @@ pick_next_task(struct rq *rq)
  /*
   * schedule() is the main scheduler function.
   */
-asmlinkage void __sched __schedule(void)
+asmlinkage void __sched schedule(void)
  {
         struct task_struct *prev, *next;
         unsigned long *switch_count;
         struct rq *rq;
         int cpu;
  
+need_resched:
+       preempt_disable();
         cpu = smp_processor_id();
         rq = cpu_rq(cpu);
         rcu_qsctr_inc(cpu);
@@ -5053,6 +5333,7 @@ need_resched_nonpreemptible:
  
         if (likely(prev != next)) {
                 sched_info_switch(prev, next);
+               perf_counter_task_sched_out(prev, next, cpu);
  
                 rq->nr_switches++;
                 rq->curr = next;
@@ -5070,15 +5351,9 @@ need_resched_nonpreemptible:
  
         if (unlikely(reacquire_kernel_lock(current) < 0))
                 goto need_resched_nonpreemptible;
-}
  
-asmlinkage void __sched schedule(void)
-{
-need_resched:
-       preempt_disable();
-       __schedule();
         preempt_enable_no_resched();
-       if (unlikely(test_thread_flag(TIF_NEED_RESCHED)))
+       if (need_resched())
                 goto need_resched;
  }
  EXPORT_SYMBOL(schedule);
@@ -5221,7 +5496,7 @@ EXPORT_SYMBOL(default_wake_function);
   * started to run but is not in state TASK_RUNNING. try_to_wake_up() returns
   * zero in this (rare) case, and we handle it by continuing to scan the queue.
   */
-void __wake_up_common(wait_queue_head_t *q, unsigned int mode,
+static void __wake_up_common(wait_queue_head_t *q, unsigned int mode,
                         int nr_exclusive, int sync, void *key)
  {
         wait_queue_t *curr, *next;
@@ -5241,6 +5516,9 @@ void __wake_up_common(wait_queue_head_t *q, unsigned int mode,
   * @mode: which threads
   * @nr_exclusive: how many wake-one or wake-many threads to wake up
   * @key: is directly passed to the wakeup function
+ *
+ * It may be assumed that this function implies a write memory barrier before
+ * changing the task state if and only if any tasks are woken up.
   */
  void __wake_up(wait_queue_head_t *q, unsigned int mode,
                         int nr_exclusive, void *key)
@@ -5279,6 +5557,9 @@ void __wake_up_locked_key(wait_queue_head_t *q, unsigned int mode, void *key)
   * with each other. This can prevent needless bouncing between CPUs.
   *
   * On UP it can prevent extra preemption.
+ *
+ * It may be assumed that this function implies a write memory barrier before
+ * changing the task state if and only if any tasks are woken up.
   */
  void __wake_up_sync_key(wait_queue_head_t *q, unsigned int mode,
                         int nr_exclusive, void *key)
@@ -5315,6 +5596,9 @@ EXPORT_SYMBOL_GPL(__wake_up_sync);        /* For internal use only */
   * awakened in the same order in which they were queued.
   *
   * See also complete_all(), wait_for_completion() and related routines.
+ *
+ * It may be assumed that this function implies a write memory barrier before
+ * changing the task state if and only if any tasks are woken up.
   */
  void complete(struct completion *x)
  {
@@ -5332,6 +5616,9 @@ EXPORT_SYMBOL(complete);
   * @x:  holds the state of this particular completion
   *
   * This will wake up all threads waiting on this particular completion event.
+ *
+ * It may be assumed that this function implies a write memory barrier before
+ * changing the task state if and only if any tasks are woken up.
   */
  void complete_all(struct completion *x)
  {
@@ -6490,8 +6777,9 @@ void sched_show_task(struct task_struct *p)
  #ifdef CONFIG_DEBUG_STACK_USAGE
         free = stack_not_used(p);
  #endif
-       printk(KERN_CONT "%5lu %5d %6d\n", free,
-               task_pid_nr(p), task_pid_nr(p->real_parent));
+       printk(KERN_CONT "%5lu %5d %6d 0x%08lx\n", free,
+               task_pid_nr(p), task_pid_nr(p->real_parent),
+               (unsigned long)task_thread_info(p)->flags);
  
         show_stack(p, NULL);
  }
@@ -6970,6 +7258,14 @@ static void migrate_dead_tasks(unsigned int dead_cpu)
  
         }
  }
+
+/*
+ * remove the tasks which were accounted by rq from calc_load_tasks.
+ */
+static void calc_global_load_remove(struct rq *rq)
+{
+       atomic_long_sub(rq->calc_load_active, &calc_load_tasks);
+}
  #endif /* CONFIG_HOTPLUG_CPU */
  
  #if defined(CONFIG_SCHED_DEBUG) && defined(CONFIG_SYSCTL)
@@ -7204,6 +7500,8 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
                 /* Update our root-domain */
                 rq = cpu_rq(cpu);
                 spin_lock_irqsave(&rq->lock, flags);
+               rq->calc_load_update = calc_load_update;
+               rq->calc_load_active = 0;
                 if (rq->rd) {
                         BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span));
  
@@ -7243,7 +7541,7 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
                 cpuset_unlock();
                 migrate_nr_uninterruptible(rq);
                 BUG_ON(rq->nr_running != 0);
-
+               calc_global_load_remove(rq);
                 /*
                  * No need to migrate the tasks: it was best-effort if
                  * they didn't take sched_hotcpu_mutex. Just wake up
@@ -7279,8 +7577,10 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
         return NOTIFY_OK;
  }
  
-/* Register at highest priority so that task migration (migrate_all_tasks)
- * happens before everything else.
+/*
+ * Register at high priority so that task migration (migrate_all_tasks)
+ * happens before everything else.  This has to be lower priority than
+ * the notifier in the perf_counter subsystem, though.
   */
  static struct notifier_block __cpuinitdata migration_notifier = {
         .notifier_call = migration_call,
@@ -7753,8 +8053,9 @@ int sched_smt_power_savings = 0, sched_mc_power_savings = 0;
  
  /*
   * The cpus mask in sched_group and sched_domain hangs off the end.
- * FIXME: use cpumask_var_t or dynamic percpu alloc to avoid wasting space
- * for nr_cpu_ids < CONFIG_NR_CPUS.
+ *
+ * ( See the the comments in include/linux/sched.h:struct sched_group
+ *   and struct sched_domain. )
   */
  struct static_sched_group {
         struct sched_group sg;
@@ -7875,7 +8176,7 @@ static void init_numa_sched_groups_power(struct sched_group *group_head)
                         struct sched_domain *sd;
  
                         sd = &per_cpu(phys_domains, j).sd;
-                       if (j != cpumask_first(sched_group_cpus(sd->groups))) {
+                       if (j != group_first_cpu(sd->groups)) {
                                 /*
                                  * Only add "power" once for each
                                  * physical package.
@@ -7953,7 +8254,7 @@ static void init_sched_groups_power(int cpu, struct sched_domain *sd)
  
         WARN_ON(!sd || !sd->groups);
  
-       if (cpu != cpumask_first(sched_group_cpus(sd->groups)))
+       if (cpu != group_first_cpu(sd->groups))
                 return;
  
         child = sd->child;
@@ -8938,6 +9239,8 @@ void __init sched_init(void)
                 rq = cpu_rq(i);
                 spin_lock_init(&rq->lock);
                 rq->nr_running = 0;
+               rq->calc_load_active = 0;
+               rq->calc_load_update = jiffies + LOAD_FREQ;
                 init_cfs_rq(&rq->cfs, rq);
                 init_rt_rq(&rq->rt, rq);
  #ifdef CONFIG_FAIR_GROUP_SCHED
@@ -8958,7 +9261,7 @@ void __init sched_init(void)
                  * 1024) and two child groups A0 and A1 (of weight 1024 each),
                  * then A0's share of the cpu resource is:
                  *
-                *      A0's bandwidth = 1024 / (10*1024 + 1024 + 1024) = 8.33%
+                *      A0's bandwidth = 1024 / (10*1024 + 1024 + 1024) = 8.33%
                  *
                  * We achieve this by letting init_task_group's tasks sit
                  * directly in rq->cfs (i.e init_task_group->se[] = NULL).
@@ -9045,6 +9348,9 @@ void __init sched_init(void)
          * when this runqueue becomes "idle".
          */
         init_idle(current, smp_processor_id());
+
+       calc_load_update = jiffies + LOAD_FREQ;
+
         /*
          * During early bootup we pretend to be a normal task:
          */
@@ -9055,10 +9361,13 @@ void __init sched_init(void)
  #ifdef CONFIG_SMP
  #ifdef CONFIG_NO_HZ
         alloc_bootmem_cpumask_var(&nohz.cpu_mask);
+       alloc_bootmem_cpumask_var(&nohz.ilb_grp_nohz_mask);
  #endif
         alloc_bootmem_cpumask_var(&cpu_isolated_map);
  #endif /* SMP */
  
+       perf_counter_init();
+
         scheduler_running = 1;
  }
  
@@ -9800,6 +10109,13 @@ static int sched_rt_global_constraints(void)
         if (sysctl_sched_rt_period <= 0)
                 return -EINVAL;
  
+       /*
+        * There's always some RT tasks in the root group
+        * -- migration, kstopmachine etc..
+        */
+       if (sysctl_sched_rt_runtime == 0)
+               return -EBUSY;
+
         spin_lock_irqsave(&def_rt_bandwidth.rt_runtime_lock, flags);
         for_each_possible_cpu(i) {
                 struct rt_rq *rt_rq = &cpu_rq(i)->rt;
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c

index 3816f217f119ef635cf169f268b5140c6e0f66c6..5f9650e8fe7587938de1bfa71e48cd4a04061d36 100644 (file)
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -1487,17 +1487,10 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int sync)
  
         find_matching_se(&se, &pse);
  
-       while (se) {
-               BUG_ON(!pse);
+       BUG_ON(!pse);
  
-               if (wakeup_preempt_entity(se, pse) == 1) {
-                       resched_task(curr);
-                       break;
-               }
-
-               se = parent_entity(se);
-               pse = parent_entity(pse);
-       }
+       if (wakeup_preempt_entity(se, pse) == 1)
+               resched_task(curr);
  }
  
  static struct task_struct *pick_next_task_fair(struct rq *rq)
diff --git a/kernel/sched_idletask.c b/kernel/sched_idletask.c

index 8a21a2e28c13ac2e4620e980888702e22a4f2387..499672c10cbd615141a362cafe804711bbcf1ff3 100644 (file)
--- a/kernel/sched_idletask.c
+++ b/kernel/sched_idletask.c
@@ -22,7 +22,8 @@ static void check_preempt_curr_idle(struct rq *rq, struct task_struct *p, int sy
  static struct task_struct *pick_next_task_idle(struct rq *rq)
  {
         schedstat_inc(rq, sched_goidle);
-
+       /* adjust the active tasks as we might go into a long sleep */
+       calc_load_account_active(rq);
         return rq->idle;
  }
  
diff --git a/kernel/signal.c b/kernel/signal.c

index d8034737db4cb86f8adca044560a3765ac65dba5..dba6ae99978a0f1b2a2058b1d31c09b10c1b0ab9 100644 (file)
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -27,7 +27,7 @@
  #include <linux/freezer.h>
  #include <linux/pid_namespace.h>
  #include <linux/nsproxy.h>
-#include <trace/sched.h>
+#include <trace/events/sched.h>
  
  #include <asm/param.h>
  #include <asm/uaccess.h>
@@ -41,8 +41,6 @@
  
  static struct kmem_cache *sigqueue_cachep;
  
-DEFINE_TRACE(sched_signal_send);
-
  static void __user *sig_handler(struct task_struct *t, int sig)
  {
         return t->sighand->action[sig - 1].sa.sa_handler;
@@ -2278,24 +2276,17 @@ SYSCALL_DEFINE2(kill, pid_t, pid, int, sig)
         return kill_something_info(sig, &info, pid);
  }
  
-static int do_tkill(pid_t tgid, pid_t pid, int sig)
+static int
+do_send_specific(pid_t tgid, pid_t pid, int sig, struct siginfo *info)
  {
-       int error;
-       struct siginfo info;
         struct task_struct *p;
         unsigned long flags;
-
-       error = -ESRCH;
-       info.si_signo = sig;
-       info.si_errno = 0;
-       info.si_code = SI_TKILL;
-       info.si_pid = task_tgid_vnr(current);
-       info.si_uid = current_uid();
+       int error = -ESRCH;
  
         rcu_read_lock();
         p = find_task_by_vpid(pid);
         if (p && (tgid <= 0 || task_tgid_vnr(p) == tgid)) {
-               error = check_kill_permission(sig, &info, p);
+               error = check_kill_permission(sig, info, p);
                 /*
                  * The null signal is a permissions and process existence
                  * probe.  No signal is actually delivered.
@@ -2305,7 +2296,7 @@ static int do_tkill(pid_t tgid, pid_t pid, int sig)
                  * signal is private anyway.
                  */
                 if (!error && sig && lock_task_sighand(p, &flags)) {
-                       error = specific_send_sig_info(sig, &info, p);
+                       error = specific_send_sig_info(sig, info, p);
                         unlock_task_sighand(p, &flags);
                 }
         }
@@ -2314,6 +2305,19 @@ static int do_tkill(pid_t tgid, pid_t pid, int sig)
         return error;
  }
  
+static int do_tkill(pid_t tgid, pid_t pid, int sig)
+{
+       struct siginfo info;
+
+       info.si_signo = sig;
+       info.si_errno = 0;
+       info.si_code = SI_TKILL;
+       info.si_pid = task_tgid_vnr(current);
+       info.si_uid = current_uid();
+
+       return do_send_specific(tgid, pid, sig, &info);
+}
+
  /**
   *  sys_tgkill - send signal to one specific thread
   *  @tgid: the thread group ID of the thread
@@ -2363,6 +2367,32 @@ SYSCALL_DEFINE3(rt_sigqueueinfo, pid_t, pid, int, sig,
         return kill_proc_info(sig, &info, pid);
  }
  
+long do_rt_tgsigqueueinfo(pid_t tgid, pid_t pid, int sig, siginfo_t *info)
+{
+       /* This is only valid for single tasks */
+       if (pid <= 0 || tgid <= 0)
+               return -EINVAL;
+
+       /* Not even root can pretend to send signals from the kernel.
+          Nor can they impersonate a kill(), which adds source info.  */
+       if (info->si_code >= 0)
+               return -EPERM;
+       info->si_signo = sig;
+
+       return do_send_specific(tgid, pid, sig, info);
+}
+
+SYSCALL_DEFINE4(rt_tgsigqueueinfo, pid_t, tgid, pid_t, pid, int, sig,
+               siginfo_t __user *, uinfo)
+{
+       siginfo_t info;
+
+       if (copy_from_user(&info, uinfo, sizeof(siginfo_t)))
+               return -EFAULT;
+
+       return do_rt_tgsigqueueinfo(tgid, pid, sig, &info);
+}
+
  int do_sigaction(int sig, struct k_sigaction *act, struct k_sigaction *oact)
  {
         struct task_struct *t = current;
diff --git a/kernel/softirq.c b/kernel/softirq.c

index b525dd348511b0bda3b08c19ffef6df0cf053206..258885a543db60692a695aa682abb6c95f3ffcdf 100644 (file)
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -24,7 +24,9 @@
  #include <linux/ftrace.h>
  #include <linux/smp.h>
  #include <linux/tick.h>
-#include <trace/irq.h>
+
+#define CREATE_TRACE_POINTS
+#include <trace/events/irq.h>
  
  #include <asm/irq.h>
  /*
@@ -186,9 +188,6 @@ EXPORT_SYMBOL(local_bh_enable_ip);
   */
  #define MAX_SOFTIRQ_RESTART 10
  
-DEFINE_TRACE(softirq_entry);
-DEFINE_TRACE(softirq_exit);
-
  asmlinkage void __do_softirq(void)
  {
         struct softirq_action *h;
@@ -828,7 +827,7 @@ int __init __weak arch_early_irq_init(void)
         return 0;
  }
  
-int __weak arch_init_chip_data(struct irq_desc *desc, int cpu)
+int __weak arch_init_chip_data(struct irq_desc *desc, int node)
  {
         return 0;
  }
diff --git a/kernel/sys.c b/kernel/sys.c

index e7998cf314986fd37adbf6ccf1aa65ea5cb3982d..438d99a38c87f343dd318c3966799ec73bb6072a 100644 (file)
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -14,6 +14,7 @@
  #include <linux/prctl.h>
  #include <linux/highuid.h>
  #include <linux/fs.h>
+#include <linux/perf_counter.h>
  #include <linux/resource.h>
  #include <linux/kernel.h>
  #include <linux/kexec.h>
@@ -1793,6 +1794,12 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
                 case PR_SET_TSC:
                         error = SET_TSC_CTL(arg2);
                         break;
+               case PR_TASK_PERF_COUNTERS_DISABLE:
+                       error = perf_counter_task_disable();
+                       break;
+               case PR_TASK_PERF_COUNTERS_ENABLE:
+                       error = perf_counter_task_enable();
+                       break;
                 case PR_GET_TIMERSLACK:
                         error = current->timer_slack_ns;
                         break;
diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c

index 27dad29673879ae78b7124280346d418e520572b..68320f6b07b51341218296ac170c6a9536a3a420 100644 (file)
--- a/kernel/sys_ni.c
+++ b/kernel/sys_ni.c
@@ -175,3 +175,6 @@ cond_syscall(compat_sys_timerfd_settime);
  cond_syscall(compat_sys_timerfd_gettime);
  cond_syscall(sys_eventfd);
  cond_syscall(sys_eventfd2);
+
+/* performance counters: */
+cond_syscall(sys_perf_counter_open);
diff --git a/kernel/sysctl.c b/kernel/sysctl.c

index b2970d56fb7678d6493e44c766db536f691656b7..a7e4eb0525b19e768cf6d729f0db3b07d5d35dd5 100644 (file)
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -49,6 +49,7 @@
  #include <linux/reboot.h>
  #include <linux/ftrace.h>
  #include <linux/slow-work.h>
+#include <linux/perf_counter.h>
  
  #include <asm/uaccess.h>
  #include <asm/processor.h>
@@ -729,6 +730,14 @@ static struct ctl_table kern_table[] = {
                 .mode           = 0444,
                 .proc_handler   = &proc_dointvec,
         },
+       {
+               .ctl_name       = CTL_UNNUMBERED,
+               .procname       = "bootloader_version",
+               .data           = &bootloader_version,
+               .maxlen         = sizeof (int),
+               .mode           = 0444,
+               .proc_handler   = &proc_dointvec,
+       },
         {
                 .ctl_name       = CTL_UNNUMBERED,
                 .procname       = "kstack_depth_to_print",
@@ -912,6 +921,32 @@ static struct ctl_table kern_table[] = {
                 .child          = slow_work_sysctls,
         },
  #endif
+#ifdef CONFIG_PERF_COUNTERS
+       {
+               .ctl_name       = CTL_UNNUMBERED,
+               .procname       = "perf_counter_paranoid",
+               .data           = &sysctl_perf_counter_paranoid,
+               .maxlen         = sizeof(sysctl_perf_counter_paranoid),
+               .mode           = 0644,
+               .proc_handler   = &proc_dointvec,
+       },
+       {
+               .ctl_name       = CTL_UNNUMBERED,
+               .procname       = "perf_counter_mlock_kb",
+               .data           = &sysctl_perf_counter_mlock,
+               .maxlen         = sizeof(sysctl_perf_counter_mlock),
+               .mode           = 0644,
+               .proc_handler   = &proc_dointvec,
+       },
+       {
+               .ctl_name       = CTL_UNNUMBERED,
+               .procname       = "perf_counter_max_sample_rate",
+               .data           = &sysctl_perf_counter_sample_rate,
+               .maxlen         = sizeof(sysctl_perf_counter_sample_rate),
+               .mode           = 0644,
+               .proc_handler   = &proc_dointvec,
+       },
+#endif
  /*
   * NOTE: do not add new entries to this table unless you have read
   * Documentation/sysctl/ctl_unnumbered.txt
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c

index 687dff49f6e7da5ec92199f6bbae08de57e5ba43..52a8bf8931f3903128af16a13d32c5d4b8b376aa 100644 (file)
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -22,7 +22,7 @@
  
  /*
   * This read-write spinlock protects us from races in SMP while
- * playing with xtime and avenrun.
+ * playing with xtime.
   */
  __cacheline_aligned_in_smp DEFINE_SEQLOCK(xtime_lock);
  
diff --git a/kernel/timer.c b/kernel/timer.c

index cffffad01c31cf1670490f2860f0e5de0f25d301..c01e568935ea658a5b243e80bd6791fef99a5994 100644 (file)
--- a/kernel/timer.c
+++ b/kernel/timer.c
@@ -37,6 +37,7 @@
  #include <linux/delay.h>
  #include <linux/tick.h>
  #include <linux/kallsyms.h>
+#include <linux/perf_counter.h>
  
  #include <asm/uaccess.h>
  #include <asm/unistd.h>
@@ -1122,47 +1123,6 @@ void update_process_times(int user_tick)
         run_posix_cpu_timers(p);
  }
  
-/*
- * Nr of active tasks - counted in fixed-point numbers
- */
-static unsigned long count_active_tasks(void)
-{
-       return nr_active() * FIXED_1;
-}
-
-/*
- * Hmm.. Changed this, as the GNU make sources (load.c) seems to
- * imply that avenrun[] is the standard name for this kind of thing.
- * Nothing else seems to be standardized: the fractional size etc
- * all seem to differ on different machines.
- *
- * Requires xtime_lock to access.
- */
-unsigned long avenrun[3];
-
-EXPORT_SYMBOL(avenrun);
-
-/*
- * calc_load - given tick count, update the avenrun load estimates.
- * This is called while holding a write_lock on xtime_lock.
- */
-static inline void calc_load(unsigned long ticks)
-{
-       unsigned long active_tasks; /* fixed-point */
-       static int count = LOAD_FREQ;
-
-       count -= ticks;
-       if (unlikely(count < 0)) {
-               active_tasks = count_active_tasks();
-               do {
-                       CALC_LOAD(avenrun[0], EXP_1, active_tasks);
-                       CALC_LOAD(avenrun[1], EXP_5, active_tasks);
-                       CALC_LOAD(avenrun[2], EXP_15, active_tasks);
-                       count += LOAD_FREQ;
-               } while (count < 0);
-       }
-}
-
  /*
   * This function runs timers and the timer-tq in bottom half context.
   */
@@ -1170,6 +1130,8 @@ static void run_timer_softirq(struct softirq_action *h)
  {
         struct tvec_base *base = __get_cpu_var(tvec_bases);
  
+       perf_counter_do_pending();
+
         hrtimer_run_pending();
  
         if (time_after_eq(jiffies, base->timer_jiffies))
@@ -1186,16 +1148,6 @@ void run_local_timers(void)
         softlockup_tick();
  }
  
-/*
- * Called by the timer interrupt. xtime_lock must already be taken
- * by the timer IRQ!
- */
-static inline void update_times(unsigned long ticks)
-{
-       update_wall_time();
-       calc_load(ticks);
-}
-
  /*
   * The 64-bit jiffies value is not atomic - you MUST NOT read it
   * without sampling the sequence number in xtime_lock.
@@ -1205,7 +1157,8 @@ static inline void update_times(unsigned long ticks)
  void do_timer(unsigned long ticks)
  {
         jiffies_64 += ticks;
-       update_times(ticks);
+       update_wall_time();
+       calc_global_load();
  }
  
  #ifdef __ARCH_WANT_SYS_ALARM
@@ -1406,37 +1359,17 @@ int do_sysinfo(struct sysinfo *info)
  {
         unsigned long mem_total, sav_total;
         unsigned int mem_unit, bitcount;
-       unsigned long seq;
+       struct timespec tp;
  
         memset(info, 0, sizeof(struct sysinfo));
  
-       do {
-               struct timespec tp;
-               seq = read_seqbegin(&xtime_lock);
-
-               /*
-                * This is annoying.  The below is the same thing
-                * posix_get_clock_monotonic() does, but it wants to
-                * take the lock which we want to cover the loads stuff
-                * too.
-                */
-
-               getnstimeofday(&tp);
-               tp.tv_sec += wall_to_monotonic.tv_sec;
-               tp.tv_nsec += wall_to_monotonic.tv_nsec;
-               monotonic_to_bootbased(&tp);
-               if (tp.tv_nsec - NSEC_PER_SEC >= 0) {
-                       tp.tv_nsec = tp.tv_nsec - NSEC_PER_SEC;
-                       tp.tv_sec++;
-               }
-               info->uptime = tp.tv_sec + (tp.tv_nsec ? 1 : 0);
+       ktime_get_ts(&tp);
+       monotonic_to_bootbased(&tp);
+       info->uptime = tp.tv_sec + (tp.tv_nsec ? 1 : 0);
  
-               info->loads[0] = avenrun[0] << (SI_LOAD_SHIFT - FSHIFT);
-               info->loads[1] = avenrun[1] << (SI_LOAD_SHIFT - FSHIFT);
-               info->loads[2] = avenrun[2] << (SI_LOAD_SHIFT - FSHIFT);
+       get_avenrun(info->loads, 0, SI_LOAD_SHIFT - FSHIFT);
  
-               info->procs = nr_threads;
-       } while (read_seqretry(&xtime_lock, seq));
+       info->procs = nr_threads;
  
         si_meminfo(info);
         si_swapinfo(info);
diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig

index 417d1985e29911784adbf506523462f009933c02..4a13e5a01ce318c62c75c991a8694096e4821f40 100644 (file)
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -48,6 +48,21 @@ config FTRACE_NMI_ENTER
         depends on HAVE_FTRACE_NMI_ENTER
         default y
  
+config EVENT_TRACING
+       select CONTEXT_SWITCH_TRACER
+       bool
+
+config CONTEXT_SWITCH_TRACER
+       select MARKERS
+       bool
+
+# All tracer options should select GENERIC_TRACER. For those options that are
+# enabled by all tracers (context switch and event tracer) they select TRACING.
+# This allows those options to appear when no other tracer is selected. But the
+# options do not appear when something else selects it. We need the two options
+# GENERIC_TRACER and TRACING to avoid circular dependencies to accomplish the
+# hidding of the automatic options options.
+
  config TRACING
         bool
         select DEBUG_FS
@@ -56,6 +71,11 @@ config TRACING
         select TRACEPOINTS
         select NOP_TRACER
         select BINARY_PRINTF
+       select EVENT_TRACING
+
+config GENERIC_TRACER
+       bool
+       select TRACING
  
  #
  # Minimum requirements an architecture has to meet for us to
@@ -73,14 +93,20 @@ config TRACING_SUPPORT
  
  if TRACING_SUPPORT
  
-menu "Tracers"
+menuconfig FTRACE
+       bool "Tracers"
+       default y if DEBUG_KERNEL
+       help
+        Enable the kernel tracing infrastructure.
+
+if FTRACE
  
  config FUNCTION_TRACER
         bool "Kernel Function Tracer"
         depends on HAVE_FUNCTION_TRACER
         select FRAME_POINTER
         select KALLSYMS
-       select TRACING
+       select GENERIC_TRACER
         select CONTEXT_SWITCH_TRACER
         help
           Enable the kernel to trace every kernel function. This is done
@@ -104,13 +130,14 @@ config FUNCTION_GRAPH_TRACER
           the return value. This is done by setting the current return 
           address on the current task structure into a stack of calls.
  
+
  config IRQSOFF_TRACER
         bool "Interrupts-off Latency Tracer"
         default n
         depends on TRACE_IRQFLAGS_SUPPORT
         depends on GENERIC_TIME
         select TRACE_IRQFLAGS
-       select TRACING
+       select GENERIC_TRACER
         select TRACER_MAX_TRACE
         help
           This option measures the time spent in irqs-off critical
@@ -131,7 +158,7 @@ config PREEMPT_TRACER
         default n
         depends on GENERIC_TIME
         depends on PREEMPT
-       select TRACING
+       select GENERIC_TRACER
         select TRACER_MAX_TRACE
         help
           This option measures the time spent in preemption off critical
@@ -150,7 +177,7 @@ config PREEMPT_TRACER
  config SYSPROF_TRACER
         bool "Sysprof Tracer"
         depends on X86
-       select TRACING
+       select GENERIC_TRACER
         select CONTEXT_SWITCH_TRACER
         help
           This tracer provides the trace needed by the 'Sysprof' userspace
@@ -158,40 +185,33 @@ config SYSPROF_TRACER
  
  config SCHED_TRACER
         bool "Scheduling Latency Tracer"
-       select TRACING
+       select GENERIC_TRACER
         select CONTEXT_SWITCH_TRACER
         select TRACER_MAX_TRACE
         help
           This tracer tracks the latency of the highest priority task
           to be scheduled in, starting from the point it has woken up.
  
-config CONTEXT_SWITCH_TRACER
-       bool "Trace process context switches"
-       select TRACING
-       select MARKERS
-       help
-         This tracer gets called from the context switch and records
-         all switching of tasks.
-
-config EVENT_TRACER
-       bool "Trace various events in the kernel"
+config ENABLE_DEFAULT_TRACERS
+       bool "Trace process context switches and events"
+       depends on !GENERIC_TRACER
         select TRACING
         help
           This tracer hooks to various trace points in the kernel
           allowing the user to pick and choose which trace point they
-         want to trace.
+         want to trace. It also includes the sched_switch tracer plugin.
  
  config FTRACE_SYSCALLS
         bool "Trace syscalls"
         depends on HAVE_FTRACE_SYSCALLS
-       select TRACING
+       select GENERIC_TRACER
         select KALLSYMS
         help
           Basic tracer to catch the syscall entry and exit events.
  
  config BOOT_TRACER
         bool "Trace boot initcalls"
-       select TRACING
+       select GENERIC_TRACER
         select CONTEXT_SWITCH_TRACER
         help
           This tracer helps developers to optimize boot times: it records
@@ -207,8 +227,36 @@ config BOOT_TRACER
           to enable this on bootup.
  
  config TRACE_BRANCH_PROFILING
+       bool
+       select GENERIC_TRACER
+
+choice
+       prompt "Branch Profiling"
+       default BRANCH_PROFILE_NONE
+       help
+        The branch profiling is a software profiler. It will add hooks
+        into the C conditionals to test which path a branch takes.
+
+        The likely/unlikely profiler only looks at the conditions that
+        are annotated with a likely or unlikely macro.
+
+        The "all branch" profiler will profile every if statement in the
+        kernel. This profiler will also enable the likely/unlikely
+        profiler as well.
+
+        Either of the above profilers add a bit of overhead to the system.
+        If unsure choose "No branch profiling".
+
+config BRANCH_PROFILE_NONE
+       bool "No branch profiling"
+       help
+        No branch profiling. Branch profiling adds a bit of overhead.
+        Only enable it if you want to analyse the branching behavior.
+        Otherwise keep it disabled.
+
+config PROFILE_ANNOTATED_BRANCHES
         bool "Trace likely/unlikely profiler"
-       select TRACING
+       select TRACE_BRANCH_PROFILING
         help
           This tracer profiles all the the likely and unlikely macros
           in the kernel. It will display the results in:
@@ -218,11 +266,9 @@ config TRACE_BRANCH_PROFILING
           Note: this will add a significant overhead, only turn this
           on if you need to profile the system's use of these macros.
  
-         Say N if unsure.
-
  config PROFILE_ALL_BRANCHES
         bool "Profile all if conditionals"
-       depends on TRACE_BRANCH_PROFILING
+       select TRACE_BRANCH_PROFILING
         help
           This tracer profiles all branch conditions. Every if ()
           taken in the kernel is recorded whether it hit or miss.
@@ -230,11 +276,12 @@ config PROFILE_ALL_BRANCHES
  
           /debugfs/tracing/profile_branch
  
+         This option also enables the likely/unlikely profiler.
+
           This configuration, when enabled, will impose a great overhead
           on the system. This should only be enabled when the system
           is to be analyzed
-
-         Say N if unsure.
+endchoice
  
  config TRACING_BRANCHES
         bool
@@ -261,7 +308,7 @@ config BRANCH_TRACER
  config POWER_TRACER
         bool "Trace power consumption behavior"
         depends on X86
-       select TRACING
+       select GENERIC_TRACER
         help
           This tracer helps developers to analyze and optimize the kernels
           power management decisions, specifically the C-state and P-state
@@ -295,14 +342,14 @@ config STACK_TRACER
  config HW_BRANCH_TRACER
         depends on HAVE_HW_BRANCH_TRACER
         bool "Trace hw branches"
-       select TRACING
+       select GENERIC_TRACER
         help
           This tracer records all branches on the system in a circular
           buffer giving access to the last N branches for each cpu.
  
  config KMEMTRACE
         bool "Trace SLAB allocations"
-       select TRACING
+       select GENERIC_TRACER
         help
           kmemtrace provides tracing for slab allocator functions, such as
           kmalloc, kfree, kmem_cache_alloc, kmem_cache_free etc.. Collected
@@ -322,7 +369,7 @@ config KMEMTRACE
  
  config WORKQUEUE_TRACER
         bool "Trace workqueues"
-       select TRACING
+       select GENERIC_TRACER
         help
           The workqueue tracer provides some statistical informations
            about each cpu workqueue thread such as the number of the
@@ -338,7 +385,7 @@ config BLK_DEV_IO_TRACE
         select RELAY
         select DEBUG_FS
         select TRACEPOINTS
-       select TRACING
+       select GENERIC_TRACER
         select STACKTRACE
         help
           Say Y here if you want to be able to trace the block layer actions
@@ -375,6 +422,20 @@ config DYNAMIC_FTRACE
          were made. If so, it runs stop_machine (stops all CPUS)
          and modifies the code to jump over the call to ftrace.
  
+config FUNCTION_PROFILER
+       bool "Kernel function profiler"
+       depends on FUNCTION_TRACER
+       default n
+       help
+        This option enables the kernel function profiler. A file is created
+        in debugfs called function_profile_enabled which defaults to zero.
+        When a 1 is echoed into this file profiling begins, and when a
+        zero is entered, profiling stops. A file in the trace_stats
+        directory called functions, that show the list of functions that
+        have been hit and their counters.
+
+        If in doubt, say N
+
  config FTRACE_MCOUNT_RECORD
         def_bool y
         depends on DYNAMIC_FTRACE
@@ -385,7 +446,7 @@ config FTRACE_SELFTEST
  
  config FTRACE_STARTUP_TEST
         bool "Perform a startup test on ftrace"
-       depends on TRACING
+       depends on GENERIC_TRACER
         select FTRACE_SELFTEST
         help
           This option performs a series of startup tests on ftrace. On bootup
@@ -396,7 +457,7 @@ config FTRACE_STARTUP_TEST
  config MMIOTRACE
         bool "Memory mapped IO tracing"
         depends on HAVE_MMIOTRACE_SUPPORT && PCI
-       select TRACING
+       select GENERIC_TRACER
         help
           Mmiotrace traces Memory Mapped I/O access and is meant for
           debugging and reverse engineering. It is called from the ioremap
@@ -416,7 +477,23 @@ config MMIOTRACE_TEST
  
           Say N, unless you absolutely know what you are doing.
  
-endmenu
+config RING_BUFFER_BENCHMARK
+       tristate "Ring buffer benchmark stress tester"
+       depends on RING_BUFFER
+       help
+         This option creates a test to stress the ring buffer and bench mark it.
+         It creates its own ring buffer such that it will not interfer with
+         any other users of the ring buffer (such as ftrace). It then creates
+         a producer and consumer that will run for 10 seconds and sleep for
+         10 seconds. Each interval it will print out the number of events
+         it recorded and give a rough estimate of how long each iteration took.
+
+         It does not disable interrupts or raise its priority, so it may be
+         affected by processes that are running.
+
+         If unsure, say N
+
+endif # FTRACE
  
  endif # TRACING_SUPPORT
  
diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile

index 2630f5121ec12467a4b05958670aac40f357dcb8..844164dca90ae1ec9039ccfd66c5a89aaa8d1a3d 100644 (file)
--- a/kernel/trace/Makefile
+++ b/kernel/trace/Makefile
@@ -15,11 +15,17 @@ ifdef CONFIG_TRACING_BRANCHES
  KBUILD_CFLAGS += -DDISABLE_BRANCH_PROFILING
  endif
  
+#
+# Make the trace clocks available generally: it's infrastructure
+# relied on by ptrace for example:
+#
+obj-y += trace_clock.o
+
  obj-$(CONFIG_FUNCTION_TRACER) += libftrace.o
  obj-$(CONFIG_RING_BUFFER) += ring_buffer.o
+obj-$(CONFIG_RING_BUFFER_BENCHMARK) += ring_buffer_benchmark.o
  
  obj-$(CONFIG_TRACING) += trace.o
-obj-$(CONFIG_TRACING) += trace_clock.o
  obj-$(CONFIG_TRACING) += trace_output.o
  obj-$(CONFIG_TRACING) += trace_stat.o
  obj-$(CONFIG_TRACING) += trace_printk.o
@@ -39,12 +45,14 @@ obj-$(CONFIG_HW_BRANCH_TRACER) += trace_hw_branches.o
  obj-$(CONFIG_POWER_TRACER) += trace_power.o
  obj-$(CONFIG_KMEMTRACE) += kmemtrace.o
  obj-$(CONFIG_WORKQUEUE_TRACER) += trace_workqueue.o
-obj-$(CONFIG_BLK_DEV_IO_TRACE) += blktrace.o
-obj-$(CONFIG_EVENT_TRACER) += trace_events.o
-obj-$(CONFIG_EVENT_TRACER) += events.o
-obj-$(CONFIG_EVENT_TRACER) += trace_export.o
+obj-$(CONFIG_BLK_DEV_IO_TRACE) += blktrace.o
+ifeq ($(CONFIG_BLOCK),y)
+obj-$(CONFIG_EVENT_TRACING) += blktrace.o
+endif
+obj-$(CONFIG_EVENT_TRACING) += trace_events.o
+obj-$(CONFIG_EVENT_TRACING) += trace_export.o
  obj-$(CONFIG_FTRACE_SYSCALLS) += trace_syscalls.o
  obj-$(CONFIG_EVENT_PROFILE) += trace_event_profile.o
-obj-$(CONFIG_EVENT_TRACER) += trace_events_filter.o
+obj-$(CONFIG_EVENT_TRACING) += trace_events_filter.o
  
  libftrace-y := ftrace.o
diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c

index 921ef5d1f0ba95e7497faa55afb293c50ae7ee47..7bd6a9893c247e875098a57f640c664fba7783c8 100644 (file)
--- a/kernel/trace/blktrace.c
+++ b/kernel/trace/blktrace.c
@@ -23,10 +23,14 @@
  #include <linux/mutex.h>
  #include <linux/debugfs.h>
  #include <linux/time.h>
-#include <trace/block.h>
  #include <linux/uaccess.h>
+
+#include <trace/events/block.h>
+
  #include "trace_output.h"
  
+#ifdef CONFIG_BLK_DEV_IO_TRACE
+
  static unsigned int blktrace_seq __read_mostly = 1;
  
  static struct trace_array *blk_tr;
@@ -147,7 +151,7 @@ static int act_log_check(struct blk_trace *bt, u32 what, sector_t sector,
  {
         if (((bt->act_mask << BLK_TC_SHIFT) & what) == 0)
                 return 1;
-       if (sector < bt->start_lba || sector > bt->end_lba)
+       if (sector && (sector < bt->start_lba || sector > bt->end_lba))
                 return 1;
         if (bt->pid && pid != bt->pid)
                 return 1;
@@ -192,7 +196,7 @@ static void __blk_add_trace(struct blk_trace *bt, sector_t sector, int bytes,
         what |= MASK_TC_BIT(rw, DISCARD);
  
         pid = tsk->pid;
-       if (unlikely(act_log_check(bt, what, sector, pid)))
+       if (act_log_check(bt, what, sector, pid))
                 return;
         cpu = raw_smp_processor_id();
  
@@ -262,6 +266,7 @@ static void blk_trace_free(struct blk_trace *bt)
  {
         debugfs_remove(bt->msg_file);
         debugfs_remove(bt->dropped_file);
+       debugfs_remove(bt->dir);
         relay_close(bt->rchan);
         free_percpu(bt->sequence);
         free_percpu(bt->msg_data);
@@ -403,11 +408,29 @@ static struct rchan_callbacks blk_relay_callbacks = {
         .remove_buf_file        = blk_remove_buf_file_callback,
  };
  
+static void blk_trace_setup_lba(struct blk_trace *bt,
+                               struct block_device *bdev)
+{
+       struct hd_struct *part = NULL;
+
+       if (bdev)
+               part = bdev->bd_part;
+
+       if (part) {
+               bt->start_lba = part->start_sect;
+               bt->end_lba = part->start_sect + part->nr_sects;
+       } else {
+               bt->start_lba = 0;
+               bt->end_lba = -1ULL;
+       }
+}
+
  /*
   * Setup everything required to start tracing
   */
  int do_blk_trace_setup(struct request_queue *q, char *name, dev_t dev,
-                       struct blk_user_trace_setup *buts)
+                      struct block_device *bdev,
+                      struct blk_user_trace_setup *buts)
  {
         struct blk_trace *old_bt, *bt = NULL;
         struct dentry *dir = NULL;
@@ -480,10 +503,13 @@ int do_blk_trace_setup(struct request_queue *q, char *name, dev_t dev,
         if (!bt->act_mask)
                 bt->act_mask = (u16) -1;
  
-       bt->start_lba = buts->start_lba;
-       bt->end_lba = buts->end_lba;
-       if (!bt->end_lba)
-               bt->end_lba = -1ULL;
+       blk_trace_setup_lba(bt, bdev);
+
+       /* overwrite with user settings */
+       if (buts->start_lba)
+               bt->start_lba = buts->start_lba;
+       if (buts->end_lba)
+               bt->end_lba = buts->end_lba;
  
         bt->pid = buts->pid;
         bt->trace_state = Blktrace_setup;
@@ -505,6 +531,7 @@ err:
  }
  
  int blk_trace_setup(struct request_queue *q, char *name, dev_t dev,
+                   struct block_device *bdev,
                     char __user *arg)
  {
         struct blk_user_trace_setup buts;
@@ -514,7 +541,7 @@ int blk_trace_setup(struct request_queue *q, char *name, dev_t dev,
         if (ret)
                 return -EFAULT;
  
-       ret = do_blk_trace_setup(q, name, dev, &buts);
+       ret = do_blk_trace_setup(q, name, dev, bdev, &buts);
         if (ret)
                 return ret;
  
@@ -582,7 +609,7 @@ int blk_trace_ioctl(struct block_device *bdev, unsigned cmd, char __user *arg)
         switch (cmd) {
         case BLKTRACESETUP:
                 bdevname(bdev, b);
-               ret = blk_trace_setup(q, b, bdev->bd_dev, arg);
+               ret = blk_trace_setup(q, b, bdev->bd_dev, bdev, arg);
                 break;
         case BLKTRACESTART:
                 start = 1;
@@ -809,7 +836,6 @@ static void blk_add_trace_split(struct request_queue *q, struct bio *bio,
   * @bio:       the source bio
   * @dev:       target device
   * @from:      source sector
- * @to:                target sector
   *
   * Description:
   *     Device mapper or raid target sometimes need to split a bio because
@@ -817,7 +843,7 @@ static void blk_add_trace_split(struct request_queue *q, struct bio *bio,
   *
   **/
  static void blk_add_trace_remap(struct request_queue *q, struct bio *bio,
-                                      dev_t dev, sector_t from, sector_t to)
+                                      dev_t dev, sector_t from)
  {
         struct blk_trace *bt = q->blk_trace;
         struct blk_io_trace_remap r;
@@ -825,12 +851,13 @@ static void blk_add_trace_remap(struct request_queue *q, struct bio *bio,
         if (likely(!bt))
                 return;
  
-       r.device = cpu_to_be32(dev);
-       r.device_from = cpu_to_be32(bio->bi_bdev->bd_dev);
-       r.sector = cpu_to_be64(to);
+       r.device_from = cpu_to_be32(dev);
+       r.device_to   = cpu_to_be32(bio->bi_bdev->bd_dev);
+       r.sector_from = cpu_to_be64(from);
  
-       __blk_add_trace(bt, from, bio->bi_size, bio->bi_rw, BLK_TA_REMAP,
-                       !bio_flagged(bio, BIO_UPTODATE), sizeof(r), &r);
+       __blk_add_trace(bt, bio->bi_sector, bio->bi_size, bio->bi_rw,
+                       BLK_TA_REMAP, !bio_flagged(bio, BIO_UPTODATE),
+                       sizeof(r), &r);
  }
  
  /**
@@ -971,6 +998,16 @@ static inline const void *pdu_start(const struct trace_entry *ent)
         return te_blk_io_trace(ent) + 1;
  }
  
+static inline u32 t_action(const struct trace_entry *ent)
+{
+       return te_blk_io_trace(ent)->action;
+}
+
+static inline u32 t_bytes(const struct trace_entry *ent)
+{
+       return te_blk_io_trace(ent)->bytes;
+}
+
  static inline u32 t_sec(const struct trace_entry *ent)
  {
         return te_blk_io_trace(ent)->bytes >> 9;
@@ -996,11 +1033,11 @@ static void get_pdu_remap(const struct trace_entry *ent,
                           struct blk_io_trace_remap *r)
  {
         const struct blk_io_trace_remap *__r = pdu_start(ent);
-       __u64 sector = __r->sector;
+       __u64 sector_from = __r->sector_from;
  
-       r->device = be32_to_cpu(__r->device);
         r->device_from = be32_to_cpu(__r->device_from);
-       r->sector = be64_to_cpu(sector);
+       r->device_to   = be32_to_cpu(__r->device_to);
+       r->sector_from = be64_to_cpu(sector_from);
  }
  
  typedef int (blk_log_action_t) (struct trace_iterator *iter, const char *act);
@@ -1031,36 +1068,98 @@ static int blk_log_action(struct trace_iterator *iter, const char *act)
                                 MAJOR(t->device), MINOR(t->device), act, rwbs);
  }
  
+static int blk_log_dump_pdu(struct trace_seq *s, const struct trace_entry *ent)
+{
+       const unsigned char *pdu_buf;
+       int pdu_len;
+       int i, end, ret;
+
+       pdu_buf = pdu_start(ent);
+       pdu_len = te_blk_io_trace(ent)->pdu_len;
+
+       if (!pdu_len)
+               return 1;
+
+       /* find the last zero that needs to be printed */
+       for (end = pdu_len - 1; end >= 0; end--)
+               if (pdu_buf[end])
+                       break;
+       end++;
+
+       if (!trace_seq_putc(s, '('))
+               return 0;
+
+       for (i = 0; i < pdu_len; i++) {
+
+               ret = trace_seq_printf(s, "%s%02x",
+                                      i == 0 ? "" : " ", pdu_buf[i]);
+               if (!ret)
+                       return ret;
+
+               /*
+                * stop when the rest is just zeroes and indicate so
+                * with a ".." appended
+                */
+               if (i == end && end != pdu_len - 1)
+                       return trace_seq_puts(s, " ..) ");
+       }
+
+       return trace_seq_puts(s, ") ");
+}
+
  static int blk_log_generic(struct trace_seq *s, const struct trace_entry *ent)
  {
         char cmd[TASK_COMM_LEN];
  
         trace_find_cmdline(ent->pid, cmd);
  
-       if (t_sec(ent))
-               return trace_seq_printf(s, "%llu + %u [%s]\n",
-                                       t_sector(ent), t_sec(ent), cmd);
-       return trace_seq_printf(s, "[%s]\n", cmd);
+       if (t_action(ent) & BLK_TC_ACT(BLK_TC_PC)) {
+               int ret;
+
+               ret = trace_seq_printf(s, "%u ", t_bytes(ent));
+               if (!ret)
+                       return 0;
+               ret = blk_log_dump_pdu(s, ent);
+               if (!ret)
+                       return 0;
+               return trace_seq_printf(s, "[%s]\n", cmd);
+       } else {
+               if (t_sec(ent))
+                       return trace_seq_printf(s, "%llu + %u [%s]\n",
+                                               t_sector(ent), t_sec(ent), cmd);
+               return trace_seq_printf(s, "[%s]\n", cmd);
+       }
  }
  
  static int blk_log_with_error(struct trace_seq *s,
                               const struct trace_entry *ent)
  {
-       if (t_sec(ent))
-               return trace_seq_printf(s, "%llu + %u [%d]\n", t_sector(ent),
-                                       t_sec(ent), t_error(ent));
-       return trace_seq_printf(s, "%llu [%d]\n", t_sector(ent), t_error(ent));
+       if (t_action(ent) & BLK_TC_ACT(BLK_TC_PC)) {
+               int ret;
+
+               ret = blk_log_dump_pdu(s, ent);
+               if (ret)
+                       return trace_seq_printf(s, "[%d]\n", t_error(ent));
+               return 0;
+       } else {
+               if (t_sec(ent))
+                       return trace_seq_printf(s, "%llu + %u [%d]\n",
+                                               t_sector(ent),
+                                               t_sec(ent), t_error(ent));
+               return trace_seq_printf(s, "%llu [%d]\n",
+                                       t_sector(ent), t_error(ent));
+       }
  }
  
  static int blk_log_remap(struct trace_seq *s, const struct trace_entry *ent)
  {
-       struct blk_io_trace_remap r = { .device = 0, };
+       struct blk_io_trace_remap r = { .device_from = 0, };
  
         get_pdu_remap(ent, &r);
         return trace_seq_printf(s, "%llu + %u <- (%d,%d) %llu\n",
-                              t_sector(ent),
-                              t_sec(ent), MAJOR(r.device), MINOR(r.device),
-                              (unsigned long long)r.sector);
+                               t_sector(ent), t_sec(ent),
+                               MAJOR(r.device_from), MINOR(r.device_from),
+                               (unsigned long long)r.sector_from);
  }
  
  static int blk_log_plug(struct trace_seq *s, const struct trace_entry *ent)
@@ -1117,7 +1216,6 @@ static void blk_tracer_print_header(struct seq_file *m)
  static void blk_tracer_start(struct trace_array *tr)
  {
         blk_tracer_enabled = true;
-       trace_flags &= ~TRACE_ITER_CONTEXT_INFO;
  }
  
  static int blk_tracer_init(struct trace_array *tr)
@@ -1130,7 +1228,6 @@ static int blk_tracer_init(struct trace_array *tr)
  static void blk_tracer_stop(struct trace_array *tr)
  {
         blk_tracer_enabled = false;
-       trace_flags |= TRACE_ITER_CONTEXT_INFO;
  }
  
  static void blk_tracer_reset(struct trace_array *tr)
@@ -1182,7 +1279,7 @@ static enum print_line_t print_one_line(struct trace_iterator *iter,
         }
  
         if (unlikely(what == 0 || what >= ARRAY_SIZE(what2act)))
-               ret = trace_seq_printf(s, "Bad pc action %x\n", what);
+               ret = trace_seq_printf(s, "Unknown action %x\n", what);
         else {
                 ret = log_action(iter, what2act[what].act[long_act]);
                 if (ret)
@@ -1195,9 +1292,6 @@ out:
  static enum print_line_t blk_trace_event_print(struct trace_iterator *iter,
                                                int flags)
  {
-       if (!trace_print_context(iter))
-               return TRACE_TYPE_PARTIAL_LINE;
-
         return print_one_line(iter, false);
  }
  
@@ -1232,6 +1326,18 @@ static enum print_line_t blk_tracer_print_line(struct trace_iterator *iter)
         return print_one_line(iter, true);
  }
  
+static int blk_tracer_set_flag(u32 old_flags, u32 bit, int set)
+{
+       /* don't output context-info for blk_classic output */
+       if (bit == TRACE_BLK_OPT_CLASSIC) {
+               if (set)
+                       trace_flags &= ~TRACE_ITER_CONTEXT_INFO;
+               else
+                       trace_flags |= TRACE_ITER_CONTEXT_INFO;
+       }
+       return 0;
+}
+
  static struct tracer blk_tracer __read_mostly = {
         .name           = "blk",
         .init           = blk_tracer_init,
@@ -1241,6 +1347,7 @@ static struct tracer blk_tracer __read_mostly = {
         .print_header   = blk_tracer_print_header,
         .print_line     = blk_tracer_print_line,
         .flags          = &blk_tracer_flags,
+       .set_flag       = blk_tracer_set_flag,
  };
  
  static struct trace_event trace_blk_event = {
@@ -1285,7 +1392,8 @@ static int blk_trace_remove_queue(struct request_queue *q)
  /*
   * Setup everything required to start tracing
   */
-static int blk_trace_setup_queue(struct request_queue *q, dev_t dev)
+static int blk_trace_setup_queue(struct request_queue *q,
+                                struct block_device *bdev)
  {
         struct blk_trace *old_bt, *bt = NULL;
         int ret = -ENOMEM;
@@ -1298,9 +1406,10 @@ static int blk_trace_setup_queue(struct request_queue *q, dev_t dev)
         if (!bt->msg_data)
                 goto free_bt;
  
-       bt->dev = dev;
+       bt->dev = bdev->bd_dev;
         bt->act_mask = (u16)-1;
-       bt->end_lba = -1ULL;
+
+       blk_trace_setup_lba(bt, bdev);
  
         old_bt = xchg(&q->blk_trace, bt);
         if (old_bt != NULL) {
@@ -1517,7 +1626,7 @@ static ssize_t sysfs_blk_trace_attr_store(struct device *dev,
  
         if (attr == &dev_attr_enable) {
                 if (value)
-                       ret = blk_trace_setup_queue(q, bdev->bd_dev);
+                       ret = blk_trace_setup_queue(q, bdev);
                 else
                         ret = blk_trace_remove_queue(q);
                 goto out_unlock_bdev;
@@ -1525,7 +1634,7 @@ static ssize_t sysfs_blk_trace_attr_store(struct device *dev,
  
         ret = 0;
         if (q->blk_trace == NULL)
-               ret = blk_trace_setup_queue(q, bdev->bd_dev);
+               ret = blk_trace_setup_queue(q, bdev);
  
         if (ret == 0) {
                 if (attr == &dev_attr_act_mask)
@@ -1548,3 +1657,80 @@ out:
         return ret ? ret : count;
  }
  
+int blk_trace_init_sysfs(struct device *dev)
+{
+       return sysfs_create_group(&dev->kobj, &blk_trace_attr_group);
+}
+
+#endif /* CONFIG_BLK_DEV_IO_TRACE */
+
+#ifdef CONFIG_EVENT_TRACING
+
+void blk_dump_cmd(char *buf, struct request *rq)
+{
+       int i, end;
+       int len = rq->cmd_len;
+       unsigned char *cmd = rq->cmd;
+
+       if (!blk_pc_request(rq)) {
+               buf[0] = '\0';
+               return;
+       }
+
+       for (end = len - 1; end >= 0; end--)
+               if (cmd[end])
+                       break;
+       end++;
+
+       for (i = 0; i < len; i++) {
+               buf += sprintf(buf, "%s%02x", i == 0 ? "" : " ", cmd[i]);
+               if (i == end && end != len - 1) {
+                       sprintf(buf, " ..");
+                       break;
+               }
+       }
+}
+
+void blk_fill_rwbs(char *rwbs, u32 rw, int bytes)
+{
+       int i = 0;
+
+       if (rw & WRITE)
+               rwbs[i++] = 'W';
+       else if (rw & 1 << BIO_RW_DISCARD)
+               rwbs[i++] = 'D';
+       else if (bytes)
+               rwbs[i++] = 'R';
+       else
+               rwbs[i++] = 'N';
+
+       if (rw & 1 << BIO_RW_AHEAD)
+               rwbs[i++] = 'A';
+       if (rw & 1 << BIO_RW_BARRIER)
+               rwbs[i++] = 'B';
+       if (rw & 1 << BIO_RW_SYNCIO)
+               rwbs[i++] = 'S';
+       if (rw & 1 << BIO_RW_META)
+               rwbs[i++] = 'M';
+
+       rwbs[i] = '\0';
+}
+
+void blk_fill_rwbs_rq(char *rwbs, struct request *rq)
+{
+       int rw = rq->cmd_flags & 0x03;
+       int bytes;
+
+       if (blk_discard_rq(rq))
+               rw |= (1 << BIO_RW_DISCARD);
+
+       if (blk_pc_request(rq))
+               bytes = rq->data_len;
+       else
+               bytes = rq->hard_nr_sectors << 9;
+
+       blk_fill_rwbs(rwbs, rw, bytes);
+}
+
+#endif /* CONFIG_EVENT_TRACING */
+
diff --git a/kernel/trace/events.c b/kernel/trace/events.c

deleted file mode 100644 (file)

index 246f2aa..0000000
--- a/kernel/trace/events.c
+++ /dev/null
@@ -1,14 +0,0 @@
-/*
- * This is the place to register all trace points as events.
- */
-
-#include <linux/stringify.h>
-
-#include <trace/trace_events.h>
-
-#include "trace_output.h"
-
-#include "trace_events_stage_1.h"
-#include "trace_events_stage_2.h"
-#include "trace_events_stage_3.h"
-
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c

index f1ed080406c31f6bf61025e34e626f71b2865ad6..bb60732ade0cc7bca54fdbff32a9dfda16bacde9 100644 (file)
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -29,11 +29,13 @@
  #include <linux/list.h>
  #include <linux/hash.h>
  
-#include <trace/sched.h>
+#include <trace/events/sched.h>
  
  #include <asm/ftrace.h>
+#include <asm/setup.h>
  
-#include "trace.h"
+#include "trace_output.h"
+#include "trace_stat.h"
  
  #define FTRACE_WARN_ON(cond)                   \
         do {                                    \
@@ -68,7 +70,7 @@ static DEFINE_MUTEX(ftrace_lock);
  
  static struct ftrace_ops ftrace_list_end __read_mostly =
  {
-       .func = ftrace_stub,
+       .func           = ftrace_stub,
  };
  
  static struct ftrace_ops *ftrace_list __read_mostly = &ftrace_list_end;
@@ -240,6 +242,580 @@ static void ftrace_update_pid_func(void)
  #endif
  }
  
+#ifdef CONFIG_FUNCTION_PROFILER
+struct ftrace_profile {
+       struct hlist_node               node;
+       unsigned long                   ip;
+       unsigned long                   counter;
+#ifdef CONFIG_FUNCTION_GRAPH_TRACER
+       unsigned long long              time;
+#endif
+};
+
+struct ftrace_profile_page {
+       struct ftrace_profile_page      *next;
+       unsigned long                   index;
+       struct ftrace_profile           records[];
+};
+
+struct ftrace_profile_stat {
+       atomic_t                        disabled;
+       struct hlist_head               *hash;
+       struct ftrace_profile_page      *pages;
+       struct ftrace_profile_page      *start;
+       struct tracer_stat              stat;
+};
+
+#define PROFILE_RECORDS_SIZE                                           \
+       (PAGE_SIZE - offsetof(struct ftrace_profile_page, records))
+
+#define PROFILES_PER_PAGE                                      \
+       (PROFILE_RECORDS_SIZE / sizeof(struct ftrace_profile))
+
+static int ftrace_profile_bits __read_mostly;
+static int ftrace_profile_enabled __read_mostly;
+
+/* ftrace_profile_lock - synchronize the enable and disable of the profiler */
+static DEFINE_MUTEX(ftrace_profile_lock);
+
+static DEFINE_PER_CPU(struct ftrace_profile_stat, ftrace_profile_stats);
+
+#define FTRACE_PROFILE_HASH_SIZE 1024 /* must be power of 2 */
+
+static void *
+function_stat_next(void *v, int idx)
+{
+       struct ftrace_profile *rec = v;
+       struct ftrace_profile_page *pg;
+
+       pg = (struct ftrace_profile_page *)((unsigned long)rec & PAGE_MASK);
+
+ again:
+       rec++;
+       if ((void *)rec >= (void *)&pg->records[pg->index]) {
+               pg = pg->next;
+               if (!pg)
+                       return NULL;
+               rec = &pg->records[0];
+               if (!rec->counter)
+                       goto again;
+       }
+
+       return rec;
+}
+
+static void *function_stat_start(struct tracer_stat *trace)
+{
+       struct ftrace_profile_stat *stat =
+               container_of(trace, struct ftrace_profile_stat, stat);
+
+       if (!stat || !stat->start)
+               return NULL;
+
+       return function_stat_next(&stat->start->records[0], 0);
+}
+
+#ifdef CONFIG_FUNCTION_GRAPH_TRACER
+/* function graph compares on total time */
+static int function_stat_cmp(void *p1, void *p2)
+{
+       struct ftrace_profile *a = p1;
+       struct ftrace_profile *b = p2;
+
+       if (a->time < b->time)
+               return -1;
+       if (a->time > b->time)
+               return 1;
+       else
+               return 0;
+}
+#else
+/* not function graph compares against hits */
+static int function_stat_cmp(void *p1, void *p2)
+{
+       struct ftrace_profile *a = p1;
+       struct ftrace_profile *b = p2;
+
+       if (a->counter < b->counter)
+               return -1;
+       if (a->counter > b->counter)
+               return 1;
+       else
+               return 0;
+}
+#endif
+
+static int function_stat_headers(struct seq_file *m)
+{
+#ifdef CONFIG_FUNCTION_GRAPH_TRACER
+       seq_printf(m, "  Function                               "
+                  "Hit    Time            Avg\n"
+                     "  --------                               "
+                  "---    ----            ---\n");
+#else
+       seq_printf(m, "  Function                               Hit\n"
+                     "  --------                               ---\n");
+#endif
+       return 0;
+}
+
+static int function_stat_show(struct seq_file *m, void *v)
+{
+       struct ftrace_profile *rec = v;
+       char str[KSYM_SYMBOL_LEN];
+#ifdef CONFIG_FUNCTION_GRAPH_TRACER
+       static DEFINE_MUTEX(mutex);
+       static struct trace_seq s;
+       unsigned long long avg;
+#endif
+
+       kallsyms_lookup(rec->ip, NULL, NULL, NULL, str);
+       seq_printf(m, "  %-30.30s  %10lu", str, rec->counter);
+
+#ifdef CONFIG_FUNCTION_GRAPH_TRACER
+       seq_printf(m, "    ");
+       avg = rec->time;
+       do_div(avg, rec->counter);
+
+       mutex_lock(&mutex);
+       trace_seq_init(&s);
+       trace_print_graph_duration(rec->time, &s);
+       trace_seq_puts(&s, "    ");
+       trace_print_graph_duration(avg, &s);
+       trace_print_seq(m, &s);
+       mutex_unlock(&mutex);
+#endif
+       seq_putc(m, '\n');
+
+       return 0;
+}
+
+static void ftrace_profile_reset(struct ftrace_profile_stat *stat)
+{
+       struct ftrace_profile_page *pg;
+
+       pg = stat->pages = stat->start;
+
+       while (pg) {
+               memset(pg->records, 0, PROFILE_RECORDS_SIZE);
+               pg->index = 0;
+               pg = pg->next;
+       }
+
+       memset(stat->hash, 0,
+              FTRACE_PROFILE_HASH_SIZE * sizeof(struct hlist_head));
+}
+
+int ftrace_profile_pages_init(struct ftrace_profile_stat *stat)
+{
+       struct ftrace_profile_page *pg;
+       int functions;
+       int pages;
+       int i;
+
+       /* If we already allocated, do nothing */
+       if (stat->pages)
+               return 0;
+
+       stat->pages = (void *)get_zeroed_page(GFP_KERNEL);
+       if (!stat->pages)
+               return -ENOMEM;
+
+#ifdef CONFIG_DYNAMIC_FTRACE
+       functions = ftrace_update_tot_cnt;
+#else
+       /*
+        * We do not know the number of functions that exist because
+        * dynamic tracing is what counts them. With past experience
+        * we have around 20K functions. That should be more than enough.
+        * It is highly unlikely we will execute every function in
+        * the kernel.
+        */
+       functions = 20000;
+#endif
+
+       pg = stat->start = stat->pages;
+
+       pages = DIV_ROUND_UP(functions, PROFILES_PER_PAGE);
+
+       for (i = 0; i < pages; i++) {
+               pg->next = (void *)get_zeroed_page(GFP_KERNEL);
+               if (!pg->next)
+                       goto out_free;
+               pg = pg->next;
+       }
+
+       return 0;
+
+ out_free:
+       pg = stat->start;
+       while (pg) {
+               unsigned long tmp = (unsigned long)pg;
+
+               pg = pg->next;
+               free_page(tmp);
+       }
+
+       free_page((unsigned long)stat->pages);
+       stat->pages = NULL;
+       stat->start = NULL;
+
+       return -ENOMEM;
+}
+
+static int ftrace_profile_init_cpu(int cpu)
+{
+       struct ftrace_profile_stat *stat;
+       int size;
+
+       stat = &per_cpu(ftrace_profile_stats, cpu);
+
+       if (stat->hash) {
+               /* If the profile is already created, simply reset it */
+               ftrace_profile_reset(stat);
+               return 0;
+       }
+
+       /*
+        * We are profiling all functions, but usually only a few thousand
+        * functions are hit. We'll make a hash of 1024 items.
+        */
+       size = FTRACE_PROFILE_HASH_SIZE;
+
+       stat->hash = kzalloc(sizeof(struct hlist_head) * size, GFP_KERNEL);
+
+       if (!stat->hash)
+               return -ENOMEM;
+
+       if (!ftrace_profile_bits) {
+               size--;
+
+               for (; size; size >>= 1)
+                       ftrace_profile_bits++;
+       }
+
+       /* Preallocate the function profiling pages */
+       if (ftrace_profile_pages_init(stat) < 0) {
+               kfree(stat->hash);
+               stat->hash = NULL;
+               return -ENOMEM;
+       }
+
+       return 0;
+}
+
+static int ftrace_profile_init(void)
+{
+       int cpu;
+       int ret = 0;
+
+       for_each_online_cpu(cpu) {
+               ret = ftrace_profile_init_cpu(cpu);
+               if (ret)
+                       break;
+       }
+
+       return ret;
+}
+
+/* interrupts must be disabled */
+static struct ftrace_profile *
+ftrace_find_profiled_func(struct ftrace_profile_stat *stat, unsigned long ip)
+{
+       struct ftrace_profile *rec;
+       struct hlist_head *hhd;
+       struct hlist_node *n;
+       unsigned long key;
+
+       key = hash_long(ip, ftrace_profile_bits);
+       hhd = &stat->hash[key];
+
+       if (hlist_empty(hhd))
+               return NULL;
+
+       hlist_for_each_entry_rcu(rec, n, hhd, node) {
+               if (rec->ip == ip)
+                       return rec;
+       }
+
+       return NULL;
+}
+
+static void ftrace_add_profile(struct ftrace_profile_stat *stat,
+                              struct ftrace_profile *rec)
+{
+       unsigned long key;
+
+       key = hash_long(rec->ip, ftrace_profile_bits);
+       hlist_add_head_rcu(&rec->node, &stat->hash[key]);
+}
+
+/*
+ * The memory is already allocated, this simply finds a new record to use.
+ */
+static struct ftrace_profile *
+ftrace_profile_alloc(struct ftrace_profile_stat *stat, unsigned long ip)
+{
+       struct ftrace_profile *rec = NULL;
+
+       /* prevent recursion (from NMIs) */
+       if (atomic_inc_return(&stat->disabled) != 1)
+               goto out;
+
+       /*
+        * Try to find the function again since an NMI
+        * could have added it
+        */
+       rec = ftrace_find_profiled_func(stat, ip);
+       if (rec)
+               goto out;
+
+       if (stat->pages->index == PROFILES_PER_PAGE) {
+               if (!stat->pages->next)
+                       goto out;
+               stat->pages = stat->pages->next;
+       }
+
+       rec = &stat->pages->records[stat->pages->index++];
+       rec->ip = ip;
+       ftrace_add_profile(stat, rec);
+
+ out:
+       atomic_dec(&stat->disabled);
+
+       return rec;
+}
+
+static void
+function_profile_call(unsigned long ip, unsigned long parent_ip)
+{
+       struct ftrace_profile_stat *stat;
+       struct ftrace_profile *rec;
+       unsigned long flags;
+
+       if (!ftrace_profile_enabled)
+               return;
+
+       local_irq_save(flags);
+
+       stat = &__get_cpu_var(ftrace_profile_stats);
+       if (!stat->hash || !ftrace_profile_enabled)
+               goto out;
+
+       rec = ftrace_find_profiled_func(stat, ip);
+       if (!rec) {
+               rec = ftrace_profile_alloc(stat, ip);
+               if (!rec)
+                       goto out;
+       }
+
+       rec->counter++;
+ out:
+       local_irq_restore(flags);
+}
+
+#ifdef CONFIG_FUNCTION_GRAPH_TRACER
+static int profile_graph_entry(struct ftrace_graph_ent *trace)
+{
+       function_profile_call(trace->func, 0);
+       return 1;
+}
+
+static void profile_graph_return(struct ftrace_graph_ret *trace)
+{
+       struct ftrace_profile_stat *stat;
+       unsigned long long calltime;
+       struct ftrace_profile *rec;
+       unsigned long flags;
+
+       local_irq_save(flags);
+       stat = &__get_cpu_var(ftrace_profile_stats);
+       if (!stat->hash || !ftrace_profile_enabled)
+               goto out;
+
+       calltime = trace->rettime - trace->calltime;
+
+       if (!(trace_flags & TRACE_ITER_GRAPH_TIME)) {
+               int index;
+
+               index = trace->depth;
+
+               /* Append this call time to the parent time to subtract */
+               if (index)
+                       current->ret_stack[index - 1].subtime += calltime;
+
+               if (current->ret_stack[index].subtime < calltime)
+                       calltime -= current->ret_stack[index].subtime;
+               else
+                       calltime = 0;
+       }
+
+       rec = ftrace_find_profiled_func(stat, trace->func);
+       if (rec)
+               rec->time += calltime;
+
+ out:
+       local_irq_restore(flags);
+}
+
+static int register_ftrace_profiler(void)
+{
+       return register_ftrace_graph(&profile_graph_return,
+                                    &profile_graph_entry);
+}
+
+static void unregister_ftrace_profiler(void)
+{
+       unregister_ftrace_graph();
+}
+#else
+static struct ftrace_ops ftrace_profile_ops __read_mostly =
+{
+       .func           = function_profile_call,
+};
+
+static int register_ftrace_profiler(void)
+{
+       return register_ftrace_function(&ftrace_profile_ops);
+}
+
+static void unregister_ftrace_profiler(void)
+{
+       unregister_ftrace_function(&ftrace_profile_ops);
+}
+#endif /* CONFIG_FUNCTION_GRAPH_TRACER */
+
+static ssize_t
+ftrace_profile_write(struct file *filp, const char __user *ubuf,
+                    size_t cnt, loff_t *ppos)
+{
+       unsigned long val;
+       char buf[64];           /* big enough to hold a number */
+       int ret;
+
+       if (cnt >= sizeof(buf))
+               return -EINVAL;
+
+       if (copy_from_user(&buf, ubuf, cnt))
+               return -EFAULT;
+
+       buf[cnt] = 0;
+
+       ret = strict_strtoul(buf, 10, &val);
+       if (ret < 0)
+               return ret;
+
+       val = !!val;
+
+       mutex_lock(&ftrace_profile_lock);
+       if (ftrace_profile_enabled ^ val) {
+               if (val) {
+                       ret = ftrace_profile_init();
+                       if (ret < 0) {
+                               cnt = ret;
+                               goto out;
+                       }
+
+                       ret = register_ftrace_profiler();
+                       if (ret < 0) {
+                               cnt = ret;
+                               goto out;
+                       }
+                       ftrace_profile_enabled = 1;
+               } else {
+                       ftrace_profile_enabled = 0;
+                       /*
+                        * unregister_ftrace_profiler calls stop_machine
+                        * so this acts like an synchronize_sched.
+                        */
+                       unregister_ftrace_profiler();
+               }
+       }
+ out:
+       mutex_unlock(&ftrace_profile_lock);
+
+       filp->f_pos += cnt;
+
+       return cnt;
+}
+
+static ssize_t
+ftrace_profile_read(struct file *filp, char __user *ubuf,
+                    size_t cnt, loff_t *ppos)
+{
+       char buf[64];           /* big enough to hold a number */
+       int r;
+
+       r = sprintf(buf, "%u\n", ftrace_profile_enabled);
+       return simple_read_from_buffer(ubuf, cnt, ppos, buf, r);
+}
+
+static const struct file_operations ftrace_profile_fops = {
+       .open           = tracing_open_generic,
+       .read           = ftrace_profile_read,
+       .write          = ftrace_profile_write,
+};
+
+/* used to initialize the real stat files */
+static struct tracer_stat function_stats __initdata = {
+       .name           = "functions",
+       .stat_start     = function_stat_start,
+       .stat_next      = function_stat_next,
+       .stat_cmp       = function_stat_cmp,
+       .stat_headers   = function_stat_headers,
+       .stat_show      = function_stat_show
+};
+
+static void ftrace_profile_debugfs(struct dentry *d_tracer)
+{
+       struct ftrace_profile_stat *stat;
+       struct dentry *entry;
+       char *name;
+       int ret;
+       int cpu;
+
+       for_each_possible_cpu(cpu) {
+               stat = &per_cpu(ftrace_profile_stats, cpu);
+
+               /* allocate enough for function name + cpu number */
+               name = kmalloc(32, GFP_KERNEL);
+               if (!name) {
+                       /*
+                        * The files created are permanent, if something happens
+                        * we still do not free memory.
+                        */
+                       kfree(stat);
+                       WARN(1,
+                            "Could not allocate stat file for cpu %d\n",
+                            cpu);
+                       return;
+               }
+               stat->stat = function_stats;
+               snprintf(name, 32, "function%d", cpu);
+               stat->stat.name = name;
+               ret = register_stat_tracer(&stat->stat);
+               if (ret) {
+                       WARN(1,
+                            "Could not register function stat for cpu %d\n",
+                            cpu);
+                       kfree(name);
+                       return;
+               }
+       }
+
+       entry = debugfs_create_file("function_profile_enabled", 0644,
+                                   d_tracer, NULL, &ftrace_profile_fops);
+       if (!entry)
+               pr_warning("Could not create debugfs "
+                          "'function_profile_enabled' entry\n");
+}
+
+#else /* CONFIG_FUNCTION_PROFILER */
+static void ftrace_profile_debugfs(struct dentry *d_tracer)
+{
+}
+#endif /* CONFIG_FUNCTION_PROFILER */
+
  /* set when tracing only a pid */
  struct pid *ftrace_pid_trace;
  static struct pid * const ftrace_swapper_pid = &init_struct_pid;
@@ -261,7 +837,6 @@ struct ftrace_func_probe {
         struct rcu_head         rcu;
  };
  
-
  enum {
         FTRACE_ENABLE_CALLS             = (1 << 0),
         FTRACE_DISABLE_CALLS            = (1 << 1),
@@ -346,30 +921,6 @@ static void ftrace_free_rec(struct dyn_ftrace *rec)
         rec->flags |= FTRACE_FL_FREE;
  }
  
-void ftrace_release(void *start, unsigned long size)
-{
-       struct dyn_ftrace *rec;
-       struct ftrace_page *pg;
-       unsigned long s = (unsigned long)start;
-       unsigned long e = s + size;
-
-       if (ftrace_disabled || !start)
-               return;
-
-       mutex_lock(&ftrace_lock);
-       do_for_each_ftrace_rec(pg, rec) {
-               if ((rec->ip >= s) && (rec->ip < e)) {
-                       /*
-                        * rec->ip is changed in ftrace_free_rec()
-                        * It should not between s and e if record was freed.
-                        */
-                       FTRACE_WARN_ON(rec->flags & FTRACE_FL_FREE);
-                       ftrace_free_rec(rec);
-               }
-       } while_for_each_ftrace_rec();
-       mutex_unlock(&ftrace_lock);
-}
-
  static struct dyn_ftrace *ftrace_alloc_dyn_node(unsigned long ip)
  {
         struct dyn_ftrace *rec;
@@ -1408,7 +1959,7 @@ function_trace_probe_call(unsigned long ip, unsigned long parent_ip)
  
  static struct ftrace_ops trace_probe_ops __read_mostly =
  {
-       .func = function_trace_probe_call,
+       .func           = function_trace_probe_call,
  };
  
  static int ftrace_probe_registered;
@@ -1823,6 +2374,45 @@ void ftrace_set_notrace(unsigned char *buf, int len, int reset)
         ftrace_set_regex(buf, len, reset, 0);
  }
  
+/*
+ * command line interface to allow users to set filters on boot up.
+ */
+#define FTRACE_FILTER_SIZE             COMMAND_LINE_SIZE
+static char ftrace_notrace_buf[FTRACE_FILTER_SIZE] __initdata;
+static char ftrace_filter_buf[FTRACE_FILTER_SIZE] __initdata;
+
+static int __init set_ftrace_notrace(char *str)
+{
+       strncpy(ftrace_notrace_buf, str, FTRACE_FILTER_SIZE);
+       return 1;
+}
+__setup("ftrace_notrace=", set_ftrace_notrace);
+
+static int __init set_ftrace_filter(char *str)
+{
+       strncpy(ftrace_filter_buf, str, FTRACE_FILTER_SIZE);
+       return 1;
+}
+__setup("ftrace_filter=", set_ftrace_filter);
+
+static void __init set_ftrace_early_filter(char *buf, int enable)
+{
+       char *func;
+
+       while (buf) {
+               func = strsep(&buf, ",");
+               ftrace_set_regex(func, strlen(func), 0, enable);
+       }
+}
+
+static void __init set_ftrace_early_filters(void)
+{
+       if (ftrace_filter_buf[0])
+               set_ftrace_early_filter(ftrace_filter_buf, 1);
+       if (ftrace_notrace_buf[0])
+               set_ftrace_early_filter(ftrace_notrace_buf, 0);
+}
+
  static int
  ftrace_regex_release(struct inode *inode, struct file *file, int enable)
  {
@@ -2128,38 +2718,23 @@ static const struct file_operations ftrace_graph_fops = {
  
  static __init int ftrace_init_dyn_debugfs(struct dentry *d_tracer)
  {
-       struct dentry *entry;
  
-       entry = debugfs_create_file("available_filter_functions", 0444,
-                                   d_tracer, NULL, &ftrace_avail_fops);
-       if (!entry)
-               pr_warning("Could not create debugfs "
-                          "'available_filter_functions' entry\n");
+       trace_create_file("available_filter_functions", 0444,
+                       d_tracer, NULL, &ftrace_avail_fops);
  
-       entry = debugfs_create_file("failures", 0444,
-                                   d_tracer, NULL, &ftrace_failures_fops);
-       if (!entry)
-               pr_warning("Could not create debugfs 'failures' entry\n");
+       trace_create_file("failures", 0444,
+                       d_tracer, NULL, &ftrace_failures_fops);
  
-       entry = debugfs_create_file("set_ftrace_filter", 0644, d_tracer,
-                                   NULL, &ftrace_filter_fops);
-       if (!entry)
-               pr_warning("Could not create debugfs "
-                          "'set_ftrace_filter' entry\n");
+       trace_create_file("set_ftrace_filter", 0644, d_tracer,
+                       NULL, &ftrace_filter_fops);
  
-       entry = debugfs_create_file("set_ftrace_notrace", 0644, d_tracer,
+       trace_create_file("set_ftrace_notrace", 0644, d_tracer,
                                     NULL, &ftrace_notrace_fops);
-       if (!entry)
-               pr_warning("Could not create debugfs "
-                          "'set_ftrace_notrace' entry\n");
  
  #ifdef CONFIG_FUNCTION_GRAPH_TRACER
-       entry = debugfs_create_file("set_graph_function", 0444, d_tracer,
+       trace_create_file("set_graph_function", 0444, d_tracer,
                                     NULL,
                                     &ftrace_graph_fops);
-       if (!entry)
-               pr_warning("Could not create debugfs "
-                          "'set_graph_function' entry\n");
  #endif /* CONFIG_FUNCTION_GRAPH_TRACER */
  
         return 0;
@@ -2197,14 +2772,72 @@ static int ftrace_convert_nops(struct module *mod,
         return 0;
  }
  
-void ftrace_init_module(struct module *mod,
-                       unsigned long *start, unsigned long *end)
+#ifdef CONFIG_MODULES
+void ftrace_release(void *start, void *end)
+{
+       struct dyn_ftrace *rec;
+       struct ftrace_page *pg;
+       unsigned long s = (unsigned long)start;
+       unsigned long e = (unsigned long)end;
+
+       if (ftrace_disabled || !start || start == end)
+               return;
+
+       mutex_lock(&ftrace_lock);
+       do_for_each_ftrace_rec(pg, rec) {
+               if ((rec->ip >= s) && (rec->ip < e)) {
+                       /*
+                        * rec->ip is changed in ftrace_free_rec()
+                        * It should not between s and e if record was freed.
+                        */
+                       FTRACE_WARN_ON(rec->flags & FTRACE_FL_FREE);
+                       ftrace_free_rec(rec);
+               }
+       } while_for_each_ftrace_rec();
+       mutex_unlock(&ftrace_lock);
+}
+
+static void ftrace_init_module(struct module *mod,
+                              unsigned long *start, unsigned long *end)
  {
         if (ftrace_disabled || start == end)
                 return;
         ftrace_convert_nops(mod, start, end);
  }
  
+static int ftrace_module_notify(struct notifier_block *self,
+                               unsigned long val, void *data)
+{
+       struct module *mod = data;
+
+       switch (val) {
+       case MODULE_STATE_COMING:
+               ftrace_init_module(mod, mod->ftrace_callsites,
+                                  mod->ftrace_callsites +
+                                  mod->num_ftrace_callsites);
+               break;
+       case MODULE_STATE_GOING:
+               ftrace_release(mod->ftrace_callsites,
+                              mod->ftrace_callsites +
+                              mod->num_ftrace_callsites);
+               break;
+       }
+
+       return 0;
+}
+#else
+static int ftrace_module_notify(struct notifier_block *self,
+                               unsigned long val, void *data)
+{
+       return 0;
+}
+#endif /* CONFIG_MODULES */
+
+struct notifier_block ftrace_module_nb = {
+       .notifier_call = ftrace_module_notify,
+       .priority = 0,
+};
+
  extern unsigned long __start_mcount_loc[];
  extern unsigned long __stop_mcount_loc[];
  
@@ -2236,6 +2869,12 @@ void __init ftrace_init(void)
                                   __start_mcount_loc,
                                   __stop_mcount_loc);
  
+       ret = register_module_notifier(&ftrace_module_nb);
+       if (ret)
+               pr_warning("Failed to register trace ftrace module notifier\n");
+
+       set_ftrace_early_filters();
+
         return;
   failed:
         ftrace_disabled = 1;
@@ -2417,7 +3056,6 @@ static const struct file_operations ftrace_pid_fops = {
  static __init int ftrace_init_debugfs(void)
  {
         struct dentry *d_tracer;
-       struct dentry *entry;
  
         d_tracer = tracing_init_dentry();
         if (!d_tracer)
@@ -2425,11 +3063,11 @@ static __init int ftrace_init_debugfs(void)
  
         ftrace_init_dyn_debugfs(d_tracer);
  
-       entry = debugfs_create_file("set_ftrace_pid", 0644, d_tracer,
-                                   NULL, &ftrace_pid_fops);
-       if (!entry)
-               pr_warning("Could not create debugfs "
-                          "'set_ftrace_pid' entry\n");
+       trace_create_file("set_ftrace_pid", 0644, d_tracer,
+                           NULL, &ftrace_pid_fops);
+
+       ftrace_profile_debugfs(d_tracer);
+
         return 0;
  }
  fs_initcall(ftrace_init_debugfs);
@@ -2538,7 +3176,7 @@ ftrace_enable_sysctl(struct ctl_table *table, int write,
  
  #ifdef CONFIG_FUNCTION_GRAPH_TRACER
  
-static atomic_t ftrace_graph_active;
+static int ftrace_graph_active;
  static struct notifier_block ftrace_suspend_notifier;
  
  int ftrace_graph_entry_stub(struct ftrace_graph_ent *trace)
@@ -2580,12 +3218,12 @@ static int alloc_retstack_tasklist(struct ftrace_ret_stack **ret_stack_list)
                 }
  
                 if (t->ret_stack == NULL) {
-                       t->curr_ret_stack = -1;
-                       /* Make sure IRQs see the -1 first: */
-                       barrier();
-                       t->ret_stack = ret_stack_list[start++];
                         atomic_set(&t->tracing_graph_pause, 0);
                         atomic_set(&t->trace_overrun, 0);
+                       t->curr_ret_stack = -1;
+                       /* Make sure the tasks see the -1 first: */
+                       smp_wmb();
+                       t->ret_stack = ret_stack_list[start++];
                 }
         } while_each_thread(g, t);
  
@@ -2643,8 +3281,10 @@ static int start_graph_tracing(void)
                 return -ENOMEM;
  
         /* The cpu_boot init_task->ret_stack will never be freed */
-       for_each_online_cpu(cpu)
-               ftrace_graph_init_task(idle_task(cpu));
+       for_each_online_cpu(cpu) {
+               if (!idle_task(cpu)->ret_stack)
+                       ftrace_graph_init_task(idle_task(cpu));
+       }
  
         do {
                 ret = alloc_retstack_tasklist(ret_stack_list);
@@ -2690,7 +3330,7 @@ int register_ftrace_graph(trace_func_graph_ret_t retfunc,
         mutex_lock(&ftrace_lock);
  
         /* we currently allow only one tracer registered at a time */
-       if (atomic_read(&ftrace_graph_active)) {
+       if (ftrace_graph_active) {
                 ret = -EBUSY;
                 goto out;
         }
@@ -2698,10 +3338,10 @@ int register_ftrace_graph(trace_func_graph_ret_t retfunc,
         ftrace_suspend_notifier.notifier_call = ftrace_suspend_notifier_call;
         register_pm_notifier(&ftrace_suspend_notifier);
  
-       atomic_inc(&ftrace_graph_active);
+       ftrace_graph_active++;
         ret = start_graph_tracing();
         if (ret) {
-               atomic_dec(&ftrace_graph_active);
+               ftrace_graph_active--;
                 goto out;
         }
  
@@ -2719,10 +3359,10 @@ void unregister_ftrace_graph(void)
  {
         mutex_lock(&ftrace_lock);
  
-       if (!unlikely(atomic_read(&ftrace_graph_active)))
+       if (unlikely(!ftrace_graph_active))
                 goto out;
  
-       atomic_dec(&ftrace_graph_active);
+       ftrace_graph_active--;
         unregister_trace_sched_switch(ftrace_graph_probe_sched_switch);
         ftrace_graph_return = (trace_func_graph_ret_t)ftrace_stub;
         ftrace_graph_entry = ftrace_graph_entry_stub;
@@ -2736,18 +3376,25 @@ void unregister_ftrace_graph(void)
  /* Allocate a return stack for newly created task */
  void ftrace_graph_init_task(struct task_struct *t)
  {
-       if (atomic_read(&ftrace_graph_active)) {
-               t->ret_stack = kmalloc(FTRACE_RETFUNC_DEPTH
+       /* Make sure we do not use the parent ret_stack */
+       t->ret_stack = NULL;
+
+       if (ftrace_graph_active) {
+               struct ftrace_ret_stack *ret_stack;
+
+               ret_stack = kmalloc(FTRACE_RETFUNC_DEPTH
                                 * sizeof(struct ftrace_ret_stack),
                                 GFP_KERNEL);
-               if (!t->ret_stack)
+               if (!ret_stack)
                         return;
                 t->curr_ret_stack = -1;
                 atomic_set(&t->tracing_graph_pause, 0);
                 atomic_set(&t->trace_overrun, 0);
                 t->ftrace_timestamp = 0;
-       } else
-               t->ret_stack = NULL;
+               /* make curr_ret_stack visable before we add the ret_stack */
+               smp_wmb();
+               t->ret_stack = ret_stack;
+       }
  }
  
  void ftrace_graph_exit_task(struct task_struct *t)
diff --git a/kernel/trace/kmemtrace.c b/kernel/trace/kmemtrace.c

index 5011f4d91e375c2895cdc074299caafdf058ca38..86cdf671d7e288c0998a4b14e2b2f2f2c709fdb2 100644 (file)
--- a/kernel/trace/kmemtrace.c
+++ b/kernel/trace/kmemtrace.c
@@ -12,7 +12,7 @@
  #include <linux/dcache.h>
  #include <linux/fs.h>
  
-#include <trace/kmemtrace.h>
+#include <linux/kmemtrace.h>
  
  #include "trace_output.h"
  #include "trace.h"
@@ -42,6 +42,7 @@ static inline void kmemtrace_alloc(enum kmemtrace_type_id type_id,
                                    gfp_t gfp_flags,
                                    int node)
  {
+       struct ftrace_event_call *call = &event_kmem_alloc;
         struct trace_array *tr = kmemtrace_array;
         struct kmemtrace_alloc_entry *entry;
         struct ring_buffer_event *event;
@@ -62,7 +63,8 @@ static inline void kmemtrace_alloc(enum kmemtrace_type_id type_id,
         entry->gfp_flags        = gfp_flags;
         entry->node             = node;
  
-       ring_buffer_unlock_commit(tr->buffer, event);
+       if (!filter_check_discard(call, entry, tr->buffer, event))
+               ring_buffer_unlock_commit(tr->buffer, event);
  
         trace_wake_up();
  }
@@ -71,6 +73,7 @@ static inline void kmemtrace_free(enum kmemtrace_type_id type_id,
                                   unsigned long call_site,
                                   const void *ptr)
  {
+       struct ftrace_event_call *call = &event_kmem_free;
         struct trace_array *tr = kmemtrace_array;
         struct kmemtrace_free_entry *entry;
         struct ring_buffer_event *event;
@@ -86,7 +89,8 @@ static inline void kmemtrace_free(enum kmemtrace_type_id type_id,
         entry->call_site        = call_site;
         entry->ptr              = ptr;
  
-       ring_buffer_unlock_commit(tr->buffer, event);
+       if (!filter_check_discard(call, entry, tr->buffer, event))
+               ring_buffer_unlock_commit(tr->buffer, event);
  
         trace_wake_up();
  }
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c

index 960cbf44c844a17dd156b25927d6e98c89840600..2e642b2b7253d11ddce40c657a471f007561be65 100644 (file)
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -21,6 +21,28 @@
  
  #include "trace.h"
  
+/*
+ * The ring buffer header is special. We must manually up keep it.
+ */
+int ring_buffer_print_entry_header(struct trace_seq *s)
+{
+       int ret;
+
+       ret = trace_seq_printf(s, "# compressed entry header\n");
+       ret = trace_seq_printf(s, "\ttype_len    :    5 bits\n");
+       ret = trace_seq_printf(s, "\ttime_delta  :   27 bits\n");
+       ret = trace_seq_printf(s, "\tarray       :   32 bits\n");
+       ret = trace_seq_printf(s, "\n");
+       ret = trace_seq_printf(s, "\tpadding     : type == %d\n",
+                              RINGBUF_TYPE_PADDING);
+       ret = trace_seq_printf(s, "\ttime_extend : type == %d\n",
+                              RINGBUF_TYPE_TIME_EXTEND);
+       ret = trace_seq_printf(s, "\tdata max type_len  == %d\n",
+                              RINGBUF_TYPE_DATA_TYPE_LEN_MAX);
+
+       return ret;
+}
+
  /*
   * The ring buffer is made up of a list of pages. A separate list of pages is
   * allocated for each CPU. A writer may only write to a buffer that is
@@ -182,7 +204,10 @@ EXPORT_SYMBOL_GPL(tracing_is_on);
  
  #define RB_EVNT_HDR_SIZE (offsetof(struct ring_buffer_event, array))
  #define RB_ALIGNMENT           4U
-#define RB_MAX_SMALL_DATA      28
+#define RB_MAX_SMALL_DATA      (RB_ALIGNMENT * RINGBUF_TYPE_DATA_TYPE_LEN_MAX)
+
+/* define RINGBUF_TYPE_DATA for 'case RINGBUF_TYPE_DATA:' */
+#define RINGBUF_TYPE_DATA 0 ... RINGBUF_TYPE_DATA_TYPE_LEN_MAX
  
  enum {
         RB_LEN_TIME_EXTEND = 8,
@@ -191,48 +216,28 @@ enum {
  
  static inline int rb_null_event(struct ring_buffer_event *event)
  {
-       return event->type == RINGBUF_TYPE_PADDING && event->time_delta == 0;
+       return event->type_len == RINGBUF_TYPE_PADDING
+                       && event->time_delta == 0;
  }
  
  static inline int rb_discarded_event(struct ring_buffer_event *event)
  {
-       return event->type == RINGBUF_TYPE_PADDING && event->time_delta;
+       return event->type_len == RINGBUF_TYPE_PADDING && event->time_delta;
  }
  
  static void rb_event_set_padding(struct ring_buffer_event *event)
  {
-       event->type = RINGBUF_TYPE_PADDING;
+       event->type_len = RINGBUF_TYPE_PADDING;
         event->time_delta = 0;
  }
  
-/**
- * ring_buffer_event_discard - discard an event in the ring buffer
- * @buffer: the ring buffer
- * @event: the event to discard
- *
- * Sometimes a event that is in the ring buffer needs to be ignored.
- * This function lets the user discard an event in the ring buffer
- * and then that event will not be read later.
- *
- * Note, it is up to the user to be careful with this, and protect
- * against races. If the user discards an event that has been consumed
- * it is possible that it could corrupt the ring buffer.
- */
-void ring_buffer_event_discard(struct ring_buffer_event *event)
-{
-       event->type = RINGBUF_TYPE_PADDING;
-       /* time delta must be non zero */
-       if (!event->time_delta)
-               event->time_delta = 1;
-}
-
  static unsigned
  rb_event_data_length(struct ring_buffer_event *event)
  {
         unsigned length;
  
-       if (event->len)
-               length = event->len * RB_ALIGNMENT;
+       if (event->type_len)
+               length = event->type_len * RB_ALIGNMENT;
         else
                 length = event->array[0];
         return length + RB_EVNT_HDR_SIZE;
@@ -242,12 +247,12 @@ rb_event_data_length(struct ring_buffer_event *event)
  static unsigned
  rb_event_length(struct ring_buffer_event *event)
  {
-       switch (event->type) {
+       switch (event->type_len) {
         case RINGBUF_TYPE_PADDING:
                 if (rb_null_event(event))
                         /* undefined */
                         return -1;
-               return rb_event_data_length(event);
+               return  event->array[0] + RB_EVNT_HDR_SIZE;
  
         case RINGBUF_TYPE_TIME_EXTEND:
                 return RB_LEN_TIME_EXTEND;
@@ -271,7 +276,7 @@ rb_event_length(struct ring_buffer_event *event)
  unsigned ring_buffer_event_length(struct ring_buffer_event *event)
  {
         unsigned length = rb_event_length(event);
-       if (event->type != RINGBUF_TYPE_DATA)
+       if (event->type_len > RINGBUF_TYPE_DATA_TYPE_LEN_MAX)
                 return length;
         length -= RB_EVNT_HDR_SIZE;
         if (length > RB_MAX_SMALL_DATA + sizeof(event->array[0]))
@@ -284,9 +289,9 @@ EXPORT_SYMBOL_GPL(ring_buffer_event_length);
  static void *
  rb_event_data(struct ring_buffer_event *event)
  {
-       BUG_ON(event->type != RINGBUF_TYPE_DATA);
+       BUG_ON(event->type_len > RINGBUF_TYPE_DATA_TYPE_LEN_MAX);
         /* If length is in len field, then array[0] has the data */
-       if (event->len)
+       if (event->type_len)
                 return (void *)&event->array[0];
         /* Otherwise length is in array[0] and array[1] has the data */
         return (void *)&event->array[1];
@@ -316,9 +321,10 @@ struct buffer_data_page {
  };
  
  struct buffer_page {
+       struct list_head list;          /* list of buffer pages */
         local_t          write;         /* index for next write */
         unsigned         read;          /* index for next read */
-       struct list_head list;          /* list of free pages */
+       local_t          entries;       /* entries on this page */
         struct buffer_data_page *page;  /* Actual data page */
  };
  
@@ -361,6 +367,34 @@ static inline int test_time_stamp(u64 delta)
  
  #define BUF_PAGE_SIZE (PAGE_SIZE - BUF_PAGE_HDR_SIZE)
  
+/* Max payload is BUF_PAGE_SIZE - header (8bytes) */
+#define BUF_MAX_DATA_SIZE (BUF_PAGE_SIZE - (sizeof(u32) * 2))
+
+/* Max number of timestamps that can fit on a page */
+#define RB_TIMESTAMPS_PER_PAGE (BUF_PAGE_SIZE / RB_LEN_TIME_STAMP)
+
+int ring_buffer_print_page_header(struct trace_seq *s)
+{
+       struct buffer_data_page field;
+       int ret;
+
+       ret = trace_seq_printf(s, "\tfield: u64 timestamp;\t"
+                              "offset:0;\tsize:%u;\n",
+                              (unsigned int)sizeof(field.time_stamp));
+
+       ret = trace_seq_printf(s, "\tfield: local_t commit;\t"
+                              "offset:%u;\tsize:%u;\n",
+                              (unsigned int)offsetof(typeof(field), commit),
+                              (unsigned int)sizeof(field.commit));
+
+       ret = trace_seq_printf(s, "\tfield: char data;\t"
+                              "offset:%u;\tsize:%u;\n",
+                              (unsigned int)offsetof(typeof(field), data),
+                              (unsigned int)BUF_PAGE_SIZE);
+
+       return ret;
+}
+
  /*
   * head_page == tail_page && head == tail then buffer is empty.
   */
@@ -375,8 +409,11 @@ struct ring_buffer_per_cpu {
         struct buffer_page              *tail_page;     /* write to tail */
         struct buffer_page              *commit_page;   /* committed pages */
         struct buffer_page              *reader_page;
+       unsigned long                   nmi_dropped;
+       unsigned long                   commit_overrun;
         unsigned long                   overrun;
-       unsigned long                   entries;
+       unsigned long                   read;
+       local_t                         entries;
         u64                             write_stamp;
         u64                             read_stamp;
         atomic_t                        record_disabled;
@@ -389,6 +426,8 @@ struct ring_buffer {
         atomic_t                        record_disabled;
         cpumask_var_t                   cpumask;
  
+       struct lock_class_key           *reader_lock_key;
+
         struct mutex                    mutex;
  
         struct ring_buffer_per_cpu      **buffers;
@@ -420,13 +459,18 @@ struct ring_buffer_iter {
  /* Up this if you want to test the TIME_EXTENTS and normalization */
  #define DEBUG_SHIFT 0
  
+static inline u64 rb_time_stamp(struct ring_buffer *buffer, int cpu)
+{
+       /* shift to debug/test normalization and TIME_EXTENTS */
+       return buffer->clock() << DEBUG_SHIFT;
+}
+
  u64 ring_buffer_time_stamp(struct ring_buffer *buffer, int cpu)
  {
         u64 time;
  
         preempt_disable_notrace();
-       /* shift to debug/test normalization and TIME_EXTENTS */
-       time = buffer->clock() << DEBUG_SHIFT;
+       time = rb_time_stamp(buffer, cpu);
         preempt_enable_no_resched_notrace();
  
         return time;
@@ -523,6 +567,7 @@ rb_allocate_cpu_buffer(struct ring_buffer *buffer, int cpu)
         cpu_buffer->cpu = cpu;
         cpu_buffer->buffer = buffer;
         spin_lock_init(&cpu_buffer->reader_lock);
+       lockdep_set_class(&cpu_buffer->reader_lock, buffer->reader_lock_key);
         cpu_buffer->lock = (raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED;
         INIT_LIST_HEAD(&cpu_buffer->pages);
  
@@ -593,7 +638,8 @@ static int rb_cpu_notify(struct notifier_block *self,
   * when the buffer wraps. If this flag is not set, the buffer will
   * drop data when the tail hits the head.
   */
-struct ring_buffer *ring_buffer_alloc(unsigned long size, unsigned flags)
+struct ring_buffer *__ring_buffer_alloc(unsigned long size, unsigned flags,
+                                       struct lock_class_key *key)
  {
         struct ring_buffer *buffer;
         int bsize;
@@ -616,6 +662,7 @@ struct ring_buffer *ring_buffer_alloc(unsigned long size, unsigned flags)
         buffer->pages = DIV_ROUND_UP(size, BUF_PAGE_SIZE);
         buffer->flags = flags;
         buffer->clock = trace_clock_local;
+       buffer->reader_lock_key = key;
  
         /* need at least two pages */
         if (buffer->pages == 1)
@@ -673,7 +720,7 @@ struct ring_buffer *ring_buffer_alloc(unsigned long size, unsigned flags)
         kfree(buffer);
         return NULL;
  }
-EXPORT_SYMBOL_GPL(ring_buffer_alloc);
+EXPORT_SYMBOL_GPL(__ring_buffer_alloc);
  
  /**
   * ring_buffer_free - free a ring buffer.
@@ -947,31 +994,6 @@ static inline unsigned rb_head_size(struct ring_buffer_per_cpu *cpu_buffer)
         return rb_page_commit(cpu_buffer->head_page);
  }
  
-/*
- * When the tail hits the head and the buffer is in overwrite mode,
- * the head jumps to the next page and all content on the previous
- * page is discarded. But before doing so, we update the overrun
- * variable of the buffer.
- */
-static void rb_update_overflow(struct ring_buffer_per_cpu *cpu_buffer)
-{
-       struct ring_buffer_event *event;
-       unsigned long head;
-
-       for (head = 0; head < rb_head_size(cpu_buffer);
-            head += rb_event_length(event)) {
-
-               event = __rb_page_index(cpu_buffer->head_page, head);
-               if (RB_WARN_ON(cpu_buffer, rb_null_event(event)))
-                       return;
-               /* Only count data entries */
-               if (event->type != RINGBUF_TYPE_DATA)
-                       continue;
-               cpu_buffer->overrun++;
-               cpu_buffer->entries--;
-       }
-}
-
  static inline void rb_inc_page(struct ring_buffer_per_cpu *cpu_buffer,
                                struct buffer_page **bpage)
  {
@@ -991,7 +1013,7 @@ rb_event_index(struct ring_buffer_event *event)
         return (addr & ~PAGE_MASK) - (PAGE_SIZE - BUF_PAGE_SIZE);
  }
  
-static int
+static inline int
  rb_is_commit(struct ring_buffer_per_cpu *cpu_buffer,
              struct ring_buffer_event *event)
  {
@@ -1110,28 +1132,21 @@ static void
  rb_update_event(struct ring_buffer_event *event,
                          unsigned type, unsigned length)
  {
-       event->type = type;
+       event->type_len = type;
  
         switch (type) {
  
         case RINGBUF_TYPE_PADDING:
-               break;
-
         case RINGBUF_TYPE_TIME_EXTEND:
-               event->len = DIV_ROUND_UP(RB_LEN_TIME_EXTEND, RB_ALIGNMENT);
-               break;
-
         case RINGBUF_TYPE_TIME_STAMP:
-               event->len = DIV_ROUND_UP(RB_LEN_TIME_STAMP, RB_ALIGNMENT);
                 break;
  
-       case RINGBUF_TYPE_DATA:
+       case 0:
                 length -= RB_EVNT_HDR_SIZE;
-               if (length > RB_MAX_SMALL_DATA) {
-                       event->len = 0;
+               if (length > RB_MAX_SMALL_DATA)
                         event->array[0] = length;
-               } else
-                       event->len = DIV_ROUND_UP(length, RB_ALIGNMENT);
+               else
+                       event->type_len = DIV_ROUND_UP(length, RB_ALIGNMENT);
                 break;
         default:
                 BUG();
@@ -1155,131 +1170,156 @@ static unsigned rb_calculate_event_length(unsigned length)
         return length;
  }
  
+
  static struct ring_buffer_event *
-__rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer,
-                 unsigned type, unsigned long length, u64 *ts)
+rb_move_tail(struct ring_buffer_per_cpu *cpu_buffer,
+            unsigned long length, unsigned long tail,
+            struct buffer_page *commit_page,
+            struct buffer_page *tail_page, u64 *ts)
  {
-       struct buffer_page *tail_page, *head_page, *reader_page, *commit_page;
-       unsigned long tail, write;
+       struct buffer_page *next_page, *head_page, *reader_page;
         struct ring_buffer *buffer = cpu_buffer->buffer;
         struct ring_buffer_event *event;
-       unsigned long flags;
         bool lock_taken = false;
+       unsigned long flags;
  
-       commit_page = cpu_buffer->commit_page;
-       /* we just need to protect against interrupts */
-       barrier();
-       tail_page = cpu_buffer->tail_page;
-       write = local_add_return(length, &tail_page->write);
-       tail = write - length;
+       next_page = tail_page;
  
-       /* See if we shot pass the end of this buffer page */
-       if (write > BUF_PAGE_SIZE) {
-               struct buffer_page *next_page = tail_page;
+       local_irq_save(flags);
+       /*
+        * Since the write to the buffer is still not
+        * fully lockless, we must be careful with NMIs.
+        * The locks in the writers are taken when a write
+        * crosses to a new page. The locks protect against
+        * races with the readers (this will soon be fixed
+        * with a lockless solution).
+        *
+        * Because we can not protect against NMIs, and we
+        * want to keep traces reentrant, we need to manage
+        * what happens when we are in an NMI.
+        *
+        * NMIs can happen after we take the lock.
+        * If we are in an NMI, only take the lock
+        * if it is not already taken. Otherwise
+        * simply fail.
+        */
+       if (unlikely(in_nmi())) {
+               if (!__raw_spin_trylock(&cpu_buffer->lock)) {
+                       cpu_buffer->nmi_dropped++;
+                       goto out_reset;
+               }
+       } else
+               __raw_spin_lock(&cpu_buffer->lock);
  
-               local_irq_save(flags);
-               /*
-                * Since the write to the buffer is still not
-                * fully lockless, we must be careful with NMIs.
-                * The locks in the writers are taken when a write
-                * crosses to a new page. The locks protect against
-                * races with the readers (this will soon be fixed
-                * with a lockless solution).
-                *
-                * Because we can not protect against NMIs, and we
-                * want to keep traces reentrant, we need to manage
-                * what happens when we are in an NMI.
-                *
-                * NMIs can happen after we take the lock.
-                * If we are in an NMI, only take the lock
-                * if it is not already taken. Otherwise
-                * simply fail.
-                */
-               if (unlikely(in_nmi())) {
-                       if (!__raw_spin_trylock(&cpu_buffer->lock))
-                               goto out_reset;
-               } else
-                       __raw_spin_lock(&cpu_buffer->lock);
+       lock_taken = true;
  
-               lock_taken = true;
+       rb_inc_page(cpu_buffer, &next_page);
  
-               rb_inc_page(cpu_buffer, &next_page);
+       head_page = cpu_buffer->head_page;
+       reader_page = cpu_buffer->reader_page;
  
-               head_page = cpu_buffer->head_page;
-               reader_page = cpu_buffer->reader_page;
+       /* we grabbed the lock before incrementing */
+       if (RB_WARN_ON(cpu_buffer, next_page == reader_page))
+               goto out_reset;
  
-               /* we grabbed the lock before incrementing */
-               if (RB_WARN_ON(cpu_buffer, next_page == reader_page))
-                       goto out_reset;
+       /*
+        * If for some reason, we had an interrupt storm that made
+        * it all the way around the buffer, bail, and warn
+        * about it.
+        */
+       if (unlikely(next_page == commit_page)) {
+               cpu_buffer->commit_overrun++;
+               goto out_reset;
+       }
  
-               /*
-                * If for some reason, we had an interrupt storm that made
-                * it all the way around the buffer, bail, and warn
-                * about it.
-                */
-               if (unlikely(next_page == commit_page)) {
-                       WARN_ON_ONCE(1);
+       if (next_page == head_page) {
+               if (!(buffer->flags & RB_FL_OVERWRITE))
                         goto out_reset;
-               }
  
-               if (next_page == head_page) {
-                       if (!(buffer->flags & RB_FL_OVERWRITE))
-                               goto out_reset;
-
-                       /* tail_page has not moved yet? */
-                       if (tail_page == cpu_buffer->tail_page) {
-                               /* count overflows */
-                               rb_update_overflow(cpu_buffer);
+               /* tail_page has not moved yet? */
+               if (tail_page == cpu_buffer->tail_page) {
+                       /* count overflows */
+                       cpu_buffer->overrun +=
+                               local_read(&head_page->entries);
  
-                               rb_inc_page(cpu_buffer, &head_page);
-                               cpu_buffer->head_page = head_page;
-                               cpu_buffer->head_page->read = 0;
-                       }
+                       rb_inc_page(cpu_buffer, &head_page);
+                       cpu_buffer->head_page = head_page;
+                       cpu_buffer->head_page->read = 0;
                 }
+       }
  
-               /*
-                * If the tail page is still the same as what we think
-                * it is, then it is up to us to update the tail
-                * pointer.
-                */
-               if (tail_page == cpu_buffer->tail_page) {
-                       local_set(&next_page->write, 0);
-                       local_set(&next_page->page->commit, 0);
-                       cpu_buffer->tail_page = next_page;
+       /*
+        * If the tail page is still the same as what we think
+        * it is, then it is up to us to update the tail
+        * pointer.
+        */
+       if (tail_page == cpu_buffer->tail_page) {
+               local_set(&next_page->write, 0);
+               local_set(&next_page->entries, 0);
+               local_set(&next_page->page->commit, 0);
+               cpu_buffer->tail_page = next_page;
+
+               /* reread the time stamp */
+               *ts = rb_time_stamp(buffer, cpu_buffer->cpu);
+               cpu_buffer->tail_page->page->time_stamp = *ts;
+       }
  
-                       /* reread the time stamp */
-                       *ts = ring_buffer_time_stamp(buffer, cpu_buffer->cpu);
-                       cpu_buffer->tail_page->page->time_stamp = *ts;
-               }
+       /*
+        * The actual tail page has moved forward.
+        */
+       if (tail < BUF_PAGE_SIZE) {
+               /* Mark the rest of the page with padding */
+               event = __rb_page_index(tail_page, tail);
+               rb_event_set_padding(event);
+       }
  
-               /*
-                * The actual tail page has moved forward.
-                */
-               if (tail < BUF_PAGE_SIZE) {
-                       /* Mark the rest of the page with padding */
-                       event = __rb_page_index(tail_page, tail);
-                       rb_event_set_padding(event);
-               }
+       /* Set the write back to the previous setting */
+       local_sub(length, &tail_page->write);
  
-               if (tail <= BUF_PAGE_SIZE)
-                       /* Set the write back to the previous setting */
-                       local_set(&tail_page->write, tail);
+       /*
+        * If this was a commit entry that failed,
+        * increment that too
+        */
+       if (tail_page == cpu_buffer->commit_page &&
+           tail == rb_commit_index(cpu_buffer)) {
+               rb_set_commit_to_write(cpu_buffer);
+       }
  
-               /*
-                * If this was a commit entry that failed,
-                * increment that too
-                */
-               if (tail_page == cpu_buffer->commit_page &&
-                   tail == rb_commit_index(cpu_buffer)) {
-                       rb_set_commit_to_write(cpu_buffer);
-               }
+       __raw_spin_unlock(&cpu_buffer->lock);
+       local_irq_restore(flags);
+
+       /* fail and let the caller try again */
+       return ERR_PTR(-EAGAIN);
+
+ out_reset:
+       /* reset write */
+       local_sub(length, &tail_page->write);
  
+       if (likely(lock_taken))
                 __raw_spin_unlock(&cpu_buffer->lock);
-               local_irq_restore(flags);
+       local_irq_restore(flags);
+       return NULL;
+}
  
-               /* fail and let the caller try again */
-               return ERR_PTR(-EAGAIN);
-       }
+static struct ring_buffer_event *
+__rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer,
+                 unsigned type, unsigned long length, u64 *ts)
+{
+       struct buffer_page *tail_page, *commit_page;
+       struct ring_buffer_event *event;
+       unsigned long tail, write;
+
+       commit_page = cpu_buffer->commit_page;
+       /* we just need to protect against interrupts */
+       barrier();
+       tail_page = cpu_buffer->tail_page;
+       write = local_add_return(length, &tail_page->write);
+       tail = write - length;
+
+       /* See if we shot pass the end of this buffer page */
+       if (write > BUF_PAGE_SIZE)
+               return rb_move_tail(cpu_buffer, length, tail,
+                                   commit_page, tail_page, ts);
  
         /* We reserved something on the buffer */
  
@@ -1289,6 +1329,10 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer,
         event = __rb_page_index(tail_page, tail);
         rb_update_event(event, type, length);
  
+       /* The passed in type is zero for DATA */
+       if (likely(!type))
+               local_inc(&tail_page->entries);
+
         /*
          * If this is a commit and the tail is zero, then update
          * this page's time stamp.
@@ -1297,16 +1341,38 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer,
                 cpu_buffer->commit_page->page->time_stamp = *ts;
  
         return event;
+}
  
- out_reset:
-       /* reset write */
-       if (tail <= BUF_PAGE_SIZE)
-               local_set(&tail_page->write, tail);
+static inline int
+rb_try_to_discard(struct ring_buffer_per_cpu *cpu_buffer,
+                 struct ring_buffer_event *event)
+{
+       unsigned long new_index, old_index;
+       struct buffer_page *bpage;
+       unsigned long index;
+       unsigned long addr;
  
-       if (likely(lock_taken))
-               __raw_spin_unlock(&cpu_buffer->lock);
-       local_irq_restore(flags);
-       return NULL;
+       new_index = rb_event_index(event);
+       old_index = new_index + rb_event_length(event);
+       addr = (unsigned long)event;
+       addr &= PAGE_MASK;
+
+       bpage = cpu_buffer->tail_page;
+
+       if (bpage->page == (void *)addr && rb_page_write(bpage) == old_index) {
+               /*
+                * This is on the tail page. It is possible that
+                * a write could come in and move the tail page
+                * and write to the next page. That is fine
+                * because we just shorten what is on this page.
+                */
+               index = local_cmpxchg(&bpage->write, old_index, new_index);
+               if (index == old_index)
+                       return 1;
+       }
+
+       /* could not discard */
+       return 0;
  }
  
  static int
@@ -1351,16 +1417,23 @@ rb_add_time_stamp(struct ring_buffer_per_cpu *cpu_buffer,
                         event->array[0] = *delta >> TS_SHIFT;
                 } else {
                         cpu_buffer->commit_page->page->time_stamp = *ts;
-                       event->time_delta = 0;
-                       event->array[0] = 0;
+                       /* try to discard, since we do not need this */
+                       if (!rb_try_to_discard(cpu_buffer, event)) {
+                               /* nope, just zero it */
+                               event->time_delta = 0;
+                               event->array[0] = 0;
+                       }
                 }
                 cpu_buffer->write_stamp = *ts;
                 /* let the caller know this was the commit */
                 ret = 1;
         } else {
-               /* Darn, this is just wasted space */
-               event->time_delta = 0;
-               event->array[0] = 0;
+               /* Try to discard the event */
+               if (!rb_try_to_discard(cpu_buffer, event)) {
+                       /* Darn, this is just wasted space */
+                       event->time_delta = 0;
+                       event->array[0] = 0;
+               }
                 ret = 0;
         }
  
@@ -1371,13 +1444,14 @@ rb_add_time_stamp(struct ring_buffer_per_cpu *cpu_buffer,
  
  static struct ring_buffer_event *
  rb_reserve_next_event(struct ring_buffer_per_cpu *cpu_buffer,
-                     unsigned type, unsigned long length)
+                     unsigned long length)
  {
         struct ring_buffer_event *event;
-       u64 ts, delta;
+       u64 ts, delta = 0;
         int commit = 0;
         int nr_loops = 0;
  
+       length = rb_calculate_event_length(length);
   again:
         /*
          * We allow for interrupts to reenter here and do a trace.
@@ -1391,7 +1465,7 @@ rb_reserve_next_event(struct ring_buffer_per_cpu *cpu_buffer,
         if (RB_WARN_ON(cpu_buffer, ++nr_loops > 1000))
                 return NULL;
  
-       ts = ring_buffer_time_stamp(cpu_buffer->buffer, cpu_buffer->cpu);
+       ts = rb_time_stamp(cpu_buffer->buffer, cpu_buffer->cpu);
  
         /*
          * Only the first commit can update the timestamp.
@@ -1401,23 +1475,24 @@ rb_reserve_next_event(struct ring_buffer_per_cpu *cpu_buffer,
          * also be made. But only the entry that did the actual
          * commit will be something other than zero.
          */
-       if (cpu_buffer->tail_page == cpu_buffer->commit_page &&
-           rb_page_write(cpu_buffer->tail_page) ==
-           rb_commit_index(cpu_buffer)) {
+       if (likely(cpu_buffer->tail_page == cpu_buffer->commit_page &&
+                  rb_page_write(cpu_buffer->tail_page) ==
+                  rb_commit_index(cpu_buffer))) {
+               u64 diff;
  
-               delta = ts - cpu_buffer->write_stamp;
+               diff = ts - cpu_buffer->write_stamp;
  
-               /* make sure this delta is calculated here */
+               /* make sure this diff is calculated here */
                 barrier();
  
                 /* Did the write stamp get updated already? */
                 if (unlikely(ts < cpu_buffer->write_stamp))
-                       delta = 0;
+                       goto get_event;
  
-               if (test_time_stamp(delta)) {
+               delta = diff;
+               if (unlikely(test_time_stamp(delta))) {
  
                         commit = rb_add_time_stamp(cpu_buffer, &ts, &delta);
-
                         if (commit == -EBUSY)
                                 return NULL;
  
@@ -1426,12 +1501,11 @@ rb_reserve_next_event(struct ring_buffer_per_cpu *cpu_buffer,
  
                         RB_WARN_ON(cpu_buffer, commit < 0);
                 }
-       } else
-               /* Non commits have zero deltas */
-               delta = 0;
+       }
  
-       event = __rb_reserve_next(cpu_buffer, type, length, &ts);
-       if (PTR_ERR(event) == -EAGAIN)
+ get_event:
+       event = __rb_reserve_next(cpu_buffer, 0, length, &ts);
+       if (unlikely(PTR_ERR(event) == -EAGAIN))
                 goto again;
  
         if (!event) {
@@ -1448,7 +1522,7 @@ rb_reserve_next_event(struct ring_buffer_per_cpu *cpu_buffer,
          * If the timestamp was commited, make the commit our entry
          * now so that we will update it when needed.
          */
-       if (commit)
+       if (unlikely(commit))
                 rb_set_commit_event(cpu_buffer, event);
         else if (!rb_is_commit(cpu_buffer, event))
                 delta = 0;
@@ -1458,6 +1532,36 @@ rb_reserve_next_event(struct ring_buffer_per_cpu *cpu_buffer,
         return event;
  }
  
+#define TRACE_RECURSIVE_DEPTH 16
+
+static int trace_recursive_lock(void)
+{
+       current->trace_recursion++;
+
+       if (likely(current->trace_recursion < TRACE_RECURSIVE_DEPTH))
+               return 0;
+
+       /* Disable all tracing before we do anything else */
+       tracing_off_permanent();
+
+       printk_once(KERN_WARNING "Tracing recursion: depth[%ld]:"
+                   "HC[%lu]:SC[%lu]:NMI[%lu]\n",
+                   current->trace_recursion,
+                   hardirq_count() >> HARDIRQ_SHIFT,
+                   softirq_count() >> SOFTIRQ_SHIFT,
+                   in_nmi());
+
+       WARN_ON_ONCE(1);
+       return -1;
+}
+
+static void trace_recursive_unlock(void)
+{
+       WARN_ON_ONCE(!current->trace_recursion);
+
+       current->trace_recursion--;
+}
+
  static DEFINE_PER_CPU(int, rb_need_resched);
  
  /**
@@ -1491,6 +1595,9 @@ ring_buffer_lock_reserve(struct ring_buffer *buffer, unsigned long length)
         /* If we are tracing schedule, we don't want to recurse */
         resched = ftrace_preempt_disable();
  
+       if (trace_recursive_lock())
+               goto out_nocheck;
+
         cpu = raw_smp_processor_id();
  
         if (!cpumask_test_cpu(cpu, buffer->cpumask))
@@ -1501,11 +1608,10 @@ ring_buffer_lock_reserve(struct ring_buffer *buffer, unsigned long length)
         if (atomic_read(&cpu_buffer->record_disabled))
                 goto out;
  
-       length = rb_calculate_event_length(length);
-       if (length > BUF_PAGE_SIZE)
+       if (length > BUF_MAX_DATA_SIZE)
                 goto out;
  
-       event = rb_reserve_next_event(cpu_buffer, RINGBUF_TYPE_DATA, length);
+       event = rb_reserve_next_event(cpu_buffer, length);
         if (!event)
                 goto out;
  
@@ -1520,6 +1626,9 @@ ring_buffer_lock_reserve(struct ring_buffer *buffer, unsigned long length)
         return event;
  
   out:
+       trace_recursive_unlock();
+
+ out_nocheck:
         ftrace_preempt_enable(resched);
         return NULL;
  }
@@ -1528,7 +1637,7 @@ EXPORT_SYMBOL_GPL(ring_buffer_lock_reserve);
  static void rb_commit(struct ring_buffer_per_cpu *cpu_buffer,
                       struct ring_buffer_event *event)
  {
-       cpu_buffer->entries++;
+       local_inc(&cpu_buffer->entries);
  
         /* Only process further if we own the commit */
         if (!rb_is_commit(cpu_buffer, event))
@@ -1558,6 +1667,8 @@ int ring_buffer_unlock_commit(struct ring_buffer *buffer,
  
         rb_commit(cpu_buffer, event);
  
+       trace_recursive_unlock();
+
         /*
          * Only the last preempt count needs to restore preemption.
          */
@@ -1570,6 +1681,99 @@ int ring_buffer_unlock_commit(struct ring_buffer *buffer,
  }
  EXPORT_SYMBOL_GPL(ring_buffer_unlock_commit);
  
+static inline void rb_event_discard(struct ring_buffer_event *event)
+{
+       /* array[0] holds the actual length for the discarded event */
+       event->array[0] = rb_event_data_length(event) - RB_EVNT_HDR_SIZE;
+       event->type_len = RINGBUF_TYPE_PADDING;
+       /* time delta must be non zero */
+       if (!event->time_delta)
+               event->time_delta = 1;
+}
+
+/**
+ * ring_buffer_event_discard - discard any event in the ring buffer
+ * @event: the event to discard
+ *
+ * Sometimes a event that is in the ring buffer needs to be ignored.
+ * This function lets the user discard an event in the ring buffer
+ * and then that event will not be read later.
+ *
+ * Note, it is up to the user to be careful with this, and protect
+ * against races. If the user discards an event that has been consumed
+ * it is possible that it could corrupt the ring buffer.
+ */
+void ring_buffer_event_discard(struct ring_buffer_event *event)
+{
+       rb_event_discard(event);
+}
+EXPORT_SYMBOL_GPL(ring_buffer_event_discard);
+
+/**
+ * ring_buffer_commit_discard - discard an event that has not been committed
+ * @buffer: the ring buffer
+ * @event: non committed event to discard
+ *
+ * This is similar to ring_buffer_event_discard but must only be
+ * performed on an event that has not been committed yet. The difference
+ * is that this will also try to free the event from the ring buffer
+ * if another event has not been added behind it.
+ *
+ * If another event has been added behind it, it will set the event
+ * up as discarded, and perform the commit.
+ *
+ * If this function is called, do not call ring_buffer_unlock_commit on
+ * the event.
+ */
+void ring_buffer_discard_commit(struct ring_buffer *buffer,
+                               struct ring_buffer_event *event)
+{
+       struct ring_buffer_per_cpu *cpu_buffer;
+       int cpu;
+
+       /* The event is discarded regardless */
+       rb_event_discard(event);
+
+       /*
+        * This must only be called if the event has not been
+        * committed yet. Thus we can assume that preemption
+        * is still disabled.
+        */
+       RB_WARN_ON(buffer, preemptible());
+
+       cpu = smp_processor_id();
+       cpu_buffer = buffer->buffers[cpu];
+
+       if (!rb_try_to_discard(cpu_buffer, event))
+               goto out;
+
+       /*
+        * The commit is still visible by the reader, so we
+        * must increment entries.
+        */
+       local_inc(&cpu_buffer->entries);
+ out:
+       /*
+        * If a write came in and pushed the tail page
+        * we still need to update the commit pointer
+        * if we were the commit.
+        */
+       if (rb_is_commit(cpu_buffer, event))
+               rb_set_commit_to_write(cpu_buffer);
+
+       trace_recursive_unlock();
+
+       /*
+        * Only the last preempt count needs to restore preemption.
+        */
+       if (preempt_count() == 1)
+               ftrace_preempt_enable(per_cpu(rb_need_resched, cpu));
+       else
+               preempt_enable_no_resched_notrace();
+
+}
+EXPORT_SYMBOL_GPL(ring_buffer_discard_commit);
+
  /**
   * ring_buffer_write - write data to the buffer without reserving
   * @buffer: The ring buffer to write to.
@@ -1589,7 +1793,6 @@ int ring_buffer_write(struct ring_buffer *buffer,
  {
         struct ring_buffer_per_cpu *cpu_buffer;
         struct ring_buffer_event *event;
-       unsigned long event_length;
         void *body;
         int ret = -EBUSY;
         int cpu, resched;
@@ -1612,9 +1815,10 @@ int ring_buffer_write(struct ring_buffer *buffer,
         if (atomic_read(&cpu_buffer->record_disabled))
                 goto out;
  
-       event_length = rb_calculate_event_length(length);
-       event = rb_reserve_next_event(cpu_buffer,
-                                     RINGBUF_TYPE_DATA, event_length);
+       if (length > BUF_MAX_DATA_SIZE)
+               goto out;
+
+       event = rb_reserve_next_event(cpu_buffer, length);
         if (!event)
                 goto out;
  
@@ -1728,7 +1932,8 @@ unsigned long ring_buffer_entries_cpu(struct ring_buffer *buffer, int cpu)
                 return 0;
  
         cpu_buffer = buffer->buffers[cpu];
-       ret = cpu_buffer->entries;
+       ret = (local_read(&cpu_buffer->entries) - cpu_buffer->overrun)
+               - cpu_buffer->read;
  
         return ret;
  }
@@ -1754,6 +1959,47 @@ unsigned long ring_buffer_overrun_cpu(struct ring_buffer *buffer, int cpu)
  }
  EXPORT_SYMBOL_GPL(ring_buffer_overrun_cpu);
  
+/**
+ * ring_buffer_nmi_dropped_cpu - get the number of nmis that were dropped
+ * @buffer: The ring buffer
+ * @cpu: The per CPU buffer to get the number of overruns from
+ */
+unsigned long ring_buffer_nmi_dropped_cpu(struct ring_buffer *buffer, int cpu)
+{
+       struct ring_buffer_per_cpu *cpu_buffer;
+       unsigned long ret;
+
+       if (!cpumask_test_cpu(cpu, buffer->cpumask))
+               return 0;
+
+       cpu_buffer = buffer->buffers[cpu];
+       ret = cpu_buffer->nmi_dropped;
+
+       return ret;
+}
+EXPORT_SYMBOL_GPL(ring_buffer_nmi_dropped_cpu);
+
+/**
+ * ring_buffer_commit_overrun_cpu - get the number of overruns caused by commits
+ * @buffer: The ring buffer
+ * @cpu: The per CPU buffer to get the number of overruns from
+ */
+unsigned long
+ring_buffer_commit_overrun_cpu(struct ring_buffer *buffer, int cpu)
+{
+       struct ring_buffer_per_cpu *cpu_buffer;
+       unsigned long ret;
+
+       if (!cpumask_test_cpu(cpu, buffer->cpumask))
+               return 0;
+
+       cpu_buffer = buffer->buffers[cpu];
+       ret = cpu_buffer->commit_overrun;
+
+       return ret;
+}
+EXPORT_SYMBOL_GPL(ring_buffer_commit_overrun_cpu);
+
  /**
   * ring_buffer_entries - get the number of entries in a buffer
   * @buffer: The ring buffer
@@ -1770,7 +2016,8 @@ unsigned long ring_buffer_entries(struct ring_buffer *buffer)
         /* if you care about this being correct, lock the buffer */
         for_each_buffer_cpu(buffer, cpu) {
                 cpu_buffer = buffer->buffers[cpu];
-               entries += cpu_buffer->entries;
+               entries += (local_read(&cpu_buffer->entries) -
+                           cpu_buffer->overrun) - cpu_buffer->read;
         }
  
         return entries;
@@ -1862,7 +2109,7 @@ rb_update_read_stamp(struct ring_buffer_per_cpu *cpu_buffer,
  {
         u64 delta;
  
-       switch (event->type) {
+       switch (event->type_len) {
         case RINGBUF_TYPE_PADDING:
                 return;
  
@@ -1893,7 +2140,7 @@ rb_update_iter_read_stamp(struct ring_buffer_iter *iter,
  {
         u64 delta;
  
-       switch (event->type) {
+       switch (event->type_len) {
         case RINGBUF_TYPE_PADDING:
                 return;
  
@@ -1966,6 +2213,7 @@ rb_get_reader_page(struct ring_buffer_per_cpu *cpu_buffer)
         cpu_buffer->reader_page->list.prev = reader->list.prev;
  
         local_set(&cpu_buffer->reader_page->write, 0);
+       local_set(&cpu_buffer->reader_page->entries, 0);
         local_set(&cpu_buffer->reader_page->page->commit, 0);
  
         /* Make the reader page now replace the head */
@@ -2008,8 +2256,9 @@ static void rb_advance_reader(struct ring_buffer_per_cpu *cpu_buffer)
  
         event = rb_reader_event(cpu_buffer);
  
-       if (event->type == RINGBUF_TYPE_DATA || rb_discarded_event(event))
-               cpu_buffer->entries--;
+       if (event->type_len <= RINGBUF_TYPE_DATA_TYPE_LEN_MAX
+                       || rb_discarded_event(event))
+               cpu_buffer->read++;
  
         rb_update_read_stamp(cpu_buffer, event);
  
@@ -2031,8 +2280,8 @@ static void rb_advance_iter(struct ring_buffer_iter *iter)
          * Check if we are at the end of the buffer.
          */
         if (iter->head >= rb_page_size(iter->head_page)) {
-               if (RB_WARN_ON(buffer,
-                              iter->head_page == cpu_buffer->commit_page))
+               /* discarded commits can make the page empty */
+               if (iter->head_page == cpu_buffer->commit_page)
                         return;
                 rb_inc_iter(iter);
                 return;
@@ -2075,12 +2324,10 @@ rb_buffer_peek(struct ring_buffer *buffer, int cpu, u64 *ts)
         /*
          * We repeat when a timestamp is encountered. It is possible
          * to get multiple timestamps from an interrupt entering just
-        * as one timestamp is about to be written. The max times
-        * that this can happen is the number of nested interrupts we
-        * can have.  Nesting 10 deep of interrupts is clearly
-        * an anomaly.
+        * as one timestamp is about to be written, or from discarded
+        * commits. The most that we can have is the number on a single page.
          */
-       if (RB_WARN_ON(cpu_buffer, ++nr_loops > 10))
+       if (RB_WARN_ON(cpu_buffer, ++nr_loops > RB_TIMESTAMPS_PER_PAGE))
                 return NULL;
  
         reader = rb_get_reader_page(cpu_buffer);
@@ -2089,7 +2336,7 @@ rb_buffer_peek(struct ring_buffer *buffer, int cpu, u64 *ts)
  
         event = rb_reader_event(cpu_buffer);
  
-       switch (event->type) {
+       switch (event->type_len) {
         case RINGBUF_TYPE_PADDING:
                 if (rb_null_event(event))
                         RB_WARN_ON(cpu_buffer, 1);
@@ -2146,14 +2393,14 @@ rb_iter_peek(struct ring_buffer_iter *iter, u64 *ts)
  
   again:
         /*
-        * We repeat when a timestamp is encountered. It is possible
-        * to get multiple timestamps from an interrupt entering just
-        * as one timestamp is about to be written. The max times
-        * that this can happen is the number of nested interrupts we
-        * can have. Nesting 10 deep of interrupts is clearly
-        * an anomaly.
+        * We repeat when a timestamp is encountered.
+        * We can get multiple timestamps by nested interrupts or also
+        * if filtering is on (discarding commits). Since discarding
+        * commits can be frequent we can get a lot of timestamps.
+        * But we limit them by not adding timestamps if they begin
+        * at the start of a page.
          */
-       if (RB_WARN_ON(cpu_buffer, ++nr_loops > 10))
+       if (RB_WARN_ON(cpu_buffer, ++nr_loops > RB_TIMESTAMPS_PER_PAGE))
                 return NULL;
  
         if (rb_per_cpu_empty(cpu_buffer))
@@ -2161,7 +2408,7 @@ rb_iter_peek(struct ring_buffer_iter *iter, u64 *ts)
  
         event = rb_iter_head_event(iter);
  
-       switch (event->type) {
+       switch (event->type_len) {
         case RINGBUF_TYPE_PADDING:
                 if (rb_null_event(event)) {
                         rb_inc_iter(iter);
@@ -2220,7 +2467,7 @@ ring_buffer_peek(struct ring_buffer *buffer, int cpu, u64 *ts)
         event = rb_buffer_peek(buffer, cpu, ts);
         spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
  
-       if (event && event->type == RINGBUF_TYPE_PADDING) {
+       if (event && event->type_len == RINGBUF_TYPE_PADDING) {
                 cpu_relax();
                 goto again;
         }
@@ -2248,7 +2495,7 @@ ring_buffer_iter_peek(struct ring_buffer_iter *iter, u64 *ts)
         event = rb_iter_peek(iter, ts);
         spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
  
-       if (event && event->type == RINGBUF_TYPE_PADDING) {
+       if (event && event->type_len == RINGBUF_TYPE_PADDING) {
                 cpu_relax();
                 goto again;
         }
@@ -2293,7 +2540,7 @@ ring_buffer_consume(struct ring_buffer *buffer, int cpu, u64 *ts)
   out:
         preempt_enable();
  
-       if (event && event->type == RINGBUF_TYPE_PADDING) {
+       if (event && event->type_len == RINGBUF_TYPE_PADDING) {
                 cpu_relax();
                 goto again;
         }
@@ -2386,7 +2633,7 @@ ring_buffer_read(struct ring_buffer_iter *iter, u64 *ts)
   out:
         spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
  
-       if (event && event->type == RINGBUF_TYPE_PADDING) {
+       if (event && event->type_len == RINGBUF_TYPE_PADDING) {
                 cpu_relax();
                 goto again;
         }
@@ -2411,6 +2658,7 @@ rb_reset_cpu(struct ring_buffer_per_cpu *cpu_buffer)
         cpu_buffer->head_page
                 = list_entry(cpu_buffer->pages.next, struct buffer_page, list);
         local_set(&cpu_buffer->head_page->write, 0);
+       local_set(&cpu_buffer->head_page->entries, 0);
         local_set(&cpu_buffer->head_page->page->commit, 0);
  
         cpu_buffer->head_page->read = 0;
@@ -2420,11 +2668,15 @@ rb_reset_cpu(struct ring_buffer_per_cpu *cpu_buffer)
  
         INIT_LIST_HEAD(&cpu_buffer->reader_page->list);
         local_set(&cpu_buffer->reader_page->write, 0);
+       local_set(&cpu_buffer->reader_page->entries, 0);
         local_set(&cpu_buffer->reader_page->page->commit, 0);
         cpu_buffer->reader_page->read = 0;
  
+       cpu_buffer->nmi_dropped = 0;
+       cpu_buffer->commit_overrun = 0;
         cpu_buffer->overrun = 0;
-       cpu_buffer->entries = 0;
+       cpu_buffer->read = 0;
+       local_set(&cpu_buffer->entries, 0);
  
         cpu_buffer->write_stamp = 0;
         cpu_buffer->read_stamp = 0;
@@ -2443,6 +2695,8 @@ void ring_buffer_reset_cpu(struct ring_buffer *buffer, int cpu)
         if (!cpumask_test_cpu(cpu, buffer->cpumask))
                 return;
  
+       atomic_inc(&cpu_buffer->record_disabled);
+
         spin_lock_irqsave(&cpu_buffer->reader_lock, flags);
  
         __raw_spin_lock(&cpu_buffer->lock);
@@ -2452,6 +2706,8 @@ void ring_buffer_reset_cpu(struct ring_buffer *buffer, int cpu)
         __raw_spin_unlock(&cpu_buffer->lock);
  
         spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
+
+       atomic_dec(&cpu_buffer->record_disabled);
  }
  EXPORT_SYMBOL_GPL(ring_buffer_reset_cpu);
  
@@ -2578,28 +2834,6 @@ out:
  }
  EXPORT_SYMBOL_GPL(ring_buffer_swap_cpu);
  
-static void rb_remove_entries(struct ring_buffer_per_cpu *cpu_buffer,
-                             struct buffer_data_page *bpage,
-                             unsigned int offset)
-{
-       struct ring_buffer_event *event;
-       unsigned long head;
-
-       __raw_spin_lock(&cpu_buffer->lock);
-       for (head = offset; head < local_read(&bpage->commit);
-            head += rb_event_length(event)) {
-
-               event = __rb_data_page_index(bpage, head);
-               if (RB_WARN_ON(cpu_buffer, rb_null_event(event)))
-                       return;
-               /* Only count data entries */
-               if (event->type != RINGBUF_TYPE_DATA)
-                       continue;
-               cpu_buffer->entries--;
-       }
-       __raw_spin_unlock(&cpu_buffer->lock);
-}
-
  /**
   * ring_buffer_alloc_read_page - allocate a page to read from buffer
   * @buffer: the buffer to allocate for.
@@ -2630,6 +2864,7 @@ void *ring_buffer_alloc_read_page(struct ring_buffer *buffer)
  
         return bpage;
  }
+EXPORT_SYMBOL_GPL(ring_buffer_alloc_read_page);
  
  /**
   * ring_buffer_free_read_page - free an allocated read page
@@ -2642,6 +2877,7 @@ void ring_buffer_free_read_page(struct ring_buffer *buffer, void *data)
  {
         free_page((unsigned long)data);
  }
+EXPORT_SYMBOL_GPL(ring_buffer_free_read_page);
  
  /**
   * ring_buffer_read_page - extract a page from the ring buffer
@@ -2768,16 +3004,17 @@ int ring_buffer_read_page(struct ring_buffer *buffer,
                 /* we copied everything to the beginning */
                 read = 0;
         } else {
+               /* update the entry counter */
+               cpu_buffer->read += local_read(&reader->entries);
+
                 /* swap the pages */
                 rb_init_page(bpage);
                 bpage = reader->page;
                 reader->page = *data_page;
                 local_set(&reader->write, 0);
+               local_set(&reader->entries, 0);
                 reader->read = 0;
                 *data_page = bpage;
-
-               /* update the entry counter */
-               rb_remove_entries(cpu_buffer, bpage, read);
         }
         ret = read;
  
@@ -2787,6 +3024,7 @@ int ring_buffer_read_page(struct ring_buffer *buffer,
   out:
         return ret;
  }
+EXPORT_SYMBOL_GPL(ring_buffer_read_page);
  
  static ssize_t
  rb_simple_read(struct file *filp, char __user *ubuf,
@@ -2845,14 +3083,11 @@ static const struct file_operations rb_simple_fops = {
  static __init int rb_init_debugfs(void)
  {
         struct dentry *d_tracer;
-       struct dentry *entry;
  
         d_tracer = tracing_init_dentry();
  
-       entry = debugfs_create_file("tracing_on", 0644, d_tracer,
-                                   &ring_buffer_flags, &rb_simple_fops);
-       if (!entry)
-               pr_warning("Could not create debugfs 'tracing_on' entry\n");
+       trace_create_file("tracing_on", 0644, d_tracer,
+                           &ring_buffer_flags, &rb_simple_fops);
  
         return 0;
  }
diff --git a/kernel/trace/ring_buffer_benchmark.c b/kernel/trace/ring_buffer_benchmark.c

new file mode 100644 (file)

index 0000000..8d68e14
--- /dev/null
+++ b/kernel/trace/ring_buffer_benchmark.c
@@ -0,0 +1,416 @@
+/*
+ * ring buffer tester and benchmark
+ *
+ * Copyright (C) 2009 Steven Rostedt <srostedt@redhat.com>
+ */
+#include <linux/ring_buffer.h>
+#include <linux/completion.h>
+#include <linux/kthread.h>
+#include <linux/module.h>
+#include <linux/time.h>
+
+struct rb_page {
+       u64             ts;
+       local_t         commit;
+       char            data[4080];
+};
+
+/* run time and sleep time in seconds */
+#define RUN_TIME       10
+#define SLEEP_TIME     10
+
+/* number of events for writer to wake up the reader */
+static int wakeup_interval = 100;
+
+static int reader_finish;
+static struct completion read_start;
+static struct completion read_done;
+
+static struct ring_buffer *buffer;
+static struct task_struct *producer;
+static struct task_struct *consumer;
+static unsigned long read;
+
+static int disable_reader;
+module_param(disable_reader, uint, 0644);
+MODULE_PARM_DESC(disable_reader, "only run producer");
+
+static int read_events;
+
+static int kill_test;
+
+#define KILL_TEST()                            \
+       do {                                    \
+               if (!kill_test) {               \
+                       kill_test = 1;          \
+                       WARN_ON(1);             \
+               }                               \
+       } while (0)
+
+enum event_status {
+       EVENT_FOUND,
+       EVENT_DROPPED,
+};
+
+static enum event_status read_event(int cpu)
+{
+       struct ring_buffer_event *event;
+       int *entry;
+       u64 ts;
+
+       event = ring_buffer_consume(buffer, cpu, &ts);
+       if (!event)
+               return EVENT_DROPPED;
+
+       entry = ring_buffer_event_data(event);
+       if (*entry != cpu) {
+               KILL_TEST();
+               return EVENT_DROPPED;
+       }
+
+       read++;
+       return EVENT_FOUND;
+}
+
+static enum event_status read_page(int cpu)
+{
+       struct ring_buffer_event *event;
+       struct rb_page *rpage;
+       unsigned long commit;
+       void *bpage;
+       int *entry;
+       int ret;
+       int inc;
+       int i;
+
+       bpage = ring_buffer_alloc_read_page(buffer);
+       if (!bpage)
+               return EVENT_DROPPED;
+
+       ret = ring_buffer_read_page(buffer, &bpage, PAGE_SIZE, cpu, 1);
+       if (ret >= 0) {
+               rpage = bpage;
+               commit = local_read(&rpage->commit);
+               for (i = 0; i < commit && !kill_test; i += inc) {
+
+                       if (i >= (PAGE_SIZE - offsetof(struct rb_page, data))) {
+                               KILL_TEST();
+                               break;
+                       }
+
+                       inc = -1;
+                       event = (void *)&rpage->data[i];
+                       switch (event->type_len) {
+                       case RINGBUF_TYPE_PADDING:
+                               /* We don't expect any padding */
+                               KILL_TEST();
+                               break;
+                       case RINGBUF_TYPE_TIME_EXTEND:
+                               inc = 8;
+                               break;
+                       case 0:
+                               entry = ring_buffer_event_data(event);
+                               if (*entry != cpu) {
+                                       KILL_TEST();
+                                       break;
+                               }
+                               read++;
+                               if (!event->array[0]) {
+                                       KILL_TEST();
+                                       break;
+                               }
+                               inc = event->array[0];
+                               break;
+                       default:
+                               entry = ring_buffer_event_data(event);
+                               if (*entry != cpu) {
+                                       KILL_TEST();
+                                       break;
+                               }
+                               read++;
+                               inc = ((event->type_len + 1) * 4);
+                       }
+                       if (kill_test)
+                               break;
+
+                       if (inc <= 0) {
+                               KILL_TEST();
+                               break;
+                       }
+               }
+       }
+       ring_buffer_free_read_page(buffer, bpage);
+
+       if (ret < 0)
+               return EVENT_DROPPED;
+       return EVENT_FOUND;
+}
+
+static void ring_buffer_consumer(void)
+{
+       /* toggle between reading pages and events */
+       read_events ^= 1;
+
+       read = 0;
+       while (!reader_finish && !kill_test) {
+               int found;
+
+               do {
+                       int cpu;
+
+                       found = 0;
+                       for_each_online_cpu(cpu) {
+                               enum event_status stat;
+
+                               if (read_events)
+                                       stat = read_event(cpu);
+                               else
+                                       stat = read_page(cpu);
+
+                               if (kill_test)
+                                       break;
+                               if (stat == EVENT_FOUND)
+                                       found = 1;
+                       }
+               } while (found && !kill_test);
+
+               set_current_state(TASK_INTERRUPTIBLE);
+               if (reader_finish)
+                       break;
+
+               schedule();
+               __set_current_state(TASK_RUNNING);
+       }
+       reader_finish = 0;
+       complete(&read_done);
+}
+
+static void ring_buffer_producer(void)
+{
+       struct timeval start_tv;
+       struct timeval end_tv;
+       unsigned long long time;
+       unsigned long long entries;
+       unsigned long long overruns;
+       unsigned long missed = 0;
+       unsigned long hit = 0;
+       unsigned long avg;
+       int cnt = 0;
+
+       /*
+        * Hammer the buffer for 10 secs (this may
+        * make the system stall)
+        */
+       pr_info("Starting ring buffer hammer\n");
+       do_gettimeofday(&start_tv);
+       do {
+               struct ring_buffer_event *event;
+               int *entry;
+
+               event = ring_buffer_lock_reserve(buffer, 10);
+               if (!event) {
+                       missed++;
+               } else {
+                       hit++;
+                       entry = ring_buffer_event_data(event);
+                       *entry = smp_processor_id();
+                       ring_buffer_unlock_commit(buffer, event);
+               }
+               do_gettimeofday(&end_tv);
+
+               cnt++;
+               if (consumer && !(cnt % wakeup_interval))
+                       wake_up_process(consumer);
+
+#ifndef CONFIG_PREEMPT
+               /*
+                * If we are a non preempt kernel, the 10 second run will
+                * stop everything while it runs. Instead, we will call
+                * cond_resched and also add any time that was lost by a
+                * rescedule.
+                *
+                * Do a cond resched at the same frequency we would wake up
+                * the reader.
+                */
+               if (cnt % wakeup_interval)
+                       cond_resched();
+#endif
+
+       } while (end_tv.tv_sec < (start_tv.tv_sec + RUN_TIME) && !kill_test);
+       pr_info("End ring buffer hammer\n");
+
+       if (consumer) {
+               /* Init both completions here to avoid races */
+               init_completion(&read_start);
+               init_completion(&read_done);
+               /* the completions must be visible before the finish var */
+               smp_wmb();
+               reader_finish = 1;
+               /* finish var visible before waking up the consumer */
+               smp_wmb();
+               wake_up_process(consumer);
+               wait_for_completion(&read_done);
+       }
+
+       time = end_tv.tv_sec - start_tv.tv_sec;
+       time *= USEC_PER_SEC;
+       time += (long long)((long)end_tv.tv_usec - (long)start_tv.tv_usec);
+
+       entries = ring_buffer_entries(buffer);
+       overruns = ring_buffer_overruns(buffer);
+
+       if (kill_test)
+               pr_info("ERROR!\n");
+       pr_info("Time:     %lld (usecs)\n", time);
+       pr_info("Overruns: %lld\n", overruns);
+       if (disable_reader)
+               pr_info("Read:     (reader disabled)\n");
+       else
+               pr_info("Read:     %ld  (by %s)\n", read,
+                       read_events ? "events" : "pages");
+       pr_info("Entries:  %lld\n", entries);
+       pr_info("Total:    %lld\n", entries + overruns + read);
+       pr_info("Missed:   %ld\n", missed);
+       pr_info("Hit:      %ld\n", hit);
+
+       /* Convert time from usecs to millisecs */
+       do_div(time, USEC_PER_MSEC);
+       if (time)
+               hit /= (long)time;
+       else
+               pr_info("TIME IS ZERO??\n");
+
+       pr_info("Entries per millisec: %ld\n", hit);
+
+       if (hit) {
+               /* Calculate the average time in nanosecs */
+               avg = NSEC_PER_MSEC / hit;
+               pr_info("%ld ns per entry\n", avg);
+       }
+
+       if (missed) {
+               if (time)
+                       missed /= (long)time;
+
+               pr_info("Total iterations per millisec: %ld\n", hit + missed);
+
+               /* it is possible that hit + missed will overflow and be zero */
+               if (!(hit + missed)) {
+                       pr_info("hit + missed overflowed and totalled zero!\n");
+                       hit--; /* make it non zero */
+               }
+
+               /* Caculate the average time in nanosecs */
+               avg = NSEC_PER_MSEC / (hit + missed);
+               pr_info("%ld ns per entry\n", avg);
+       }
+}
+
+static void wait_to_die(void)
+{
+       set_current_state(TASK_INTERRUPTIBLE);
+       while (!kthread_should_stop()) {
+               schedule();
+               set_current_state(TASK_INTERRUPTIBLE);
+       }
+       __set_current_state(TASK_RUNNING);
+}
+
+static int ring_buffer_consumer_thread(void *arg)
+{
+       while (!kthread_should_stop() && !kill_test) {
+               complete(&read_start);
+
+               ring_buffer_consumer();
+
+               set_current_state(TASK_INTERRUPTIBLE);
+               if (kthread_should_stop() || kill_test)
+                       break;
+
+               schedule();
+               __set_current_state(TASK_RUNNING);
+       }
+       __set_current_state(TASK_RUNNING);
+
+       if (kill_test)
+               wait_to_die();
+
+       return 0;
+}
+
+static int ring_buffer_producer_thread(void *arg)
+{
+       init_completion(&read_start);
+
+       while (!kthread_should_stop() && !kill_test) {
+               ring_buffer_reset(buffer);
+
+               if (consumer) {
+                       smp_wmb();
+                       wake_up_process(consumer);
+                       wait_for_completion(&read_start);
+               }
+
+               ring_buffer_producer();
+
+               pr_info("Sleeping for 10 secs\n");
+               set_current_state(TASK_INTERRUPTIBLE);
+               schedule_timeout(HZ * SLEEP_TIME);
+               __set_current_state(TASK_RUNNING);
+       }
+
+       if (kill_test)
+               wait_to_die();
+
+       return 0;
+}
+
+static int __init ring_buffer_benchmark_init(void)
+{
+       int ret;
+
+       /* make a one meg buffer in overwite mode */
+       buffer = ring_buffer_alloc(1000000, RB_FL_OVERWRITE);
+       if (!buffer)
+               return -ENOMEM;
+
+       if (!disable_reader) {
+               consumer = kthread_create(ring_buffer_consumer_thread,
+                                         NULL, "rb_consumer");
+               ret = PTR_ERR(consumer);
+               if (IS_ERR(consumer))
+                       goto out_fail;
+       }
+
+       producer = kthread_run(ring_buffer_producer_thread,
+                              NULL, "rb_producer");
+       ret = PTR_ERR(producer);
+
+       if (IS_ERR(producer))
+               goto out_kill;
+
+       return 0;
+
+ out_kill:
+       if (consumer)
+               kthread_stop(consumer);
+
+ out_fail:
+       ring_buffer_free(buffer);
+       return ret;
+}
+
+static void __exit ring_buffer_benchmark_exit(void)
+{
+       kthread_stop(producer);
+       if (consumer)
+               kthread_stop(consumer);
+       ring_buffer_free(buffer);
+}
+
+module_init(ring_buffer_benchmark_init);
+module_exit(ring_buffer_benchmark_exit);
+
+MODULE_AUTHOR("Steven Rostedt");
+MODULE_DESCRIPTION("ring_buffer_benchmark");
+MODULE_LICENSE("GPL");
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c

index cda81ec58d9fe1e723388826de8966b274b71d0a..8acd9b81a5d76046ee52c9d1dc9224cc43edb1c2 100644 (file)
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -171,6 +171,13 @@ static struct trace_array  global_trace;
  
  static DEFINE_PER_CPU(struct trace_array_cpu, global_trace_cpu);
  
+int filter_current_check_discard(struct ftrace_event_call *call, void *rec,
+                                struct ring_buffer_event *event)
+{
+       return filter_check_discard(call, rec, global_trace.buffer, event);
+}
+EXPORT_SYMBOL_GPL(filter_current_check_discard);
+
  cycle_t ftrace_now(int cpu)
  {
         u64 ts;
@@ -255,7 +262,8 @@ static DECLARE_WAIT_QUEUE_HEAD(trace_wait);
  
  /* trace_flags holds trace_options default values */
  unsigned long trace_flags = TRACE_ITER_PRINT_PARENT | TRACE_ITER_PRINTK |
-       TRACE_ITER_ANNOTATE | TRACE_ITER_CONTEXT_INFO | TRACE_ITER_SLEEP_TIME;
+       TRACE_ITER_ANNOTATE | TRACE_ITER_CONTEXT_INFO | TRACE_ITER_SLEEP_TIME |
+       TRACE_ITER_GRAPH_TIME;
  
  /**
   * trace_wake_up - wake up tasks waiting for trace input
@@ -317,6 +325,7 @@ static const char *trace_options[] = {
         "latency-format",
         "global-clock",
         "sleep-time",
+       "graph-time",
         NULL
  };
  
@@ -402,17 +411,6 @@ static ssize_t trace_seq_to_buffer(struct trace_seq *s, void *buf, size_t cnt)
         return cnt;
  }
  
-static void
-trace_print_seq(struct seq_file *m, struct trace_seq *s)
-{
-       int len = s->len >= PAGE_SIZE ? PAGE_SIZE - 1 : s->len;
-
-       s->buffer[len] = 0;
-       seq_puts(m, s->buffer);
-
-       trace_seq_init(s);
-}
-
  /**
   * update_max_tr - snapshot all trace buffers from global_trace to max_tr
   * @tr: tracer
@@ -641,6 +639,16 @@ void tracing_reset_online_cpus(struct trace_array *tr)
                 tracing_reset(tr, cpu);
  }
  
+void tracing_reset_current(int cpu)
+{
+       tracing_reset(&global_trace, cpu);
+}
+
+void tracing_reset_current_online_cpus(void)
+{
+       tracing_reset_online_cpus(&global_trace);
+}
+
  #define SAVED_CMDLINES 128
  #define NO_CMDLINE_MAP UINT_MAX
  static unsigned map_pid_to_cmdline[PID_MAX_DEFAULT+1];
@@ -800,6 +808,7 @@ void trace_find_cmdline(int pid, char comm[])
                 return;
         }
  
+       preempt_disable();
         __raw_spin_lock(&trace_cmdline_lock);
         map = map_pid_to_cmdline[pid];
         if (map != NO_CMDLINE_MAP)
@@ -808,6 +817,7 @@ void trace_find_cmdline(int pid, char comm[])
                 strcpy(comm, "<...>");
  
         __raw_spin_unlock(&trace_cmdline_lock);
+       preempt_enable();
  }
  
  void tracing_record_cmdline(struct task_struct *tsk)
@@ -840,7 +850,7 @@ tracing_generic_entry_update(struct trace_entry *entry, unsigned long flags,
  }
  
  struct ring_buffer_event *trace_buffer_lock_reserve(struct trace_array *tr,
-                                                   unsigned char type,
+                                                   int type,
                                                     unsigned long len,
                                                     unsigned long flags, int pc)
  {
@@ -883,30 +893,40 @@ void trace_buffer_unlock_commit(struct trace_array *tr,
  }
  
  struct ring_buffer_event *
-trace_current_buffer_lock_reserve(unsigned char type, unsigned long len,
+trace_current_buffer_lock_reserve(int type, unsigned long len,
                                   unsigned long flags, int pc)
  {
         return trace_buffer_lock_reserve(&global_trace,
                                          type, len, flags, pc);
  }
+EXPORT_SYMBOL_GPL(trace_current_buffer_lock_reserve);
  
  void trace_current_buffer_unlock_commit(struct ring_buffer_event *event,
                                         unsigned long flags, int pc)
  {
-       return __trace_buffer_unlock_commit(&global_trace, event, flags, pc, 1);
+       __trace_buffer_unlock_commit(&global_trace, event, flags, pc, 1);
  }
+EXPORT_SYMBOL_GPL(trace_current_buffer_unlock_commit);
  
  void trace_nowake_buffer_unlock_commit(struct ring_buffer_event *event,
                                         unsigned long flags, int pc)
  {
-       return __trace_buffer_unlock_commit(&global_trace, event, flags, pc, 0);
+       __trace_buffer_unlock_commit(&global_trace, event, flags, pc, 0);
+}
+EXPORT_SYMBOL_GPL(trace_nowake_buffer_unlock_commit);
+
+void trace_current_buffer_discard_commit(struct ring_buffer_event *event)
+{
+       ring_buffer_discard_commit(global_trace.buffer, event);
  }
+EXPORT_SYMBOL_GPL(trace_current_buffer_discard_commit);
  
  void
  trace_function(struct trace_array *tr,
                unsigned long ip, unsigned long parent_ip, unsigned long flags,
                int pc)
  {
+       struct ftrace_event_call *call = &event_function;
         struct ring_buffer_event *event;
         struct ftrace_entry *entry;
  
@@ -921,7 +941,9 @@ trace_function(struct trace_array *tr,
         entry   = ring_buffer_event_data(event);
         entry->ip                       = ip;
         entry->parent_ip                = parent_ip;
-       ring_buffer_unlock_commit(tr->buffer, event);
+
+       if (!filter_check_discard(call, entry, tr->buffer, event))
+               ring_buffer_unlock_commit(tr->buffer, event);
  }
  
  #ifdef CONFIG_FUNCTION_GRAPH_TRACER
@@ -930,6 +952,7 @@ static int __trace_graph_entry(struct trace_array *tr,
                                 unsigned long flags,
                                 int pc)
  {
+       struct ftrace_event_call *call = &event_funcgraph_entry;
         struct ring_buffer_event *event;
         struct ftrace_graph_ent_entry *entry;
  
@@ -942,7 +965,8 @@ static int __trace_graph_entry(struct trace_array *tr,
                 return 0;
         entry   = ring_buffer_event_data(event);
         entry->graph_ent                        = *trace;
-       ring_buffer_unlock_commit(global_trace.buffer, event);
+       if (!filter_current_check_discard(call, entry, event))
+               ring_buffer_unlock_commit(global_trace.buffer, event);
  
         return 1;
  }
@@ -952,6 +976,7 @@ static void __trace_graph_return(struct trace_array *tr,
                                 unsigned long flags,
                                 int pc)
  {
+       struct ftrace_event_call *call = &event_funcgraph_exit;
         struct ring_buffer_event *event;
         struct ftrace_graph_ret_entry *entry;
  
@@ -964,7 +989,8 @@ static void __trace_graph_return(struct trace_array *tr,
                 return;
         entry   = ring_buffer_event_data(event);
         entry->ret                              = *trace;
-       ring_buffer_unlock_commit(global_trace.buffer, event);
+       if (!filter_current_check_discard(call, entry, event))
+               ring_buffer_unlock_commit(global_trace.buffer, event);
  }
  #endif
  
@@ -982,6 +1008,7 @@ static void __ftrace_trace_stack(struct trace_array *tr,
                                  int skip, int pc)
  {
  #ifdef CONFIG_STACKTRACE
+       struct ftrace_event_call *call = &event_kernel_stack;
         struct ring_buffer_event *event;
         struct stack_entry *entry;
         struct stack_trace trace;
@@ -999,7 +1026,8 @@ static void __ftrace_trace_stack(struct trace_array *tr,
         trace.entries           = entry->caller;
  
         save_stack_trace(&trace);
-       ring_buffer_unlock_commit(tr->buffer, event);
+       if (!filter_check_discard(call, entry, tr->buffer, event))
+               ring_buffer_unlock_commit(tr->buffer, event);
  #endif
  }
  
@@ -1024,6 +1052,7 @@ static void ftrace_trace_userstack(struct trace_array *tr,
                                    unsigned long flags, int pc)
  {
  #ifdef CONFIG_STACKTRACE
+       struct ftrace_event_call *call = &event_user_stack;
         struct ring_buffer_event *event;
         struct userstack_entry *entry;
         struct stack_trace trace;
@@ -1045,7 +1074,8 @@ static void ftrace_trace_userstack(struct trace_array *tr,
         trace.entries           = entry->caller;
  
         save_stack_trace_user(&trace);
-       ring_buffer_unlock_commit(tr->buffer, event);
+       if (!filter_check_discard(call, entry, tr->buffer, event))
+               ring_buffer_unlock_commit(tr->buffer, event);
  #endif
  }
  
@@ -1089,6 +1119,7 @@ tracing_sched_switch_trace(struct trace_array *tr,
                            struct task_struct *next,
                            unsigned long flags, int pc)
  {
+       struct ftrace_event_call *call = &event_context_switch;
         struct ring_buffer_event *event;
         struct ctx_switch_entry *entry;
  
@@ -1104,7 +1135,9 @@ tracing_sched_switch_trace(struct trace_array *tr,
         entry->next_prio                = next->prio;
         entry->next_state               = next->state;
         entry->next_cpu = task_cpu(next);
-       trace_buffer_unlock_commit(tr, event, flags, pc);
+
+       if (!filter_check_discard(call, entry, tr->buffer, event))
+               trace_buffer_unlock_commit(tr, event, flags, pc);
  }
  
  void
@@ -1113,6 +1146,7 @@ tracing_sched_wakeup_trace(struct trace_array *tr,
                            struct task_struct *curr,
                            unsigned long flags, int pc)
  {
+       struct ftrace_event_call *call = &event_wakeup;
         struct ring_buffer_event *event;
         struct ctx_switch_entry *entry;
  
@@ -1129,7 +1163,8 @@ tracing_sched_wakeup_trace(struct trace_array *tr,
         entry->next_state               = wakee->state;
         entry->next_cpu                 = task_cpu(wakee);
  
-       ring_buffer_unlock_commit(tr->buffer, event);
+       if (!filter_check_discard(call, entry, tr->buffer, event))
+               ring_buffer_unlock_commit(tr->buffer, event);
         ftrace_trace_stack(tr, flags, 6, pc);
         ftrace_trace_userstack(tr, flags, pc);
  }
@@ -1230,11 +1265,13 @@ int trace_vbprintk(unsigned long ip, const char *fmt, va_list args)
                 (raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED;
         static u32 trace_buf[TRACE_BUF_SIZE];
  
+       struct ftrace_event_call *call = &event_bprint;
         struct ring_buffer_event *event;
         struct trace_array *tr = &global_trace;
         struct trace_array_cpu *data;
         struct bprint_entry *entry;
         unsigned long flags;
+       int disable;
         int resched;
         int cpu, len = 0, size, pc;
  
@@ -1249,7 +1286,8 @@ int trace_vbprintk(unsigned long ip, const char *fmt, va_list args)
         cpu = raw_smp_processor_id();
         data = tr->data[cpu];
  
-       if (unlikely(atomic_read(&data->disabled)))
+       disable = atomic_inc_return(&data->disabled);
+       if (unlikely(disable != 1))
                 goto out;
  
         /* Lockdep uses trace_printk for lock tracing */
@@ -1269,13 +1307,15 @@ int trace_vbprintk(unsigned long ip, const char *fmt, va_list args)
         entry->fmt                      = fmt;
  
         memcpy(entry->buf, trace_buf, sizeof(u32) * len);
-       ring_buffer_unlock_commit(tr->buffer, event);
+       if (!filter_check_discard(call, entry, tr->buffer, event))
+               ring_buffer_unlock_commit(tr->buffer, event);
  
  out_unlock:
         __raw_spin_unlock(&trace_buf_lock);
         local_irq_restore(flags);
  
  out:
+       atomic_dec_return(&data->disabled);
         ftrace_preempt_enable(resched);
         unpause_graph_tracing();
  
@@ -1288,12 +1328,14 @@ int trace_vprintk(unsigned long ip, const char *fmt, va_list args)
         static raw_spinlock_t trace_buf_lock = __RAW_SPIN_LOCK_UNLOCKED;
         static char trace_buf[TRACE_BUF_SIZE];
  
+       struct ftrace_event_call *call = &event_print;
         struct ring_buffer_event *event;
         struct trace_array *tr = &global_trace;
         struct trace_array_cpu *data;
         int cpu, len = 0, size, pc;
         struct print_entry *entry;
         unsigned long irq_flags;
+       int disable;
  
         if (tracing_disabled || tracing_selftest_running)
                 return 0;
@@ -1303,7 +1345,8 @@ int trace_vprintk(unsigned long ip, const char *fmt, va_list args)
         cpu = raw_smp_processor_id();
         data = tr->data[cpu];
  
-       if (unlikely(atomic_read(&data->disabled)))
+       disable = atomic_inc_return(&data->disabled);
+       if (unlikely(disable != 1))
                 goto out;
  
         pause_graph_tracing();
@@ -1323,13 +1366,15 @@ int trace_vprintk(unsigned long ip, const char *fmt, va_list args)
  
         memcpy(&entry->buf, trace_buf, len);
         entry->buf[len] = 0;
-       ring_buffer_unlock_commit(tr->buffer, event);
+       if (!filter_check_discard(call, entry, tr->buffer, event))
+               ring_buffer_unlock_commit(tr->buffer, event);
  
   out_unlock:
         __raw_spin_unlock(&trace_buf_lock);
         raw_local_irq_restore(irq_flags);
         unpause_graph_tracing();
   out:
+       atomic_dec_return(&data->disabled);
         preempt_enable_notrace();
  
         return len;
@@ -1526,12 +1571,14 @@ static void *s_start(struct seq_file *m, loff_t *pos)
                 p = s_next(m, p, &l);
         }
  
+       trace_event_read_lock();
         return p;
  }
  
  static void s_stop(struct seq_file *m, void *p)
  {
         atomic_dec(&trace_record_cmdline_disabled);
+       trace_event_read_unlock();
  }
  
  static void print_lat_help_header(struct seq_file *m)
@@ -1774,6 +1821,7 @@ static int trace_empty(struct trace_iterator *iter)
         return 1;
  }
  
+/*  Called with trace_event_read_lock() held. */
  static enum print_line_t print_trace_line(struct trace_iterator *iter)
  {
         enum print_line_t ret;
@@ -2396,6 +2444,56 @@ static const struct file_operations tracing_readme_fops = {
         .read           = tracing_readme_read,
  };
  
+static ssize_t
+tracing_saved_cmdlines_read(struct file *file, char __user *ubuf,
+                               size_t cnt, loff_t *ppos)
+{
+       char *buf_comm;
+       char *file_buf;
+       char *buf;
+       int len = 0;
+       int pid;
+       int i;
+
+       file_buf = kmalloc(SAVED_CMDLINES*(16+TASK_COMM_LEN), GFP_KERNEL);
+       if (!file_buf)
+               return -ENOMEM;
+
+       buf_comm = kmalloc(TASK_COMM_LEN, GFP_KERNEL);
+       if (!buf_comm) {
+               kfree(file_buf);
+               return -ENOMEM;
+       }
+
+       buf = file_buf;
+
+       for (i = 0; i < SAVED_CMDLINES; i++) {
+               int r;
+
+               pid = map_cmdline_to_pid[i];
+               if (pid == -1 || pid == NO_CMDLINE_MAP)
+                       continue;
+
+               trace_find_cmdline(pid, buf_comm);
+               r = sprintf(buf, "%d %s\n", pid, buf_comm);
+               buf += r;
+               len += r;
+       }
+
+       len = simple_read_from_buffer(ubuf, cnt, ppos,
+                                     file_buf, len);
+
+       kfree(file_buf);
+       kfree(buf_comm);
+
+       return len;
+}
+
+static const struct file_operations tracing_saved_cmdlines_fops = {
+    .open       = tracing_open_generic,
+    .read       = tracing_saved_cmdlines_read,
+};
+
  static ssize_t
  tracing_ctrl_read(struct file *filp, char __user *ubuf,
                   size_t cnt, loff_t *ppos)
@@ -2728,6 +2826,9 @@ static int tracing_open_pipe(struct inode *inode, struct file *filp)
         /* trace pipe does not show start of buffer */
         cpumask_setall(iter->started);
  
+       if (trace_flags & TRACE_ITER_LATENCY_FMT)
+               iter->iter_flags |= TRACE_FILE_LAT_FMT;
+
         iter->cpu_file = cpu_file;
         iter->tr = &global_trace;
         mutex_init(&iter->mutex);
@@ -2915,6 +3016,7 @@ waitagain:
                offsetof(struct trace_iterator, seq));
         iter->pos = -1;
  
+       trace_event_read_lock();
         while (find_next_entry_inc(iter) != NULL) {
                 enum print_line_t ret;
                 int len = iter->seq.len;
@@ -2931,6 +3033,7 @@ waitagain:
                 if (iter->seq.len >= cnt)
                         break;
         }
+       trace_event_read_unlock();
  
         /* Now copy what we have to the user */
         sret = trace_seq_to_user(&iter->seq, ubuf, cnt);
@@ -3053,6 +3156,8 @@ static ssize_t tracing_splice_read_pipe(struct file *filp,
                 goto out_err;
         }
  
+       trace_event_read_lock();
+
         /* Fill as many pages as possible. */
         for (i = 0, rem = len; i < PIPE_BUFFERS && rem; i++) {
                 pages[i] = alloc_page(GFP_KERNEL);
@@ -3075,6 +3180,7 @@ static ssize_t tracing_splice_read_pipe(struct file *filp,
                 trace_seq_init(&iter->seq);
         }
  
+       trace_event_read_unlock();
         mutex_unlock(&iter->mutex);
  
         spd.nr_pages = i;
@@ -3425,7 +3531,7 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos,
                 .spd_release    = buffer_spd_release,
         };
         struct buffer_ref *ref;
-       int size, i;
+       int entries, size, i;
         size_t ret;
  
         if (*ppos & (PAGE_SIZE - 1)) {
@@ -3440,7 +3546,9 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos,
                 len &= PAGE_MASK;
         }
  
-       for (i = 0; i < PIPE_BUFFERS && len; i++, len -= PAGE_SIZE) {
+       entries = ring_buffer_entries_cpu(info->tr->buffer, info->cpu);
+
+       for (i = 0; i < PIPE_BUFFERS && len && entries; i++, len -= PAGE_SIZE) {
                 struct page *page;
                 int r;
  
@@ -3457,7 +3565,7 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos,
                 }
  
                 r = ring_buffer_read_page(ref->buffer, &ref->page,
-                                         len, info->cpu, 0);
+                                         len, info->cpu, 1);
                 if (r < 0) {
                         ring_buffer_free_read_page(ref->buffer,
                                                    ref->page);
@@ -3481,6 +3589,8 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos,
                 spd.partial[i].private = (unsigned long)ref;
                 spd.nr_pages++;
                 *ppos += PAGE_SIZE;
+
+               entries = ring_buffer_entries_cpu(info->tr->buffer, info->cpu);
         }
  
         spd.nr_pages = i;
@@ -3508,6 +3618,45 @@ static const struct file_operations tracing_buffers_fops = {
         .llseek         = no_llseek,
  };
  
+static ssize_t
+tracing_stats_read(struct file *filp, char __user *ubuf,
+                  size_t count, loff_t *ppos)
+{
+       unsigned long cpu = (unsigned long)filp->private_data;
+       struct trace_array *tr = &global_trace;
+       struct trace_seq *s;
+       unsigned long cnt;
+
+       s = kmalloc(sizeof(*s), GFP_ATOMIC);
+       if (!s)
+               return ENOMEM;
+
+       trace_seq_init(s);
+
+       cnt = ring_buffer_entries_cpu(tr->buffer, cpu);
+       trace_seq_printf(s, "entries: %ld\n", cnt);
+
+       cnt = ring_buffer_overrun_cpu(tr->buffer, cpu);
+       trace_seq_printf(s, "overrun: %ld\n", cnt);
+
+       cnt = ring_buffer_commit_overrun_cpu(tr->buffer, cpu);
+       trace_seq_printf(s, "commit overrun: %ld\n", cnt);
+
+       cnt = ring_buffer_nmi_dropped_cpu(tr->buffer, cpu);
+       trace_seq_printf(s, "nmi dropped: %ld\n", cnt);
+
+       count = simple_read_from_buffer(ubuf, count, ppos, s->buffer, s->len);
+
+       kfree(s);
+
+       return count;
+}
+
+static const struct file_operations tracing_stats_fops = {
+       .open           = tracing_open_generic,
+       .read           = tracing_stats_read,
+};
+
  #ifdef CONFIG_DYNAMIC_FTRACE
  
  int __weak ftrace_arch_read_dyn_info(char *buf, int size)
@@ -3597,7 +3746,7 @@ struct dentry *tracing_dentry_percpu(void)
  static void tracing_init_debugfs_percpu(long cpu)
  {
         struct dentry *d_percpu = tracing_dentry_percpu();
-       struct dentry *entry, *d_cpu;
+       struct dentry *d_cpu;
         /* strlen(cpu) + MAX(log10(cpu)) + '\0' */
         char cpu_dir[7];
  
@@ -3612,21 +3761,18 @@ static void tracing_init_debugfs_percpu(long cpu)
         }
  
         /* per cpu trace_pipe */
-       entry = debugfs_create_file("trace_pipe", 0444, d_cpu,
-                               (void *) cpu, &tracing_pipe_fops);
-       if (!entry)
-               pr_warning("Could not create debugfs 'trace_pipe' entry\n");
+       trace_create_file("trace_pipe", 0444, d_cpu,
+                       (void *) cpu, &tracing_pipe_fops);
  
         /* per cpu trace */
-       entry = debugfs_create_file("trace", 0644, d_cpu,
-                               (void *) cpu, &tracing_fops);
-       if (!entry)
-               pr_warning("Could not create debugfs 'trace' entry\n");
+       trace_create_file("trace", 0644, d_cpu,
+                       (void *) cpu, &tracing_fops);
+
+       trace_create_file("trace_pipe_raw", 0444, d_cpu,
+                       (void *) cpu, &tracing_buffers_fops);
  
-       entry = debugfs_create_file("trace_pipe_raw", 0444, d_cpu,
-                                   (void *) cpu, &tracing_buffers_fops);
-       if (!entry)
-               pr_warning("Could not create debugfs 'trace_pipe_raw' entry\n");
+       trace_create_file("stats", 0444, d_cpu,
+                       (void *) cpu, &tracing_stats_fops);
  }
  
  #ifdef CONFIG_FTRACE_SELFTEST
@@ -3782,6 +3928,22 @@ static const struct file_operations trace_options_core_fops = {
         .write = trace_options_core_write,
  };
  
+struct dentry *trace_create_file(const char *name,
+                                mode_t mode,
+                                struct dentry *parent,
+                                void *data,
+                                const struct file_operations *fops)
+{
+       struct dentry *ret;
+
+       ret = debugfs_create_file(name, mode, parent, data, fops);
+       if (!ret)
+               pr_warning("Could not create debugfs '%s' entry\n", name);
+
+       return ret;
+}
+
+
  static struct dentry *trace_options_init_dentry(void)
  {
         struct dentry *d_tracer;
@@ -3809,7 +3971,6 @@ create_trace_option_file(struct trace_option_dentry *topt,
                          struct tracer_opt *opt)
  {
         struct dentry *t_options;
-       struct dentry *entry;
  
         t_options = trace_options_init_dentry();
         if (!t_options)
@@ -3818,11 +3979,9 @@ create_trace_option_file(struct trace_option_dentry *topt,
         topt->flags = flags;
         topt->opt = opt;
  
-       entry = debugfs_create_file(opt->name, 0644, t_options, topt,
+       topt->entry = trace_create_file(opt->name, 0644, t_options, topt,
                                     &trace_options_fops);
  
-       topt->entry = entry;
-
  }
  
  static struct trace_option_dentry *
@@ -3877,123 +4036,84 @@ static struct dentry *
  create_trace_option_core_file(const char *option, long index)
  {
         struct dentry *t_options;
-       struct dentry *entry;
  
         t_options = trace_options_init_dentry();
         if (!t_options)
                 return NULL;
  
-       entry = debugfs_create_file(option, 0644, t_options, (void *)index,
+       return trace_create_file(option, 0644, t_options, (void *)index,
                                     &trace_options_core_fops);
-
-       return entry;
  }
  
  static __init void create_trace_options_dir(void)
  {
         struct dentry *t_options;
-       struct dentry *entry;
         int i;
  
         t_options = trace_options_init_dentry();
         if (!t_options)
                 return;
  
-       for (i = 0; trace_options[i]; i++) {
-               entry = create_trace_option_core_file(trace_options[i], i);
-               if (!entry)
-                       pr_warning("Could not create debugfs %s entry\n",
-                                  trace_options[i]);
-       }
+       for (i = 0; trace_options[i]; i++)
+               create_trace_option_core_file(trace_options[i], i);
  }
  
  static __init int tracer_init_debugfs(void)
  {
         struct dentry *d_tracer;
-       struct dentry *entry;
         int cpu;
  
         d_tracer = tracing_init_dentry();
  
-       entry = debugfs_create_file("tracing_enabled", 0644, d_tracer,
-                                   &global_trace, &tracing_ctrl_fops);
-       if (!entry)
-               pr_warning("Could not create debugfs 'tracing_enabled' entry\n");
+       trace_create_file("tracing_enabled", 0644, d_tracer,
+                       &global_trace, &tracing_ctrl_fops);
  
-       entry = debugfs_create_file("trace_options", 0644, d_tracer,
-                                   NULL, &tracing_iter_fops);
-       if (!entry)
-               pr_warning("Could not create debugfs 'trace_options' entry\n");
+       trace_create_file("trace_options", 0644, d_tracer,
+                       NULL, &tracing_iter_fops);
  
-       create_trace_options_dir();
+       trace_create_file("tracing_cpumask", 0644, d_tracer,
+                       NULL, &tracing_cpumask_fops);
+
+       trace_create_file("trace", 0644, d_tracer,
+                       (void *) TRACE_PIPE_ALL_CPU, &tracing_fops);
  
-       entry = debugfs_create_file("tracing_cpumask", 0644, d_tracer,
-                                   NULL, &tracing_cpumask_fops);
-       if (!entry)
-               pr_warning("Could not create debugfs 'tracing_cpumask' entry\n");
-
-       entry = debugfs_create_file("trace", 0644, d_tracer,
-                                (void *) TRACE_PIPE_ALL_CPU, &tracing_fops);
-       if (!entry)
-               pr_warning("Could not create debugfs 'trace' entry\n");
-
-       entry = debugfs_create_file("available_tracers", 0444, d_tracer,
-                                   &global_trace, &show_traces_fops);
-       if (!entry)
-               pr_warning("Could not create debugfs 'available_tracers' entry\n");
-
-       entry = debugfs_create_file("current_tracer", 0444, d_tracer,
-                                   &global_trace, &set_tracer_fops);
-       if (!entry)
-               pr_warning("Could not create debugfs 'current_tracer' entry\n");
-
-       entry = debugfs_create_file("tracing_max_latency", 0644, d_tracer,
-                                   &tracing_max_latency,
-                                   &tracing_max_lat_fops);
-       if (!entry)
-               pr_warning("Could not create debugfs "
-                          "'tracing_max_latency' entry\n");
-
-       entry = debugfs_create_file("tracing_thresh", 0644, d_tracer,
-                                   &tracing_thresh, &tracing_max_lat_fops);
-       if (!entry)
-               pr_warning("Could not create debugfs "
-                          "'tracing_thresh' entry\n");
-       entry = debugfs_create_file("README", 0644, d_tracer,
-                                   NULL, &tracing_readme_fops);
-       if (!entry)
-               pr_warning("Could not create debugfs 'README' entry\n");
-
-       entry = debugfs_create_file("trace_pipe", 0444, d_tracer,
+       trace_create_file("available_tracers", 0444, d_tracer,
+                       &global_trace, &show_traces_fops);
+
+       trace_create_file("current_tracer", 0644, d_tracer,
+                       &global_trace, &set_tracer_fops);
+
+       trace_create_file("tracing_max_latency", 0644, d_tracer,
+                       &tracing_max_latency, &tracing_max_lat_fops);
+
+       trace_create_file("tracing_thresh", 0644, d_tracer,
+                       &tracing_thresh, &tracing_max_lat_fops);
+
+       trace_create_file("README", 0444, d_tracer,
+                       NULL, &tracing_readme_fops);
+
+       trace_create_file("trace_pipe", 0444, d_tracer,
                         (void *) TRACE_PIPE_ALL_CPU, &tracing_pipe_fops);
-       if (!entry)
-               pr_warning("Could not create debugfs "
-                          "'trace_pipe' entry\n");
-
-       entry = debugfs_create_file("buffer_size_kb", 0644, d_tracer,
-                                   &global_trace, &tracing_entries_fops);
-       if (!entry)
-               pr_warning("Could not create debugfs "
-                          "'buffer_size_kb' entry\n");
-
-       entry = debugfs_create_file("trace_marker", 0220, d_tracer,
-                                   NULL, &tracing_mark_fops);
-       if (!entry)
-               pr_warning("Could not create debugfs "
-                          "'trace_marker' entry\n");
+
+       trace_create_file("buffer_size_kb", 0644, d_tracer,
+                       &global_trace, &tracing_entries_fops);
+
+       trace_create_file("trace_marker", 0220, d_tracer,
+                       NULL, &tracing_mark_fops);
+
+       trace_create_file("saved_cmdlines", 0444, d_tracer,
+                       NULL, &tracing_saved_cmdlines_fops);
  
  #ifdef CONFIG_DYNAMIC_FTRACE
-       entry = debugfs_create_file("dyn_ftrace_total_info", 0444, d_tracer,
-                                   &ftrace_update_tot_cnt,
-                                   &tracing_dyn_info_fops);
-       if (!entry)
-               pr_warning("Could not create debugfs "
-                          "'dyn_ftrace_total_info' entry\n");
+       trace_create_file("dyn_ftrace_total_info", 0444, d_tracer,
+                       &ftrace_update_tot_cnt, &tracing_dyn_info_fops);
  #endif
  #ifdef CONFIG_SYSPROF_TRACER
         init_tracer_sysprof_debugfs(d_tracer);
  #endif
  
+       create_trace_options_dir();
+
         for_each_tracing_cpu(cpu)
                 tracing_init_debugfs_percpu(cpu);
  
@@ -4064,7 +4184,8 @@ trace_printk_seq(struct trace_seq *s)
  
  static void __ftrace_dump(bool disable_tracing)
  {
-       static DEFINE_SPINLOCK(ftrace_dump_lock);
+       static raw_spinlock_t ftrace_dump_lock =
+               (raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED;
         /* use static because iter can be a bit big for the stack */
         static struct trace_iterator iter;
         unsigned int old_userobj;
@@ -4073,7 +4194,8 @@ static void __ftrace_dump(bool disable_tracing)
         int cnt = 0, cpu;
  
         /* only one dump */
-       spin_lock_irqsave(&ftrace_dump_lock, flags);
+       local_irq_save(flags);
+       __raw_spin_lock(&ftrace_dump_lock);
         if (dump_ran)
                 goto out;
  
@@ -4145,7 +4267,8 @@ static void __ftrace_dump(bool disable_tracing)
         }
  
   out:
-       spin_unlock_irqrestore(&ftrace_dump_lock, flags);
+       __raw_spin_unlock(&ftrace_dump_lock);
+       local_irq_restore(flags);
  }
  
  /* By default: disable tracing after the dump */
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h

index e685ac2b2ba10f1dcf24fef92180d94b75f76367..6e735d4771f8ad9ddd971ee1806f904d7e44e176 100644 (file)
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -9,9 +9,12 @@
  #include <linux/mmiotrace.h>
  #include <linux/ftrace.h>
  #include <trace/boot.h>
-#include <trace/kmemtrace.h>
+#include <linux/kmemtrace.h>
  #include <trace/power.h>
  
+#include <linux/trace_seq.h>
+#include <linux/ftrace_event.h>
+
  enum trace_type {
         __TRACE_FIRST_TYPE = 0,
  
@@ -41,20 +44,6 @@ enum trace_type {
         __TRACE_LAST_TYPE,
  };
  
-/*
- * The trace entry - the most basic unit of tracing. This is what
- * is printed in the end as a single line in the trace output, such as:
- *
- *     bash-15816 [01]   235.197585: idle_cpu <- irq_enter
- */
-struct trace_entry {
-       unsigned char           type;
-       unsigned char           flags;
-       unsigned char           preempt_count;
-       int                     pid;
-       int                     tgid;
-};
-
  /*
   * Function trace entry - function address and parent function addres:
   */
@@ -263,8 +252,6 @@ struct trace_array_cpu {
         char                    comm[TASK_COMM_LEN];
  };
  
-struct trace_iterator;
-
  /*
   * The trace array - an array of per-CPU trace arrays. This is the
   * highest level data structure that individual tracers deal with.
@@ -339,15 +326,6 @@ extern void __ftrace_bad_type(void);
                 __ftrace_bad_type();                                    \
         } while (0)
  
-/* Return values for print_line callback */
-enum print_line_t {
-       TRACE_TYPE_PARTIAL_LINE = 0,    /* Retry after flushing the seq */
-       TRACE_TYPE_HANDLED      = 1,
-       TRACE_TYPE_UNHANDLED    = 2,    /* Relay to other output functions */
-       TRACE_TYPE_NO_CONSUME   = 3     /* Handled but ask to not consume */
-};
-
-
  /*
   * An option specific to a tracer. This is a boolean value.
   * The bit is the bit index that sets its value on the
@@ -423,60 +401,30 @@ struct tracer {
         struct tracer_stat      *stats;
  };
  
-struct trace_seq {
-       unsigned char           buffer[PAGE_SIZE];
-       unsigned int            len;
-       unsigned int            readpos;
-};
-
-static inline void
-trace_seq_init(struct trace_seq *s)
-{
-       s->len = 0;
-       s->readpos = 0;
-}
-
  
  #define TRACE_PIPE_ALL_CPU     -1
  
-/*
- * Trace iterator - used by printout routines who present trace
- * results to users and which routines might sleep, etc:
- */
-struct trace_iterator {
-       struct trace_array      *tr;
-       struct tracer           *trace;
-       void                    *private;
-       int                     cpu_file;
-       struct mutex            mutex;
-       struct ring_buffer_iter *buffer_iter[NR_CPUS];
-
-       /* The below is zeroed out in pipe_read */
-       struct trace_seq        seq;
-       struct trace_entry      *ent;
-       int                     cpu;
-       u64                     ts;
-
-       unsigned long           iter_flags;
-       loff_t                  pos;
-       long                    idx;
-
-       cpumask_var_t           started;
-};
-
  int tracer_init(struct tracer *t, struct trace_array *tr);
  int tracing_is_enabled(void);
  void trace_wake_up(void);
  void tracing_reset(struct trace_array *tr, int cpu);
  void tracing_reset_online_cpus(struct trace_array *tr);
+void tracing_reset_current(int cpu);
+void tracing_reset_current_online_cpus(void);
  int tracing_open_generic(struct inode *inode, struct file *filp);
+struct dentry *trace_create_file(const char *name,
+                                mode_t mode,
+                                struct dentry *parent,
+                                void *data,
+                                const struct file_operations *fops);
+
  struct dentry *tracing_init_dentry(void);
  void init_tracer_sysprof_debugfs(struct dentry *d_tracer);
  
  struct ring_buffer_event;
  
  struct ring_buffer_event *trace_buffer_lock_reserve(struct trace_array *tr,
-                                                   unsigned char type,
+                                                   int type,
                                                     unsigned long len,
                                                     unsigned long flags,
                                                     int pc);
@@ -484,14 +432,6 @@ void trace_buffer_unlock_commit(struct trace_array *tr,
                                 struct ring_buffer_event *event,
                                 unsigned long flags, int pc);
  
-struct ring_buffer_event *
-trace_current_buffer_lock_reserve(unsigned char type, unsigned long len,
-                                 unsigned long flags, int pc);
-void trace_current_buffer_unlock_commit(struct ring_buffer_event *event,
-                                       unsigned long flags, int pc);
-void trace_nowake_buffer_unlock_commit(struct ring_buffer_event *event,
-                                       unsigned long flags, int pc);
-
  struct trace_entry *tracing_get_trace_entry(struct trace_array *tr,
                                                 struct trace_array_cpu *data);
  
@@ -514,7 +454,6 @@ void tracing_sched_switch_trace(struct trace_array *tr,
                                 struct task_struct *prev,
                                 struct task_struct *next,
                                 unsigned long flags, int pc);
-void tracing_record_cmdline(struct task_struct *tsk);
  
  void tracing_sched_wakeup_trace(struct trace_array *tr,
                                 struct task_struct *wakee,
@@ -599,6 +538,8 @@ extern int trace_selftest_startup_sysprof(struct tracer *trace,
                                                struct trace_array *tr);
  extern int trace_selftest_startup_branch(struct tracer *trace,
                                          struct trace_array *tr);
+extern int trace_selftest_startup_hw_branches(struct tracer *trace,
+                                             struct trace_array *tr);
  #endif /* CONFIG_FTRACE_STARTUP_TEST */
  
  extern void *head_page(struct trace_array_cpu *data);
@@ -613,6 +554,8 @@ extern unsigned long trace_flags;
  /* Standard output formatting function used for function return traces */
  #ifdef CONFIG_FUNCTION_GRAPH_TRACER
  extern enum print_line_t print_graph_function(struct trace_iterator *iter);
+extern enum print_line_t
+trace_print_graph_duration(unsigned long long duration, struct trace_seq *s);
  
  #ifdef CONFIG_DYNAMIC_FTRACE
  /* TODO: make this variable */
@@ -644,7 +587,6 @@ static inline int ftrace_graph_addr(unsigned long addr)
         return 1;
  }
  #endif /* CONFIG_DYNAMIC_FTRACE */
-
  #else /* CONFIG_FUNCTION_GRAPH_TRACER */
  static inline enum print_line_t
  print_graph_function(struct trace_iterator *iter)
@@ -692,6 +634,7 @@ enum trace_iterator_flags {
         TRACE_ITER_LATENCY_FMT          = 0x40000,
         TRACE_ITER_GLOBAL_CLK           = 0x80000,
         TRACE_ITER_SLEEP_TIME           = 0x100000,
+       TRACE_ITER_GRAPH_TIME           = 0x200000,
  };
  
  /*
@@ -790,103 +733,113 @@ struct ftrace_event_field {
         char                    *type;
         int                     offset;
         int                     size;
+       int                     is_signed;
  };
  
-struct ftrace_event_call {
-       char                    *name;
-       char                    *system;
-       struct dentry           *dir;
-       int                     enabled;
-       int                     (*regfunc)(void);
-       void                    (*unregfunc)(void);
-       int                     id;
-       int                     (*raw_init)(void);
-       int                     (*show_format)(struct trace_seq *s);
-       int                     (*define_fields)(void);
-       struct list_head        fields;
+struct event_filter {
+       int                     n_preds;
         struct filter_pred      **preds;
-
-#ifdef CONFIG_EVENT_PROFILE
-       atomic_t        profile_count;
-       int             (*profile_enable)(struct ftrace_event_call *);
-       void            (*profile_disable)(struct ftrace_event_call *);
-#endif
+       char                    *filter_string;
  };
  
  struct event_subsystem {
         struct list_head        list;
         const char              *name;
         struct dentry           *entry;
-       struct filter_pred      **preds;
+       void                    *filter;
  };
  
-#define events_for_each(event)                                         \
-       for (event = __start_ftrace_events;                             \
-            (unsigned long)event < (unsigned long)__stop_ftrace_events; \
-            event++)
-
-#define MAX_FILTER_PRED 8
-
  struct filter_pred;
  
-typedef int (*filter_pred_fn_t) (struct filter_pred *pred, void *event);
+typedef int (*filter_pred_fn_t) (struct filter_pred *pred, void *event,
+                                int val1, int val2);
  
  struct filter_pred {
         filter_pred_fn_t fn;
         u64 val;
-       char *str_val;
+       char str_val[MAX_FILTER_STR_VAL];
         int str_len;
         char *field_name;
         int offset;
         int not;
-       int or;
-       int compound;
-       int clear;
+       int op;
+       int pop_n;
  };
  
-int trace_define_field(struct ftrace_event_call *call, char *type,
-                      char *name, int offset, int size);
-extern void filter_free_pred(struct filter_pred *pred);
-extern void filter_print_preds(struct filter_pred **preds,
+extern void print_event_filter(struct ftrace_event_call *call,
                                struct trace_seq *s);
-extern int filter_parse(char **pbuf, struct filter_pred *pred);
-extern int filter_add_pred(struct ftrace_event_call *call,
-                          struct filter_pred *pred);
-extern void filter_free_preds(struct ftrace_event_call *call);
-extern int filter_match_preds(struct ftrace_event_call *call, void *rec);
-extern void filter_free_subsystem_preds(struct event_subsystem *system);
-extern int filter_add_subsystem_pred(struct event_subsystem *system,
-                                    struct filter_pred *pred);
-
-void event_trace_printk(unsigned long ip, const char *fmt, ...);
-extern struct ftrace_event_call __start_ftrace_events[];
-extern struct ftrace_event_call __stop_ftrace_events[];
-
-#define for_each_event(event)                                          \
-       for (event = __start_ftrace_events;                             \
-            (unsigned long)event < (unsigned long)__stop_ftrace_events; \
-            event++)
+extern int apply_event_filter(struct ftrace_event_call *call,
+                             char *filter_string);
+extern int apply_subsystem_event_filter(struct event_subsystem *system,
+                                       char *filter_string);
+extern void print_subsystem_event_filter(struct event_subsystem *system,
+                                        struct trace_seq *s);
+
+static inline int
+filter_check_discard(struct ftrace_event_call *call, void *rec,
+                    struct ring_buffer *buffer,
+                    struct ring_buffer_event *event)
+{
+       if (unlikely(call->filter_active) && !filter_match_preds(call, rec)) {
+               ring_buffer_discard_commit(buffer, event);
+               return 1;
+       }
+
+       return 0;
+}
+
+#define DEFINE_COMPARISON_PRED(type)                                   \
+static int filter_pred_##type(struct filter_pred *pred, void *event,   \
+                             int val1, int val2)                       \
+{                                                                      \
+       type *addr = (type *)(event + pred->offset);                    \
+       type val = (type)pred->val;                                     \
+       int match = 0;                                                  \
+                                                                       \
+       switch (pred->op) {                                             \
+       case OP_LT:                                                     \
+               match = (*addr < val);                                  \
+               break;                                                  \
+       case OP_LE:                                                     \
+               match = (*addr <= val);                                 \
+               break;                                                  \
+       case OP_GT:                                                     \
+               match = (*addr > val);                                  \
+               break;                                                  \
+       case OP_GE:                                                     \
+               match = (*addr >= val);                                 \
+               break;                                                  \
+       default:                                                        \
+               break;                                                  \
+       }                                                               \
+                                                                       \
+       return match;                                                   \
+}
+
+#define DEFINE_EQUALITY_PRED(size)                                     \
+static int filter_pred_##size(struct filter_pred *pred, void *event,   \
+                             int val1, int val2)                       \
+{                                                                      \
+       u##size *addr = (u##size *)(event + pred->offset);              \
+       u##size val = (u##size)pred->val;                               \
+       int match;                                                      \
+                                                                       \
+       match = (val == *addr) ^ pred->not;                             \
+                                                                       \
+       return match;                                                   \
+}
+
+extern struct mutex event_mutex;
+extern struct list_head ftrace_events;
  
  extern const char *__start___trace_bprintk_fmt[];
  extern const char *__stop___trace_bprintk_fmt[];
  
-/*
- * The double __builtin_constant_p is because gcc will give us an error
- * if we try to allocate the static variable to fmt if it is not a
- * constant. Even with the outer if statement optimizing out.
- */
-#define event_trace_printk(ip, fmt, args...)                           \
-do {                                                                   \
-       __trace_printk_check_format(fmt, ##args);                       \
-       tracing_record_cmdline(current);                                \
-       if (__builtin_constant_p(fmt)) {                                \
-               static const char *trace_printk_fmt                     \
-                 __attribute__((section("__trace_printk_fmt"))) =      \
-                       __builtin_constant_p(fmt) ? fmt : NULL;         \
-                                                                       \
-               __trace_bprintk(ip, trace_printk_fmt, ##args);          \
-       } else                                                          \
-               __trace_printk(ip, fmt, ##args);                        \
-} while (0)
+#undef TRACE_EVENT_FORMAT
+#define TRACE_EVENT_FORMAT(call, proto, args, fmt, tstruct, tpfmt)     \
+       extern struct ftrace_event_call event_##call;
+#undef TRACE_EVENT_FORMAT_NOFILTER
+#define TRACE_EVENT_FORMAT_NOFILTER(call, proto, args, fmt, tstruct, tpfmt)
+#include "trace_event_types.h"
  
  #endif /* _LINUX_KERNEL_TRACE_H */
diff --git a/kernel/trace/trace_boot.c b/kernel/trace/trace_boot.c

index 7a30fc4c36423fc29f7e4bb97985ccd9e5ca9540..a29ef23ffb47080d81c41950ac4385566b193058 100644 (file)
--- a/kernel/trace/trace_boot.c
+++ b/kernel/trace/trace_boot.c
@@ -9,6 +9,7 @@
  #include <linux/debugfs.h>
  #include <linux/ftrace.h>
  #include <linux/kallsyms.h>
+#include <linux/time.h>
  
  #include "trace.h"
  #include "trace_output.h"
@@ -67,7 +68,7 @@ initcall_call_print_line(struct trace_iterator *iter)
         trace_assign_type(field, entry);
         call = &field->boot_call;
         ts = iter->ts;
-       nsec_rem = do_div(ts, 1000000000);
+       nsec_rem = do_div(ts, NSEC_PER_SEC);
  
         ret = trace_seq_printf(s, "[%5ld.%09ld] calling  %s @ %i\n",
                         (unsigned long)ts, nsec_rem, call->func, call->caller);
@@ -92,7 +93,7 @@ initcall_ret_print_line(struct trace_iterator *iter)
         trace_assign_type(field, entry);
         init_ret = &field->boot_ret;
         ts = iter->ts;
-       nsec_rem = do_div(ts, 1000000000);
+       nsec_rem = do_div(ts, NSEC_PER_SEC);
  
         ret = trace_seq_printf(s, "[%5ld.%09ld] initcall %s "
                         "returned %d after %llu msecs\n",
diff --git a/kernel/trace/trace_branch.c b/kernel/trace/trace_branch.c

index 8333715e4066eed26e53ecda1077d69c317679d3..7a7a9fd249a9c7c158d136dfc998efc9e908f35c 100644 (file)
--- a/kernel/trace/trace_branch.c
+++ b/kernel/trace/trace_branch.c
@@ -30,6 +30,7 @@ static struct trace_array *branch_tracer;
  static void
  probe_likely_condition(struct ftrace_branch_data *f, int val, int expect)
  {
+       struct ftrace_event_call *call = &event_branch;
         struct trace_array *tr = branch_tracer;
         struct ring_buffer_event *event;
         struct trace_branch *entry;
@@ -73,7 +74,8 @@ probe_likely_condition(struct ftrace_branch_data *f, int val, int expect)
         entry->line = f->line;
         entry->correct = val == expect;
  
-       ring_buffer_unlock_commit(tr->buffer, event);
+       if (!filter_check_discard(call, entry, tr->buffer, event))
+               ring_buffer_unlock_commit(tr->buffer, event);
  
   out:
         atomic_dec(&tr->data[cpu]->disabled);
@@ -271,7 +273,7 @@ static int branch_stat_show(struct seq_file *m, void *v)
         return 0;
  }
  
-static void *annotated_branch_stat_start(void)
+static void *annotated_branch_stat_start(struct tracer_stat *trace)
  {
         return __start_annotated_branch_profile;
  }
@@ -346,7 +348,7 @@ static int all_branch_stat_headers(struct seq_file *m)
         return 0;
  }
  
-static void *all_branch_stat_start(void)
+static void *all_branch_stat_start(struct tracer_stat *trace)
  {
         return __start_branch_profile;
  }
diff --git a/kernel/trace/trace_event_profile.c b/kernel/trace/trace_event_profile.c

index 22cba9970776cf4984cb93b3e3a7fc0c5687f912..5b5895afecfe425f5c917af29b13d29ff4acf918 100644 (file)
--- a/kernel/trace/trace_event_profile.c
+++ b/kernel/trace/trace_event_profile.c
@@ -10,22 +10,30 @@
  int ftrace_profile_enable(int event_id)
  {
         struct ftrace_event_call *event;
+       int ret = -EINVAL;
  
-       for_each_event(event) {
-               if (event->id == event_id)
-                       return event->profile_enable(event);
+       mutex_lock(&event_mutex);
+       list_for_each_entry(event, &ftrace_events, list) {
+               if (event->id == event_id) {
+                       ret = event->profile_enable(event);
+                       break;
+               }
         }
+       mutex_unlock(&event_mutex);
  
-       return -EINVAL;
+       return ret;
  }
  
  void ftrace_profile_disable(int event_id)
  {
         struct ftrace_event_call *event;
  
-       for_each_event(event) {
-               if (event->id == event_id)
-                       return event->profile_disable(event);
+       mutex_lock(&event_mutex);
+       list_for_each_entry(event, &ftrace_events, list) {
+               if (event->id == event_id) {
+                       event->profile_disable(event);
+                       break;
+               }
         }
+       mutex_unlock(&event_mutex);
  }
-
diff --git a/kernel/trace/trace_event_types.h b/kernel/trace/trace_event_types.h

index fd78bee71dd7519e03d45d38e3de9e58579af234..5e32e375134d976183bb0ae5df79abda3f029a54 100644 (file)
--- a/kernel/trace/trace_event_types.h
+++ b/kernel/trace/trace_event_types.h
@@ -57,7 +57,7 @@ TRACE_EVENT_FORMAT(context_switch, TRACE_CTX, ctx_switch_entry, ignore,
         TP_RAW_FMT("%u:%u:%u  ==+ %u:%u:%u [%03u]")
  );
  
-TRACE_EVENT_FORMAT(special, TRACE_SPECIAL, special_entry, ignore,
+TRACE_EVENT_FORMAT_NOFILTER(special, TRACE_SPECIAL, special_entry, ignore,
         TRACE_STRUCT(
                 TRACE_FIELD(unsigned long, arg1, arg1)
                 TRACE_FIELD(unsigned long, arg2, arg2)
@@ -122,8 +122,10 @@ TRACE_EVENT_FORMAT(print, TRACE_PRINT, print_entry, ignore,
  TRACE_EVENT_FORMAT(branch, TRACE_BRANCH, trace_branch, ignore,
         TRACE_STRUCT(
                 TRACE_FIELD(unsigned int, line, line)
-               TRACE_FIELD_SPECIAL(char func[TRACE_FUNC_SIZE+1], func, func)
-               TRACE_FIELD_SPECIAL(char file[TRACE_FUNC_SIZE+1], file, file)
+               TRACE_FIELD_SPECIAL(char func[TRACE_FUNC_SIZE+1], func,
+                                   TRACE_FUNC_SIZE+1, func)
+               TRACE_FIELD_SPECIAL(char file[TRACE_FUNC_SIZE+1], file,
+                                   TRACE_FUNC_SIZE+1, file)
                 TRACE_FIELD(char, correct, correct)
         ),
         TP_RAW_FMT("%u:%s:%s (%u)")
@@ -139,8 +141,8 @@ TRACE_EVENT_FORMAT(hw_branch, TRACE_HW_BRANCHES, hw_branch_entry, ignore,
  
  TRACE_EVENT_FORMAT(power, TRACE_POWER, trace_power, ignore,
         TRACE_STRUCT(
-               TRACE_FIELD(ktime_t, state_data.stamp, stamp)
-               TRACE_FIELD(ktime_t, state_data.end, end)
+               TRACE_FIELD_SIGN(ktime_t, state_data.stamp, stamp, 1)
+               TRACE_FIELD_SIGN(ktime_t, state_data.end, end, 1)
                 TRACE_FIELD(int, state_data.type, type)
                 TRACE_FIELD(int, state_data.state, state)
         ),
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c

index 576f4fa2af0da22cad87d1c20fb044d89c16a066..aa08be69a1b6c1fcc026359ece8613812ad1a492 100644 (file)
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -8,19 +8,25 @@
   *
   */
  
+#include <linux/workqueue.h>
+#include <linux/spinlock.h>
+#include <linux/kthread.h>
  #include <linux/debugfs.h>
  #include <linux/uaccess.h>
  #include <linux/module.h>
  #include <linux/ctype.h>
+#include <linux/delay.h>
  
  #include "trace_output.h"
  
  #define TRACE_SYSTEM "TRACE_SYSTEM"
  
-static DEFINE_MUTEX(event_mutex);
+DEFINE_MUTEX(event_mutex);
+
+LIST_HEAD(ftrace_events);
  
  int trace_define_field(struct ftrace_event_call *call, char *type,
-                      char *name, int offset, int size)
+                      char *name, int offset, int size, int is_signed)
  {
         struct ftrace_event_field *field;
  
@@ -38,6 +44,7 @@ int trace_define_field(struct ftrace_event_call *call, char *type,
  
         field->offset = offset;
         field->size = size;
+       field->is_signed = is_signed;
         list_add(&field->link, &call->fields);
  
         return 0;
@@ -51,47 +58,94 @@ err:
  
         return -ENOMEM;
  }
+EXPORT_SYMBOL_GPL(trace_define_field);
  
-static void ftrace_clear_events(void)
-{
-       struct ftrace_event_call *call = (void *)__start_ftrace_events;
-
+#ifdef CONFIG_MODULES
  
-       while ((unsigned long)call < (unsigned long)__stop_ftrace_events) {
+static void trace_destroy_fields(struct ftrace_event_call *call)
+{
+       struct ftrace_event_field *field, *next;
  
-               if (call->enabled) {
-                       call->enabled = 0;
-                       call->unregfunc();
-               }
-               call++;
+       list_for_each_entry_safe(field, next, &call->fields, link) {
+               list_del(&field->link);
+               kfree(field->type);
+               kfree(field->name);
+               kfree(field);
         }
  }
  
+#endif /* CONFIG_MODULES */
+
  static void ftrace_event_enable_disable(struct ftrace_event_call *call,
                                         int enable)
  {
-
         switch (enable) {
         case 0:
                 if (call->enabled) {
                         call->enabled = 0;
+                       tracing_stop_cmdline_record();
                         call->unregfunc();
                 }
                 break;
         case 1:
                 if (!call->enabled) {
                         call->enabled = 1;
+                       tracing_start_cmdline_record();
                         call->regfunc();
                 }
                 break;
         }
  }
  
+static void ftrace_clear_events(void)
+{
+       struct ftrace_event_call *call;
+
+       mutex_lock(&event_mutex);
+       list_for_each_entry(call, &ftrace_events, list) {
+               ftrace_event_enable_disable(call, 0);
+       }
+       mutex_unlock(&event_mutex);
+}
+
+/*
+ * __ftrace_set_clr_event(NULL, NULL, NULL, set) will set/unset all events.
+ */
+static int __ftrace_set_clr_event(const char *match, const char *sub,
+                                 const char *event, int set)
+{
+       struct ftrace_event_call *call;
+       int ret = -EINVAL;
+
+       mutex_lock(&event_mutex);
+       list_for_each_entry(call, &ftrace_events, list) {
+
+               if (!call->name || !call->regfunc)
+                       continue;
+
+               if (match &&
+                   strcmp(match, call->name) != 0 &&
+                   strcmp(match, call->system) != 0)
+                       continue;
+
+               if (sub && strcmp(sub, call->system) != 0)
+                       continue;
+
+               if (event && strcmp(event, call->name) != 0)
+                       continue;
+
+               ftrace_event_enable_disable(call, set);
+
+               ret = 0;
+       }
+       mutex_unlock(&event_mutex);
+
+       return ret;
+}
+
  static int ftrace_set_clr_event(char *buf, int set)
  {
-       struct ftrace_event_call *call = __start_ftrace_events;
         char *event = NULL, *sub = NULL, *match;
-       int ret = -EINVAL;
  
         /*
          * The buf format can be <subsystem>:<event-name>
@@ -117,30 +171,24 @@ static int ftrace_set_clr_event(char *buf, int set)
                         event = NULL;
         }
  
-       mutex_lock(&event_mutex);
-       for_each_event(call) {
-
-               if (!call->name || !call->regfunc)
-                       continue;
-
-               if (match &&
-                   strcmp(match, call->name) != 0 &&
-                   strcmp(match, call->system) != 0)
-                       continue;
-
-               if (sub && strcmp(sub, call->system) != 0)
-                       continue;
-
-               if (event && strcmp(event, call->name) != 0)
-                       continue;
-
-               ftrace_event_enable_disable(call, set);
-
-               ret = 0;
-       }
-       mutex_unlock(&event_mutex);
+       return __ftrace_set_clr_event(match, sub, event, set);
+}
  
-       return ret;
+/**
+ * trace_set_clr_event - enable or disable an event
+ * @system: system name to match (NULL for any system)
+ * @event: event name to match (NULL for all events, within system)
+ * @set: 1 to enable, 0 to disable
+ *
+ * This is a way for other parts of the kernel to enable or disable
+ * event recording.
+ *
+ * Returns 0 on success, -EINVAL if the parameters do not match any
+ * registered events.
+ */
+int trace_set_clr_event(const char *system, const char *event, int set)
+{
+       return __ftrace_set_clr_event(NULL, system, event, set);
  }
  
  /* 128 should be much more than enough */
@@ -224,15 +272,17 @@ ftrace_event_write(struct file *file, const char __user *ubuf,
  static void *
  t_next(struct seq_file *m, void *v, loff_t *pos)
  {
-       struct ftrace_event_call *call = m->private;
-       struct ftrace_event_call *next = call;
+       struct list_head *list = m->private;
+       struct ftrace_event_call *call;
  
         (*pos)++;
  
         for (;;) {
-               if ((unsigned long)call >= (unsigned long)__stop_ftrace_events)
+               if (list == &ftrace_events)
                         return NULL;
  
+               call = list_entry(list, struct ftrace_event_call, list);
+
                 /*
                  * The ftrace subsystem is for showing formats only.
                  * They can not be enabled or disabled via the event files.
@@ -240,45 +290,51 @@ t_next(struct seq_file *m, void *v, loff_t *pos)
                 if (call->regfunc)
                         break;
  
-               call++;
-               next = call;
+               list = list->next;
         }
  
-       m->private = ++next;
+       m->private = list->next;
  
         return call;
  }
  
  static void *t_start(struct seq_file *m, loff_t *pos)
  {
+       mutex_lock(&event_mutex);
+       if (*pos == 0)
+               m->private = ftrace_events.next;
         return t_next(m, NULL, pos);
  }
  
  static void *
  s_next(struct seq_file *m, void *v, loff_t *pos)
  {
-       struct ftrace_event_call *call = m->private;
-       struct ftrace_event_call *next;
+       struct list_head *list = m->private;
+       struct ftrace_event_call *call;
  
         (*pos)++;
  
   retry:
-       if ((unsigned long)call >= (unsigned long)__stop_ftrace_events)
+       if (list == &ftrace_events)
                 return NULL;
  
+       call = list_entry(list, struct ftrace_event_call, list);
+
         if (!call->enabled) {
-               call++;
+               list = list->next;
                 goto retry;
         }
  
-       next = call;
-       m->private = ++next;
+       m->private = list->next;
  
         return call;
  }
  
  static void *s_start(struct seq_file *m, loff_t *pos)
  {
+       mutex_lock(&event_mutex);
+       if (*pos == 0)
+               m->private = ftrace_events.next;
         return s_next(m, NULL, pos);
  }
  
@@ -295,12 +351,12 @@ static int t_show(struct seq_file *m, void *v)
  
  static void t_stop(struct seq_file *m, void *p)
  {
+       mutex_unlock(&event_mutex);
  }
  
  static int
  ftrace_event_seq_open(struct inode *inode, struct file *file)
  {
-       int ret;
         const struct seq_operations *seq_ops;
  
         if ((file->f_mode & FMODE_WRITE) &&
@@ -308,13 +364,7 @@ ftrace_event_seq_open(struct inode *inode, struct file *file)
                 ftrace_clear_events();
  
         seq_ops = inode->i_private;
-       ret = seq_open(file, seq_ops);
-       if (!ret) {
-               struct seq_file *m = file->private_data;
-
-               m->private = __start_ftrace_events;
-       }
-       return ret;
+       return seq_open(file, seq_ops);
  }
  
  static ssize_t
@@ -374,8 +424,93 @@ event_enable_write(struct file *filp, const char __user *ubuf, size_t cnt,
         return cnt;
  }
  
+static ssize_t
+system_enable_read(struct file *filp, char __user *ubuf, size_t cnt,
+                  loff_t *ppos)
+{
+       const char set_to_char[4] = { '?', '0', '1', 'X' };
+       const char *system = filp->private_data;
+       struct ftrace_event_call *call;
+       char buf[2];
+       int set = 0;
+       int ret;
+
+       mutex_lock(&event_mutex);
+       list_for_each_entry(call, &ftrace_events, list) {
+               if (!call->name || !call->regfunc)
+                       continue;
+
+               if (system && strcmp(call->system, system) != 0)
+                       continue;
+
+               /*
+                * We need to find out if all the events are set
+                * or if all events or cleared, or if we have
+                * a mixture.
+                */
+               set |= (1 << !!call->enabled);
+
+               /*
+                * If we have a mixture, no need to look further.
+                */
+               if (set == 3)
+                       break;
+       }
+       mutex_unlock(&event_mutex);
+
+       buf[0] = set_to_char[set];
+       buf[1] = '\n';
+
+       ret = simple_read_from_buffer(ubuf, cnt, ppos, buf, 2);
+
+       return ret;
+}
+
+static ssize_t
+system_enable_write(struct file *filp, const char __user *ubuf, size_t cnt,
+                   loff_t *ppos)
+{
+       const char *system = filp->private_data;
+       unsigned long val;
+       char buf[64];
+       ssize_t ret;
+
+       if (cnt >= sizeof(buf))
+               return -EINVAL;
+
+       if (copy_from_user(&buf, ubuf, cnt))
+               return -EFAULT;
+
+       buf[cnt] = 0;
+
+       ret = strict_strtoul(buf, 10, &val);
+       if (ret < 0)
+               return ret;
+
+       ret = tracing_update_buffers();
+       if (ret < 0)
+               return ret;
+
+       if (val != 0 && val != 1)
+               return -EINVAL;
+
+       ret = __ftrace_set_clr_event(NULL, system, NULL, val);
+       if (ret)
+               goto out;
+
+       ret = cnt;
+
+out:
+       *ppos += cnt;
+
+       return ret;
+}
+
+extern char *__bad_type_size(void);
+
  #undef FIELD
  #define FIELD(type, name)                                              \
+       sizeof(type) != sizeof(field.name) ? __bad_type_size() :        \
         #type, "common_" #name, offsetof(typeof(field), name),          \
                 sizeof(field.name)
  
@@ -391,7 +526,7 @@ static int trace_write_header(struct trace_seq *s)
                                 "\tfield:%s %s;\toffset:%zu;\tsize:%zu;\n"
                                 "\tfield:%s %s;\toffset:%zu;\tsize:%zu;\n"
                                 "\n",
-                               FIELD(unsigned char, type),
+                               FIELD(unsigned short, type),
                                 FIELD(unsigned char, flags),
                                 FIELD(unsigned char, preempt_count),
                                 FIELD(int, pid),
@@ -481,7 +616,7 @@ event_filter_read(struct file *filp, char __user *ubuf, size_t cnt,
  
         trace_seq_init(s);
  
-       filter_print_preds(call->preds, s);
+       print_event_filter(call, s);
         r = simple_read_from_buffer(ubuf, cnt, ppos, s->buffer, s->len);
  
         kfree(s);
@@ -494,38 +629,26 @@ event_filter_write(struct file *filp, const char __user *ubuf, size_t cnt,
                    loff_t *ppos)
  {
         struct ftrace_event_call *call = filp->private_data;
-       char buf[64], *pbuf = buf;
-       struct filter_pred *pred;
+       char *buf;
         int err;
  
-       if (cnt >= sizeof(buf))
+       if (cnt >= PAGE_SIZE)
                 return -EINVAL;
  
-       if (copy_from_user(&buf, ubuf, cnt))
-               return -EFAULT;
-       buf[cnt] = '\0';
-
-       pred = kzalloc(sizeof(*pred), GFP_KERNEL);
-       if (!pred)
+       buf = (char *)__get_free_page(GFP_TEMPORARY);
+       if (!buf)
                 return -ENOMEM;
  
-       err = filter_parse(&pbuf, pred);
-       if (err < 0) {
-               filter_free_pred(pred);
-               return err;
-       }
-
-       if (pred->clear) {
-               filter_free_preds(call);
-               filter_free_pred(pred);
-               return cnt;
+       if (copy_from_user(buf, ubuf, cnt)) {
+               free_page((unsigned long) buf);
+               return -EFAULT;
         }
+       buf[cnt] = '\0';
  
-       err = filter_add_pred(call, pred);
-       if (err < 0) {
-               filter_free_pred(pred);
+       err = apply_event_filter(call, buf);
+       free_page((unsigned long) buf);
+       if (err < 0)
                 return err;
-       }
  
         *ppos += cnt;
  
@@ -549,7 +672,7 @@ subsystem_filter_read(struct file *filp, char __user *ubuf, size_t cnt,
  
         trace_seq_init(s);
  
-       filter_print_preds(system->preds, s);
+       print_subsystem_event_filter(system, s);
         r = simple_read_from_buffer(ubuf, cnt, ppos, s->buffer, s->len);
  
         kfree(s);
@@ -562,45 +685,56 @@ subsystem_filter_write(struct file *filp, const char __user *ubuf, size_t cnt,
                        loff_t *ppos)
  {
         struct event_subsystem *system = filp->private_data;
-       char buf[64], *pbuf = buf;
-       struct filter_pred *pred;
+       char *buf;
         int err;
  
-       if (cnt >= sizeof(buf))
+       if (cnt >= PAGE_SIZE)
                 return -EINVAL;
  
-       if (copy_from_user(&buf, ubuf, cnt))
-               return -EFAULT;
-       buf[cnt] = '\0';
-
-       pred = kzalloc(sizeof(*pred), GFP_KERNEL);
-       if (!pred)
+       buf = (char *)__get_free_page(GFP_TEMPORARY);
+       if (!buf)
                 return -ENOMEM;
  
-       err = filter_parse(&pbuf, pred);
-       if (err < 0) {
-               filter_free_pred(pred);
-               return err;
-       }
-
-       if (pred->clear) {
-               filter_free_subsystem_preds(system);
-               filter_free_pred(pred);
-               return cnt;
+       if (copy_from_user(buf, ubuf, cnt)) {
+               free_page((unsigned long) buf);
+               return -EFAULT;
         }
+       buf[cnt] = '\0';
  
-       err = filter_add_subsystem_pred(system, pred);
-       if (err < 0) {
-               filter_free_subsystem_preds(system);
-               filter_free_pred(pred);
+       err = apply_subsystem_event_filter(system, buf);
+       free_page((unsigned long) buf);
+       if (err < 0)
                 return err;
-       }
  
         *ppos += cnt;
  
         return cnt;
  }
  
+static ssize_t
+show_header(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos)
+{
+       int (*func)(struct trace_seq *s) = filp->private_data;
+       struct trace_seq *s;
+       int r;
+
+       if (*ppos)
+               return 0;
+
+       s = kmalloc(sizeof(*s), GFP_KERNEL);
+       if (!s)
+               return -ENOMEM;
+
+       trace_seq_init(s);
+
+       func(s);
+       r = simple_read_from_buffer(ubuf, cnt, ppos, s->buffer, s->len);
+
+       kfree(s);
+
+       return r;
+}
+
  static const struct seq_operations show_event_seq_ops = {
         .start = t_start,
         .next = t_next,
@@ -658,6 +792,17 @@ static const struct file_operations ftrace_subsystem_filter_fops = {
         .write = subsystem_filter_write,
  };
  
+static const struct file_operations ftrace_system_enable_fops = {
+       .open = tracing_open_generic,
+       .read = system_enable_read,
+       .write = system_enable_write,
+};
+
+static const struct file_operations ftrace_show_header_fops = {
+       .open = tracing_open_generic,
+       .read = show_header,
+};
+
  static struct dentry *event_trace_events_dir(void)
  {
         static struct dentry *d_tracer;
@@ -684,6 +829,7 @@ static struct dentry *
  event_subsystem_dir(const char *name, struct dentry *d_events)
  {
         struct event_subsystem *system;
+       struct dentry *entry;
  
         /* First see if we did not already create this dir */
         list_for_each_entry(system, &event_subsystems, list) {
@@ -707,16 +853,46 @@ event_subsystem_dir(const char *name, struct dentry *d_events)
                 return d_events;
         }
  
-       system->name = name;
+       system->name = kstrdup(name, GFP_KERNEL);
+       if (!system->name) {
+               debugfs_remove(system->entry);
+               kfree(system);
+               return d_events;
+       }
+
         list_add(&system->list, &event_subsystems);
  
-       system->preds = NULL;
+       system->filter = NULL;
+
+       system->filter = kzalloc(sizeof(struct event_filter), GFP_KERNEL);
+       if (!system->filter) {
+               pr_warning("Could not allocate filter for subsystem "
+                          "'%s'\n", name);
+               return system->entry;
+       }
+
+       entry = debugfs_create_file("filter", 0644, system->entry, system,
+                                   &ftrace_subsystem_filter_fops);
+       if (!entry) {
+               kfree(system->filter);
+               system->filter = NULL;
+               pr_warning("Could not create debugfs "
+                          "'%s/filter' entry\n", name);
+       }
+
+       entry = trace_create_file("enable", 0644, system->entry,
+                                 (void *)system->name,
+                                 &ftrace_system_enable_fops);
  
         return system->entry;
  }
  
  static int
-event_create_dir(struct ftrace_event_call *call, struct dentry *d_events)
+event_create_dir(struct ftrace_event_call *call, struct dentry *d_events,
+                const struct file_operations *id,
+                const struct file_operations *enable,
+                const struct file_operations *filter,
+                const struct file_operations *format)
  {
         struct dentry *entry;
         int ret;
@@ -725,7 +901,7 @@ event_create_dir(struct ftrace_event_call *call, struct dentry *d_events)
          * If the trace point header did not define TRACE_SYSTEM
          * then the system would be called "TRACE_SYSTEM".
          */
-       if (strcmp(call->system, "TRACE_SYSTEM") != 0)
+       if (strcmp(call->system, TRACE_SYSTEM) != 0)
                 d_events = event_subsystem_dir(call->system, d_events);
  
         if (call->raw_init) {
@@ -744,21 +920,13 @@ event_create_dir(struct ftrace_event_call *call, struct dentry *d_events)
                 return -1;
         }
  
-       if (call->regfunc) {
-               entry = debugfs_create_file("enable", 0644, call->dir, call,
-                                           &ftrace_enable_fops);
-               if (!entry)
-                       pr_warning("Could not create debugfs "
-                                  "'%s/enable' entry\n", call->name);
-       }
+       if (call->regfunc)
+               entry = trace_create_file("enable", 0644, call->dir, call,
+                                         enable);
  
-       if (call->id) {
-               entry = debugfs_create_file("id", 0444, call->dir, call,
-                               &ftrace_event_id_fops);
-               if (!entry)
-                       pr_warning("Could not create debugfs '%s/id' entry\n",
-                                       call->name);
-       }
+       if (call->id)
+               entry = trace_create_file("id", 0444, call->dir, call,
+                                         id);
  
         if (call->define_fields) {
                 ret = call->define_fields();
@@ -767,32 +935,195 @@ event_create_dir(struct ftrace_event_call *call, struct dentry *d_events)
                                    " events/%s\n", call->name);
                         return ret;
                 }
-               entry = debugfs_create_file("filter", 0644, call->dir, call,
-                                           &ftrace_event_filter_fops);
-               if (!entry)
-                       pr_warning("Could not create debugfs "
-                                  "'%s/filter' entry\n", call->name);
+               entry = trace_create_file("filter", 0644, call->dir, call,
+                                         filter);
         }
  
         /* A trace may not want to export its format */
         if (!call->show_format)
                 return 0;
  
-       entry = debugfs_create_file("format", 0444, call->dir, call,
-                                   &ftrace_event_format_fops);
-       if (!entry)
-               pr_warning("Could not create debugfs "
-                          "'%s/format' entry\n", call->name);
+       entry = trace_create_file("format", 0444, call->dir, call,
+                                 format);
+
+       return 0;
+}
+
+#define for_each_event(event, start, end)                      \
+       for (event = start;                                     \
+            (unsigned long)event < (unsigned long)end;         \
+            event++)
+
+#ifdef CONFIG_MODULES
+
+static LIST_HEAD(ftrace_module_file_list);
+
+/*
+ * Modules must own their file_operations to keep up with
+ * reference counting.
+ */
+struct ftrace_module_file_ops {
+       struct list_head                list;
+       struct module                   *mod;
+       struct file_operations          id;
+       struct file_operations          enable;
+       struct file_operations          format;
+       struct file_operations          filter;
+};
+
+static struct ftrace_module_file_ops *
+trace_create_file_ops(struct module *mod)
+{
+       struct ftrace_module_file_ops *file_ops;
+
+       /*
+        * This is a bit of a PITA. To allow for correct reference
+        * counting, modules must "own" their file_operations.
+        * To do this, we allocate the file operations that will be
+        * used in the event directory.
+        */
+
+       file_ops = kmalloc(sizeof(*file_ops), GFP_KERNEL);
+       if (!file_ops)
+               return NULL;
+
+       file_ops->mod = mod;
+
+       file_ops->id = ftrace_event_id_fops;
+       file_ops->id.owner = mod;
+
+       file_ops->enable = ftrace_enable_fops;
+       file_ops->enable.owner = mod;
+
+       file_ops->filter = ftrace_event_filter_fops;
+       file_ops->filter.owner = mod;
+
+       file_ops->format = ftrace_event_format_fops;
+       file_ops->format.owner = mod;
+
+       list_add(&file_ops->list, &ftrace_module_file_list);
+
+       return file_ops;
+}
+
+static void trace_module_add_events(struct module *mod)
+{
+       struct ftrace_module_file_ops *file_ops = NULL;
+       struct ftrace_event_call *call, *start, *end;
+       struct dentry *d_events;
+
+       start = mod->trace_events;
+       end = mod->trace_events + mod->num_trace_events;
+
+       if (start == end)
+               return;
+
+       d_events = event_trace_events_dir();
+       if (!d_events)
+               return;
+
+       for_each_event(call, start, end) {
+               /* The linker may leave blanks */
+               if (!call->name)
+                       continue;
+
+               /*
+                * This module has events, create file ops for this module
+                * if not already done.
+                */
+               if (!file_ops) {
+                       file_ops = trace_create_file_ops(mod);
+                       if (!file_ops)
+                               return;
+               }
+               call->mod = mod;
+               list_add(&call->list, &ftrace_events);
+               event_create_dir(call, d_events,
+                                &file_ops->id, &file_ops->enable,
+                                &file_ops->filter, &file_ops->format);
+       }
+}
+
+static void trace_module_remove_events(struct module *mod)
+{
+       struct ftrace_module_file_ops *file_ops;
+       struct ftrace_event_call *call, *p;
+       bool found = false;
+
+       down_write(&trace_event_mutex);
+       list_for_each_entry_safe(call, p, &ftrace_events, list) {
+               if (call->mod == mod) {
+                       found = true;
+                       ftrace_event_enable_disable(call, 0);
+                       if (call->event)
+                               __unregister_ftrace_event(call->event);
+                       debugfs_remove_recursive(call->dir);
+                       list_del(&call->list);
+                       trace_destroy_fields(call);
+                       destroy_preds(call);
+               }
+       }
+
+       /* Now free the file_operations */
+       list_for_each_entry(file_ops, &ftrace_module_file_list, list) {
+               if (file_ops->mod == mod)
+                       break;
+       }
+       if (&file_ops->list != &ftrace_module_file_list) {
+               list_del(&file_ops->list);
+               kfree(file_ops);
+       }
+
+       /*
+        * It is safest to reset the ring buffer if the module being unloaded
+        * registered any events.
+        */
+       if (found)
+               tracing_reset_current_online_cpus();
+       up_write(&trace_event_mutex);
+}
+
+static int trace_module_notify(struct notifier_block *self,
+                              unsigned long val, void *data)
+{
+       struct module *mod = data;
+
+       mutex_lock(&event_mutex);
+       switch (val) {
+       case MODULE_STATE_COMING:
+               trace_module_add_events(mod);
+               break;
+       case MODULE_STATE_GOING:
+               trace_module_remove_events(mod);
+               break;
+       }
+       mutex_unlock(&event_mutex);
  
         return 0;
  }
+#else
+static int trace_module_notify(struct notifier_block *self,
+                              unsigned long val, void *data)
+{
+       return 0;
+}
+#endif /* CONFIG_MODULES */
+
+struct notifier_block trace_module_nb = {
+       .notifier_call = trace_module_notify,
+       .priority = 0,
+};
+
+extern struct ftrace_event_call __start_ftrace_events[];
+extern struct ftrace_event_call __stop_ftrace_events[];
  
  static __init int event_trace_init(void)
  {
-       struct ftrace_event_call *call = __start_ftrace_events;
+       struct ftrace_event_call *call;
         struct dentry *d_tracer;
         struct dentry *entry;
         struct dentry *d_events;
+       int ret;
  
         d_tracer = tracing_init_dentry();
         if (!d_tracer)
@@ -816,13 +1147,243 @@ static __init int event_trace_init(void)
         if (!d_events)
                 return 0;
  
-       for_each_event(call) {
+       /* ring buffer internal formats */
+       trace_create_file("header_page", 0444, d_events,
+                         ring_buffer_print_page_header,
+                         &ftrace_show_header_fops);
+
+       trace_create_file("header_event", 0444, d_events,
+                         ring_buffer_print_entry_header,
+                         &ftrace_show_header_fops);
+
+       trace_create_file("enable", 0644, d_events,
+                         NULL, &ftrace_system_enable_fops);
+
+       for_each_event(call, __start_ftrace_events, __stop_ftrace_events) {
                 /* The linker may leave blanks */
                 if (!call->name)
                         continue;
-               event_create_dir(call, d_events);
+               list_add(&call->list, &ftrace_events);
+               event_create_dir(call, d_events, &ftrace_event_id_fops,
+                                &ftrace_enable_fops, &ftrace_event_filter_fops,
+                                &ftrace_event_format_fops);
         }
  
+       ret = register_module_notifier(&trace_module_nb);
+       if (ret)
+               pr_warning("Failed to register trace events module notifier\n");
+
         return 0;
  }
  fs_initcall(event_trace_init);
+
+#ifdef CONFIG_FTRACE_STARTUP_TEST
+
+static DEFINE_SPINLOCK(test_spinlock);
+static DEFINE_SPINLOCK(test_spinlock_irq);
+static DEFINE_MUTEX(test_mutex);
+
+static __init void test_work(struct work_struct *dummy)
+{
+       spin_lock(&test_spinlock);
+       spin_lock_irq(&test_spinlock_irq);
+       udelay(1);
+       spin_unlock_irq(&test_spinlock_irq);
+       spin_unlock(&test_spinlock);
+
+       mutex_lock(&test_mutex);
+       msleep(1);
+       mutex_unlock(&test_mutex);
+}
+
+static __init int event_test_thread(void *unused)
+{
+       void *test_malloc;
+
+       test_malloc = kmalloc(1234, GFP_KERNEL);
+       if (!test_malloc)
+               pr_info("failed to kmalloc\n");
+
+       schedule_on_each_cpu(test_work);
+
+       kfree(test_malloc);
+
+       set_current_state(TASK_INTERRUPTIBLE);
+       while (!kthread_should_stop())
+               schedule();
+
+       return 0;
+}
+
+/*
+ * Do various things that may trigger events.
+ */
+static __init void event_test_stuff(void)
+{
+       struct task_struct *test_thread;
+
+       test_thread = kthread_run(event_test_thread, NULL, "test-events");
+       msleep(1);
+       kthread_stop(test_thread);
+}
+
+/*
+ * For every trace event defined, we will test each trace point separately,
+ * and then by groups, and finally all trace points.
+ */
+static __init void event_trace_self_tests(void)
+{
+       struct ftrace_event_call *call;
+       struct event_subsystem *system;
+       int ret;
+
+       pr_info("Running tests on trace events:\n");
+
+       list_for_each_entry(call, &ftrace_events, list) {
+
+               /* Only test those that have a regfunc */
+               if (!call->regfunc)
+                       continue;
+
+               pr_info("Testing event %s: ", call->name);
+
+               /*
+                * If an event is already enabled, someone is using
+                * it and the self test should not be on.
+                */
+               if (call->enabled) {
+                       pr_warning("Enabled event during self test!\n");
+                       WARN_ON_ONCE(1);
+                       continue;
+               }
+
+               ftrace_event_enable_disable(call, 1);
+               event_test_stuff();
+               ftrace_event_enable_disable(call, 0);
+
+               pr_cont("OK\n");
+       }
+
+       /* Now test at the sub system level */
+
+       pr_info("Running tests on trace event systems:\n");
+
+       list_for_each_entry(system, &event_subsystems, list) {
+
+               /* the ftrace system is special, skip it */
+               if (strcmp(system->name, "ftrace") == 0)
+                       continue;
+
+               pr_info("Testing event system %s: ", system->name);
+
+               ret = __ftrace_set_clr_event(NULL, system->name, NULL, 1);
+               if (WARN_ON_ONCE(ret)) {
+                       pr_warning("error enabling system %s\n",
+                                  system->name);
+                       continue;
+               }
+
+               event_test_stuff();
+
+               ret = __ftrace_set_clr_event(NULL, system->name, NULL, 0);
+               if (WARN_ON_ONCE(ret))
+                       pr_warning("error disabling system %s\n",
+                                  system->name);
+
+               pr_cont("OK\n");
+       }
+
+       /* Test with all events enabled */
+
+       pr_info("Running tests on all trace events:\n");
+       pr_info("Testing all events: ");
+
+       ret = __ftrace_set_clr_event(NULL, NULL, NULL, 1);
+       if (WARN_ON_ONCE(ret)) {
+               pr_warning("error enabling all events\n");
+               return;
+       }
+
+       event_test_stuff();
+
+       /* reset sysname */
+       ret = __ftrace_set_clr_event(NULL, NULL, NULL, 0);
+       if (WARN_ON_ONCE(ret)) {
+               pr_warning("error disabling all events\n");
+               return;
+       }
+
+       pr_cont("OK\n");
+}
+
+#ifdef CONFIG_FUNCTION_TRACER
+
+static DEFINE_PER_CPU(atomic_t, test_event_disable);
+
+static void
+function_test_events_call(unsigned long ip, unsigned long parent_ip)
+{
+       struct ring_buffer_event *event;
+       struct ftrace_entry *entry;
+       unsigned long flags;
+       long disabled;
+       int resched;
+       int cpu;
+       int pc;
+
+       pc = preempt_count();
+       resched = ftrace_preempt_disable();
+       cpu = raw_smp_processor_id();
+       disabled = atomic_inc_return(&per_cpu(test_event_disable, cpu));
+
+       if (disabled != 1)
+               goto out;
+
+       local_save_flags(flags);
+
+       event = trace_current_buffer_lock_reserve(TRACE_FN, sizeof(*entry),
+                                                 flags, pc);
+       if (!event)
+               goto out;
+       entry   = ring_buffer_event_data(event);
+       entry->ip                       = ip;
+       entry->parent_ip                = parent_ip;
+
+       trace_nowake_buffer_unlock_commit(event, flags, pc);
+
+ out:
+       atomic_dec(&per_cpu(test_event_disable, cpu));
+       ftrace_preempt_enable(resched);
+}
+
+static struct ftrace_ops trace_ops __initdata  =
+{
+       .func = function_test_events_call,
+};
+
+static __init void event_trace_self_test_with_function(void)
+{
+       register_ftrace_function(&trace_ops);
+       pr_info("Running tests again, along with the function tracer\n");
+       event_trace_self_tests();
+       unregister_ftrace_function(&trace_ops);
+}
+#else
+static __init void event_trace_self_test_with_function(void)
+{
+}
+#endif
+
+static __init int event_trace_self_tests_init(void)
+{
+
+       event_trace_self_tests();
+
+       event_trace_self_test_with_function();
+
+       return 0;
+}
+
+late_initcall(event_trace_self_tests_init);
+
+#endif
diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c

index e03cbf1e38f36b306f8eafd2ed2d79837b901a37..db6e54bdb596d90b07fef8fed03520157ecf4390 100644 (file)
--- a/kernel/trace/trace_events_filter.c
+++ b/kernel/trace/trace_events_filter.c
@@ -22,119 +22,297 @@
  #include <linux/uaccess.h>
  #include <linux/module.h>
  #include <linux/ctype.h>
+#include <linux/mutex.h>
  
  #include "trace.h"
  #include "trace_output.h"
  
-static int filter_pred_64(struct filter_pred *pred, void *event)
+static DEFINE_MUTEX(filter_mutex);
+
+enum filter_op_ids
+{
+       OP_OR,
+       OP_AND,
+       OP_NE,
+       OP_EQ,
+       OP_LT,
+       OP_LE,
+       OP_GT,
+       OP_GE,
+       OP_NONE,
+       OP_OPEN_PAREN,
+};
+
+struct filter_op {
+       int id;
+       char *string;
+       int precedence;
+};
+
+static struct filter_op filter_ops[] = {
+       { OP_OR, "||", 1 },
+       { OP_AND, "&&", 2 },
+       { OP_NE, "!=", 4 },
+       { OP_EQ, "==", 4 },
+       { OP_LT, "<", 5 },
+       { OP_LE, "<=", 5 },
+       { OP_GT, ">", 5 },
+       { OP_GE, ">=", 5 },
+       { OP_NONE, "OP_NONE", 0 },
+       { OP_OPEN_PAREN, "(", 0 },
+};
+
+enum {
+       FILT_ERR_NONE,
+       FILT_ERR_INVALID_OP,
+       FILT_ERR_UNBALANCED_PAREN,
+       FILT_ERR_TOO_MANY_OPERANDS,
+       FILT_ERR_OPERAND_TOO_LONG,
+       FILT_ERR_FIELD_NOT_FOUND,
+       FILT_ERR_ILLEGAL_FIELD_OP,
+       FILT_ERR_ILLEGAL_INTVAL,
+       FILT_ERR_BAD_SUBSYS_FILTER,
+       FILT_ERR_TOO_MANY_PREDS,
+       FILT_ERR_MISSING_FIELD,
+       FILT_ERR_INVALID_FILTER,
+};
+
+static char *err_text[] = {
+       "No error",
+       "Invalid operator",
+       "Unbalanced parens",
+       "Too many operands",
+       "Operand too long",
+       "Field not found",
+       "Illegal operation for field type",
+       "Illegal integer value",
+       "Couldn't find or set field in one of a subsystem's events",
+       "Too many terms in predicate expression",
+       "Missing field name and/or value",
+       "Meaningless filter expression",
+};
+
+struct opstack_op {
+       int op;
+       struct list_head list;
+};
+
+struct postfix_elt {
+       int op;
+       char *operand;
+       struct list_head list;
+};
+
+struct filter_parse_state {
+       struct filter_op *ops;
+       struct list_head opstack;
+       struct list_head postfix;
+       int lasterr;
+       int lasterr_pos;
+
+       struct {
+               char *string;
+               unsigned int cnt;
+               unsigned int tail;
+       } infix;
+
+       struct {
+               char string[MAX_FILTER_STR_VAL];
+               int pos;
+               unsigned int tail;
+       } operand;
+};
+
+DEFINE_COMPARISON_PRED(s64);
+DEFINE_COMPARISON_PRED(u64);
+DEFINE_COMPARISON_PRED(s32);
+DEFINE_COMPARISON_PRED(u32);
+DEFINE_COMPARISON_PRED(s16);
+DEFINE_COMPARISON_PRED(u16);
+DEFINE_COMPARISON_PRED(s8);
+DEFINE_COMPARISON_PRED(u8);
+
+DEFINE_EQUALITY_PRED(64);
+DEFINE_EQUALITY_PRED(32);
+DEFINE_EQUALITY_PRED(16);
+DEFINE_EQUALITY_PRED(8);
+
+static int filter_pred_and(struct filter_pred *pred __attribute((unused)),
+                          void *event __attribute((unused)),
+                          int val1, int val2)
+{
+       return val1 && val2;
+}
+
+static int filter_pred_or(struct filter_pred *pred __attribute((unused)),
+                         void *event __attribute((unused)),
+                         int val1, int val2)
+{
+       return val1 || val2;
+}
+
+/* Filter predicate for fixed sized arrays of characters */
+static int filter_pred_string(struct filter_pred *pred, void *event,
+                             int val1, int val2)
  {
-       u64 *addr = (u64 *)(event + pred->offset);
-       u64 val = (u64)pred->val;
-       int match;
+       char *addr = (char *)(event + pred->offset);
+       int cmp, match;
+
+       cmp = strncmp(addr, pred->str_val, pred->str_len);
  
-       match = (val == *addr) ^ pred->not;
+       match = (!cmp) ^ pred->not;
  
         return match;
  }
  
-static int filter_pred_32(struct filter_pred *pred, void *event)
+/*
+ * Filter predicate for dynamic sized arrays of characters.
+ * These are implemented through a list of strings at the end
+ * of the entry.
+ * Also each of these strings have a field in the entry which
+ * contains its offset from the beginning of the entry.
+ * We have then first to get this field, dereference it
+ * and add it to the address of the entry, and at last we have
+ * the address of the string.
+ */
+static int filter_pred_strloc(struct filter_pred *pred, void *event,
+                             int val1, int val2)
  {
-       u32 *addr = (u32 *)(event + pred->offset);
-       u32 val = (u32)pred->val;
-       int match;
+       int str_loc = *(int *)(event + pred->offset);
+       char *addr = (char *)(event + str_loc);
+       int cmp, match;
+
+       cmp = strncmp(addr, pred->str_val, pred->str_len);
  
-       match = (val == *addr) ^ pred->not;
+       match = (!cmp) ^ pred->not;
  
         return match;
  }
  
-static int filter_pred_16(struct filter_pred *pred, void *event)
+static int filter_pred_none(struct filter_pred *pred, void *event,
+                           int val1, int val2)
+{
+       return 0;
+}
+
+/* return 1 if event matches, 0 otherwise (discard) */
+int filter_match_preds(struct ftrace_event_call *call, void *rec)
  {
-       u16 *addr = (u16 *)(event + pred->offset);
-       u16 val = (u16)pred->val;
-       int match;
+       struct event_filter *filter = call->filter;
+       int match, top = 0, val1 = 0, val2 = 0;
+       int stack[MAX_FILTER_PRED];
+       struct filter_pred *pred;
+       int i;
+
+       for (i = 0; i < filter->n_preds; i++) {
+               pred = filter->preds[i];
+               if (!pred->pop_n) {
+                       match = pred->fn(pred, rec, val1, val2);
+                       stack[top++] = match;
+                       continue;
+               }
+               if (pred->pop_n > top) {
+                       WARN_ON_ONCE(1);
+                       return 0;
+               }
+               val1 = stack[--top];
+               val2 = stack[--top];
+               match = pred->fn(pred, rec, val1, val2);
+               stack[top++] = match;
+       }
  
-       match = (val == *addr) ^ pred->not;
+       return stack[--top];
+}
+EXPORT_SYMBOL_GPL(filter_match_preds);
  
-       return match;
+static void parse_error(struct filter_parse_state *ps, int err, int pos)
+{
+       ps->lasterr = err;
+       ps->lasterr_pos = pos;
  }
  
-static int filter_pred_8(struct filter_pred *pred, void *event)
+static void remove_filter_string(struct event_filter *filter)
  {
-       u8 *addr = (u8 *)(event + pred->offset);
-       u8 val = (u8)pred->val;
-       int match;
+       kfree(filter->filter_string);
+       filter->filter_string = NULL;
+}
  
-       match = (val == *addr) ^ pred->not;
+static int replace_filter_string(struct event_filter *filter,
+                                char *filter_string)
+{
+       kfree(filter->filter_string);
+       filter->filter_string = kstrdup(filter_string, GFP_KERNEL);
+       if (!filter->filter_string)
+               return -ENOMEM;
  
-       return match;
+       return 0;
  }
  
-static int filter_pred_string(struct filter_pred *pred, void *event)
+static int append_filter_string(struct event_filter *filter,
+                               char *string)
  {
-       char *addr = (char *)(event + pred->offset);
-       int cmp, match;
+       int newlen;
+       char *new_filter_string;
  
-       cmp = strncmp(addr, pred->str_val, pred->str_len);
+       BUG_ON(!filter->filter_string);
+       newlen = strlen(filter->filter_string) + strlen(string) + 1;
+       new_filter_string = kmalloc(newlen, GFP_KERNEL);
+       if (!new_filter_string)
+               return -ENOMEM;
  
-       match = (!cmp) ^ pred->not;
+       strcpy(new_filter_string, filter->filter_string);
+       strcat(new_filter_string, string);
+       kfree(filter->filter_string);
+       filter->filter_string = new_filter_string;
  
-       return match;
+       return 0;
  }
  
-/* return 1 if event matches, 0 otherwise (discard) */
-int filter_match_preds(struct ftrace_event_call *call, void *rec)
+static void append_filter_err(struct filter_parse_state *ps,
+                             struct event_filter *filter)
  {
-       int i, matched, and_failed = 0;
-       struct filter_pred *pred;
+       int pos = ps->lasterr_pos;
+       char *buf, *pbuf;
  
-       for (i = 0; i < MAX_FILTER_PRED; i++) {
-               if (call->preds[i]) {
-                       pred = call->preds[i];
-                       if (and_failed && !pred->or)
-                               continue;
-                       matched = pred->fn(pred, rec);
-                       if (!matched && !pred->or) {
-                               and_failed = 1;
-                               continue;
-                       } else if (matched && pred->or)
-                               return 1;
-               } else
-                       break;
-       }
+       buf = (char *)__get_free_page(GFP_TEMPORARY);
+       if (!buf)
+               return;
  
-       if (and_failed)
-               return 0;
+       append_filter_string(filter, "\n");
+       memset(buf, ' ', PAGE_SIZE);
+       if (pos > PAGE_SIZE - 128)
+               pos = 0;
+       buf[pos] = '^';
+       pbuf = &buf[pos] + 1;
  
-       return 1;
+       sprintf(pbuf, "\nparse_error: %s\n", err_text[ps->lasterr]);
+       append_filter_string(filter, buf);
+       free_page((unsigned long) buf);
  }
  
-void filter_print_preds(struct filter_pred **preds, struct trace_seq *s)
+void print_event_filter(struct ftrace_event_call *call, struct trace_seq *s)
  {
-       char *field_name;
-       struct filter_pred *pred;
-       int i;
+       struct event_filter *filter = call->filter;
  
-       if (!preds) {
+       mutex_lock(&filter_mutex);
+       if (filter->filter_string)
+               trace_seq_printf(s, "%s\n", filter->filter_string);
+       else
                 trace_seq_printf(s, "none\n");
-               return;
-       }
+       mutex_unlock(&filter_mutex);
+}
  
-       for (i = 0; i < MAX_FILTER_PRED; i++) {
-               if (preds[i]) {
-                       pred = preds[i];
-                       field_name = pred->field_name;
-                       if (i)
-                               trace_seq_printf(s, pred->or ? "|| " : "&& ");
-                       trace_seq_printf(s, "%s ", field_name);
-                       trace_seq_printf(s, pred->not ? "!= " : "== ");
-                       if (pred->str_val)
-                               trace_seq_printf(s, "%s\n", pred->str_val);
-                       else
-                               trace_seq_printf(s, "%llu\n", pred->val);
-               } else
-                       break;
-       }
+void print_subsystem_event_filter(struct event_subsystem *system,
+                                 struct trace_seq *s)
+{
+       struct event_filter *filter = system->filter;
+
+       mutex_lock(&filter_mutex);
+       if (filter->filter_string)
+               trace_seq_printf(s, "%s\n", filter->filter_string);
+       else
+               trace_seq_printf(s, "none\n");
+       mutex_unlock(&filter_mutex);
  }
  
  static struct ftrace_event_field *
@@ -150,284 +328,828 @@ find_event_field(struct ftrace_event_call *call, char *name)
         return NULL;
  }
  
-void filter_free_pred(struct filter_pred *pred)
+static void filter_free_pred(struct filter_pred *pred)
  {
         if (!pred)
                 return;
  
         kfree(pred->field_name);
-       kfree(pred->str_val);
         kfree(pred);
  }
  
-void filter_free_preds(struct ftrace_event_call *call)
+static void filter_clear_pred(struct filter_pred *pred)
  {
-       int i;
+       kfree(pred->field_name);
+       pred->field_name = NULL;
+       pred->str_len = 0;
+}
  
-       if (call->preds) {
-               for (i = 0; i < MAX_FILTER_PRED; i++)
-                       filter_free_pred(call->preds[i]);
-               kfree(call->preds);
-               call->preds = NULL;
+static int filter_set_pred(struct filter_pred *dest,
+                          struct filter_pred *src,
+                          filter_pred_fn_t fn)
+{
+       *dest = *src;
+       if (src->field_name) {
+               dest->field_name = kstrdup(src->field_name, GFP_KERNEL);
+               if (!dest->field_name)
+                       return -ENOMEM;
         }
+       dest->fn = fn;
+
+       return 0;
  }
  
-void filter_free_subsystem_preds(struct event_subsystem *system)
+static void filter_disable_preds(struct ftrace_event_call *call)
  {
-       struct ftrace_event_call *call = __start_ftrace_events;
+       struct event_filter *filter = call->filter;
         int i;
  
-       if (system->preds) {
-               for (i = 0; i < MAX_FILTER_PRED; i++)
-                       filter_free_pred(system->preds[i]);
-               kfree(system->preds);
-               system->preds = NULL;
-       }
+       call->filter_active = 0;
+       filter->n_preds = 0;
  
-       events_for_each(call) {
-               if (!call->name || !call->regfunc)
-                       continue;
+       for (i = 0; i < MAX_FILTER_PRED; i++)
+               filter->preds[i]->fn = filter_pred_none;
+}
+
+void destroy_preds(struct ftrace_event_call *call)
+{
+       struct event_filter *filter = call->filter;
+       int i;
  
-               if (!strcmp(call->system, system->name))
-                       filter_free_preds(call);
+       for (i = 0; i < MAX_FILTER_PRED; i++) {
+               if (filter->preds[i])
+                       filter_free_pred(filter->preds[i]);
         }
+       kfree(filter->preds);
+       kfree(filter);
+       call->filter = NULL;
  }
  
-static int __filter_add_pred(struct ftrace_event_call *call,
-                            struct filter_pred *pred)
+int init_preds(struct ftrace_event_call *call)
  {
+       struct event_filter *filter;
+       struct filter_pred *pred;
         int i;
  
-       if (call->preds && !pred->compound)
-               filter_free_preds(call);
+       filter = call->filter = kzalloc(sizeof(*filter), GFP_KERNEL);
+       if (!call->filter)
+               return -ENOMEM;
  
-       if (!call->preds) {
-               call->preds = kzalloc(MAX_FILTER_PRED * sizeof(pred),
-                                     GFP_KERNEL);
-               if (!call->preds)
-                       return -ENOMEM;
-       }
+       call->filter_active = 0;
+       filter->n_preds = 0;
+
+       filter->preds = kzalloc(MAX_FILTER_PRED * sizeof(pred), GFP_KERNEL);
+       if (!filter->preds)
+               goto oom;
  
         for (i = 0; i < MAX_FILTER_PRED; i++) {
-               if (!call->preds[i]) {
-                       call->preds[i] = pred;
-                       return 0;
+               pred = kzalloc(sizeof(*pred), GFP_KERNEL);
+               if (!pred)
+                       goto oom;
+               pred->fn = filter_pred_none;
+               filter->preds[i] = pred;
+       }
+
+       return 0;
+
+oom:
+       destroy_preds(call);
+
+       return -ENOMEM;
+}
+EXPORT_SYMBOL_GPL(init_preds);
+
+static void filter_free_subsystem_preds(struct event_subsystem *system)
+{
+       struct event_filter *filter = system->filter;
+       struct ftrace_event_call *call;
+       int i;
+
+       if (filter->n_preds) {
+               for (i = 0; i < filter->n_preds; i++)
+                       filter_free_pred(filter->preds[i]);
+               kfree(filter->preds);
+               filter->preds = NULL;
+               filter->n_preds = 0;
+       }
+
+       mutex_lock(&event_mutex);
+       list_for_each_entry(call, &ftrace_events, list) {
+               if (!call->define_fields)
+                       continue;
+
+               if (!strcmp(call->system, system->name)) {
+                       filter_disable_preds(call);
+                       remove_filter_string(call->filter);
                 }
         }
+       mutex_unlock(&event_mutex);
+}
+
+static int filter_add_pred_fn(struct filter_parse_state *ps,
+                             struct ftrace_event_call *call,
+                             struct filter_pred *pred,
+                             filter_pred_fn_t fn)
+{
+       struct event_filter *filter = call->filter;
+       int idx, err;
+
+       if (filter->n_preds == MAX_FILTER_PRED) {
+               parse_error(ps, FILT_ERR_TOO_MANY_PREDS, 0);
+               return -ENOSPC;
+       }
+
+       idx = filter->n_preds;
+       filter_clear_pred(filter->preds[idx]);
+       err = filter_set_pred(filter->preds[idx], pred, fn);
+       if (err)
+               return err;
  
-       return -ENOSPC;
+       filter->n_preds++;
+       call->filter_active = 1;
+
+       return 0;
  }
  
+enum {
+       FILTER_STATIC_STRING = 1,
+       FILTER_DYN_STRING
+};
+
  static int is_string_field(const char *type)
  {
+       if (strstr(type, "__data_loc") && strstr(type, "char"))
+               return FILTER_DYN_STRING;
+
         if (strchr(type, '[') && strstr(type, "char"))
-               return 1;
+               return FILTER_STATIC_STRING;
  
         return 0;
  }
  
-int filter_add_pred(struct ftrace_event_call *call, struct filter_pred *pred)
+static int is_legal_op(struct ftrace_event_field *field, int op)
  {
-       struct ftrace_event_field *field;
-
-       field = find_event_field(call, pred->field_name);
-       if (!field)
-               return -EINVAL;
+       if (is_string_field(field->type) && (op != OP_EQ && op != OP_NE))
+               return 0;
  
-       pred->offset = field->offset;
+       return 1;
+}
  
-       if (is_string_field(field->type)) {
-               if (!pred->str_val)
-                       return -EINVAL;
-               pred->fn = filter_pred_string;
-               pred->str_len = field->size;
-               return __filter_add_pred(call, pred);
-       } else {
-               if (pred->str_val)
-                       return -EINVAL;
-       }
+static filter_pred_fn_t select_comparison_fn(int op, int field_size,
+                                            int field_is_signed)
+{
+       filter_pred_fn_t fn = NULL;
  
-       switch (field->size) {
+       switch (field_size) {
         case 8:
-               pred->fn = filter_pred_64;
+               if (op == OP_EQ || op == OP_NE)
+                       fn = filter_pred_64;
+               else if (field_is_signed)
+                       fn = filter_pred_s64;
+               else
+                       fn = filter_pred_u64;
                 break;
         case 4:
-               pred->fn = filter_pred_32;
+               if (op == OP_EQ || op == OP_NE)
+                       fn = filter_pred_32;
+               else if (field_is_signed)
+                       fn = filter_pred_s32;
+               else
+                       fn = filter_pred_u32;
                 break;
         case 2:
-               pred->fn = filter_pred_16;
+               if (op == OP_EQ || op == OP_NE)
+                       fn = filter_pred_16;
+               else if (field_is_signed)
+                       fn = filter_pred_s16;
+               else
+                       fn = filter_pred_u16;
                 break;
         case 1:
-               pred->fn = filter_pred_8;
+               if (op == OP_EQ || op == OP_NE)
+                       fn = filter_pred_8;
+               else if (field_is_signed)
+                       fn = filter_pred_s8;
+               else
+                       fn = filter_pred_u8;
                 break;
-       default:
-               return -EINVAL;
         }
  
-       return __filter_add_pred(call, pred);
+       return fn;
  }
  
-static struct filter_pred *copy_pred(struct filter_pred *pred)
+static int filter_add_pred(struct filter_parse_state *ps,
+                          struct ftrace_event_call *call,
+                          struct filter_pred *pred)
  {
-       struct filter_pred *new_pred = kmalloc(sizeof(*pred), GFP_KERNEL);
-       if (!new_pred)
-               return NULL;
+       struct ftrace_event_field *field;
+       filter_pred_fn_t fn;
+       unsigned long long val;
+       int string_type;
+
+       pred->fn = filter_pred_none;
+
+       if (pred->op == OP_AND) {
+               pred->pop_n = 2;
+               return filter_add_pred_fn(ps, call, pred, filter_pred_and);
+       } else if (pred->op == OP_OR) {
+               pred->pop_n = 2;
+               return filter_add_pred_fn(ps, call, pred, filter_pred_or);
+       }
+
+       field = find_event_field(call, pred->field_name);
+       if (!field) {
+               parse_error(ps, FILT_ERR_FIELD_NOT_FOUND, 0);
+               return -EINVAL;
+       }
  
-       memcpy(new_pred, pred, sizeof(*pred));
+       pred->offset = field->offset;
  
-       if (pred->field_name) {
-               new_pred->field_name = kstrdup(pred->field_name, GFP_KERNEL);
-               if (!new_pred->field_name) {
-                       kfree(new_pred);
-                       return NULL;
-               }
+       if (!is_legal_op(field, pred->op)) {
+               parse_error(ps, FILT_ERR_ILLEGAL_FIELD_OP, 0);
+               return -EINVAL;
         }
  
-       if (pred->str_val) {
-               new_pred->str_val = kstrdup(pred->str_val, GFP_KERNEL);
-               if (!new_pred->str_val) {
-                       filter_free_pred(new_pred);
-                       return NULL;
+       string_type = is_string_field(field->type);
+       if (string_type) {
+               if (string_type == FILTER_STATIC_STRING)
+                       fn = filter_pred_string;
+               else
+                       fn = filter_pred_strloc;
+               pred->str_len = field->size;
+               if (pred->op == OP_NE)
+                       pred->not = 1;
+               return filter_add_pred_fn(ps, call, pred, fn);
+       } else {
+               if (strict_strtoull(pred->str_val, 0, &val)) {
+                       parse_error(ps, FILT_ERR_ILLEGAL_INTVAL, 0);
+                       return -EINVAL;
                 }
+               pred->val = val;
+       }
+
+       fn = select_comparison_fn(pred->op, field->size, field->is_signed);
+       if (!fn) {
+               parse_error(ps, FILT_ERR_INVALID_OP, 0);
+               return -EINVAL;
         }
  
-       return new_pred;
+       if (pred->op == OP_NE)
+               pred->not = 1;
+
+       return filter_add_pred_fn(ps, call, pred, fn);
  }
  
-int filter_add_subsystem_pred(struct event_subsystem *system,
-                             struct filter_pred *pred)
+static int filter_add_subsystem_pred(struct filter_parse_state *ps,
+                                    struct event_subsystem *system,
+                                    struct filter_pred *pred,
+                                    char *filter_string)
  {
-       struct ftrace_event_call *call = __start_ftrace_events;
-       struct filter_pred *event_pred;
-       int i;
-
-       if (system->preds && !pred->compound)
-               filter_free_subsystem_preds(system);
+       struct event_filter *filter = system->filter;
+       struct ftrace_event_call *call;
+       int err = 0;
  
-       if (!system->preds) {
-               system->preds = kzalloc(MAX_FILTER_PRED * sizeof(pred),
+       if (!filter->preds) {
+               filter->preds = kzalloc(MAX_FILTER_PRED * sizeof(pred),
                                         GFP_KERNEL);
-               if (!system->preds)
+
+               if (!filter->preds)
                         return -ENOMEM;
         }
  
-       for (i = 0; i < MAX_FILTER_PRED; i++) {
-               if (!system->preds[i]) {
-                       system->preds[i] = pred;
-                       break;
-               }
+       if (filter->n_preds == MAX_FILTER_PRED) {
+               parse_error(ps, FILT_ERR_TOO_MANY_PREDS, 0);
+               return -ENOSPC;
         }
  
-       if (i == MAX_FILTER_PRED)
-               return -ENOSPC;
+       filter->preds[filter->n_preds] = pred;
+       filter->n_preds++;
  
-       events_for_each(call) {
-               int err;
+       mutex_lock(&event_mutex);
+       list_for_each_entry(call, &ftrace_events, list) {
  
-               if (!call->name || !call->regfunc)
+               if (!call->define_fields)
                         continue;
  
                 if (strcmp(call->system, system->name))
                         continue;
  
-               if (!find_event_field(call, pred->field_name))
-                       continue;
+               err = filter_add_pred(ps, call, pred);
+               if (err) {
+                       mutex_unlock(&event_mutex);
+                       filter_free_subsystem_preds(system);
+                       parse_error(ps, FILT_ERR_BAD_SUBSYS_FILTER, 0);
+                       goto out;
+               }
+               replace_filter_string(call->filter, filter_string);
+       }
+       mutex_unlock(&event_mutex);
+out:
+       return err;
+}
  
-               event_pred = copy_pred(pred);
-               if (!event_pred)
-                       goto oom;
+static void parse_init(struct filter_parse_state *ps,
+                      struct filter_op *ops,
+                      char *infix_string)
+{
+       memset(ps, '\0', sizeof(*ps));
  
-               err = filter_add_pred(call, event_pred);
-               if (err)
-                       filter_free_pred(event_pred);
-               if (err == -ENOMEM)
-                       goto oom;
+       ps->infix.string = infix_string;
+       ps->infix.cnt = strlen(infix_string);
+       ps->ops = ops;
+
+       INIT_LIST_HEAD(&ps->opstack);
+       INIT_LIST_HEAD(&ps->postfix);
+}
+
+static char infix_next(struct filter_parse_state *ps)
+{
+       ps->infix.cnt--;
+
+       return ps->infix.string[ps->infix.tail++];
+}
+
+static char infix_peek(struct filter_parse_state *ps)
+{
+       if (ps->infix.tail == strlen(ps->infix.string))
+               return 0;
+
+       return ps->infix.string[ps->infix.tail];
+}
+
+static void infix_advance(struct filter_parse_state *ps)
+{
+       ps->infix.cnt--;
+       ps->infix.tail++;
+}
+
+static inline int is_precedence_lower(struct filter_parse_state *ps,
+                                     int a, int b)
+{
+       return ps->ops[a].precedence < ps->ops[b].precedence;
+}
+
+static inline int is_op_char(struct filter_parse_state *ps, char c)
+{
+       int i;
+
+       for (i = 0; strcmp(ps->ops[i].string, "OP_NONE"); i++) {
+               if (ps->ops[i].string[0] == c)
+                       return 1;
         }
  
         return 0;
+}
  
-oom:
-       system->preds[i] = NULL;
-       return -ENOMEM;
+static int infix_get_op(struct filter_parse_state *ps, char firstc)
+{
+       char nextc = infix_peek(ps);
+       char opstr[3];
+       int i;
+
+       opstr[0] = firstc;
+       opstr[1] = nextc;
+       opstr[2] = '\0';
+
+       for (i = 0; strcmp(ps->ops[i].string, "OP_NONE"); i++) {
+               if (!strcmp(opstr, ps->ops[i].string)) {
+                       infix_advance(ps);
+                       return ps->ops[i].id;
+               }
+       }
+
+       opstr[1] = '\0';
+
+       for (i = 0; strcmp(ps->ops[i].string, "OP_NONE"); i++) {
+               if (!strcmp(opstr, ps->ops[i].string))
+                       return ps->ops[i].id;
+       }
+
+       return OP_NONE;
  }
  
-int filter_parse(char **pbuf, struct filter_pred *pred)
+static inline void clear_operand_string(struct filter_parse_state *ps)
  {
-       char *tmp, *tok, *val_str = NULL;
-       int tok_n = 0;
+       memset(ps->operand.string, '\0', MAX_FILTER_STR_VAL);
+       ps->operand.tail = 0;
+}
  
-       /* field ==/!= number, or/and field ==/!= number, number */
-       while ((tok = strsep(pbuf, " \n"))) {
-               if (tok_n == 0) {
-                       if (!strcmp(tok, "0")) {
-                               pred->clear = 1;
-                               return 0;
-                       } else if (!strcmp(tok, "&&")) {
-                               pred->or = 0;
-                               pred->compound = 1;
-                       } else if (!strcmp(tok, "||")) {
-                               pred->or = 1;
-                               pred->compound = 1;
-                       } else
-                               pred->field_name = tok;
-                       tok_n = 1;
+static inline int append_operand_char(struct filter_parse_state *ps, char c)
+{
+       if (ps->operand.tail == MAX_FILTER_STR_VAL - 1)
+               return -EINVAL;
+
+       ps->operand.string[ps->operand.tail++] = c;
+
+       return 0;
+}
+
+static int filter_opstack_push(struct filter_parse_state *ps, int op)
+{
+       struct opstack_op *opstack_op;
+
+       opstack_op = kmalloc(sizeof(*opstack_op), GFP_KERNEL);
+       if (!opstack_op)
+               return -ENOMEM;
+
+       opstack_op->op = op;
+       list_add(&opstack_op->list, &ps->opstack);
+
+       return 0;
+}
+
+static int filter_opstack_empty(struct filter_parse_state *ps)
+{
+       return list_empty(&ps->opstack);
+}
+
+static int filter_opstack_top(struct filter_parse_state *ps)
+{
+       struct opstack_op *opstack_op;
+
+       if (filter_opstack_empty(ps))
+               return OP_NONE;
+
+       opstack_op = list_first_entry(&ps->opstack, struct opstack_op, list);
+
+       return opstack_op->op;
+}
+
+static int filter_opstack_pop(struct filter_parse_state *ps)
+{
+       struct opstack_op *opstack_op;
+       int op;
+
+       if (filter_opstack_empty(ps))
+               return OP_NONE;
+
+       opstack_op = list_first_entry(&ps->opstack, struct opstack_op, list);
+       op = opstack_op->op;
+       list_del(&opstack_op->list);
+
+       kfree(opstack_op);
+
+       return op;
+}
+
+static void filter_opstack_clear(struct filter_parse_state *ps)
+{
+       while (!filter_opstack_empty(ps))
+               filter_opstack_pop(ps);
+}
+
+static char *curr_operand(struct filter_parse_state *ps)
+{
+       return ps->operand.string;
+}
+
+static int postfix_append_operand(struct filter_parse_state *ps, char *operand)
+{
+       struct postfix_elt *elt;
+
+       elt = kmalloc(sizeof(*elt), GFP_KERNEL);
+       if (!elt)
+               return -ENOMEM;
+
+       elt->op = OP_NONE;
+       elt->operand = kstrdup(operand, GFP_KERNEL);
+       if (!elt->operand) {
+               kfree(elt);
+               return -ENOMEM;
+       }
+
+       list_add_tail(&elt->list, &ps->postfix);
+
+       return 0;
+}
+
+static int postfix_append_op(struct filter_parse_state *ps, int op)
+{
+       struct postfix_elt *elt;
+
+       elt = kmalloc(sizeof(*elt), GFP_KERNEL);
+       if (!elt)
+               return -ENOMEM;
+
+       elt->op = op;
+       elt->operand = NULL;
+
+       list_add_tail(&elt->list, &ps->postfix);
+
+       return 0;
+}
+
+static void postfix_clear(struct filter_parse_state *ps)
+{
+       struct postfix_elt *elt;
+
+       while (!list_empty(&ps->postfix)) {
+               elt = list_first_entry(&ps->postfix, struct postfix_elt, list);
+               kfree(elt->operand);
+               list_del(&elt->list);
+       }
+}
+
+static int filter_parse(struct filter_parse_state *ps)
+{
+       int in_string = 0;
+       int op, top_op;
+       char ch;
+
+       while ((ch = infix_next(ps))) {
+               if (ch == '"') {
+                       in_string ^= 1;
                         continue;
                 }
-               if (tok_n == 1) {
-                       if (!pred->field_name)
-                               pred->field_name = tok;
-                       else if (!strcmp(tok, "!="))
-                               pred->not = 1;
-                       else if (!strcmp(tok, "=="))
-                               pred->not = 0;
-                       else {
-                               pred->field_name = NULL;
+
+               if (in_string)
+                       goto parse_operand;
+
+               if (isspace(ch))
+                       continue;
+
+               if (is_op_char(ps, ch)) {
+                       op = infix_get_op(ps, ch);
+                       if (op == OP_NONE) {
+                               parse_error(ps, FILT_ERR_INVALID_OP, 0);
                                 return -EINVAL;
                         }
-                       tok_n = 2;
+
+                       if (strlen(curr_operand(ps))) {
+                               postfix_append_operand(ps, curr_operand(ps));
+                               clear_operand_string(ps);
+                       }
+
+                       while (!filter_opstack_empty(ps)) {
+                               top_op = filter_opstack_top(ps);
+                               if (!is_precedence_lower(ps, top_op, op)) {
+                                       top_op = filter_opstack_pop(ps);
+                                       postfix_append_op(ps, top_op);
+                                       continue;
+                               }
+                               break;
+                       }
+
+                       filter_opstack_push(ps, op);
                         continue;
                 }
-               if (tok_n == 2) {
-                       if (pred->compound) {
-                               if (!strcmp(tok, "!="))
-                                       pred->not = 1;
-                               else if (!strcmp(tok, "=="))
-                                       pred->not = 0;
-                               else {
-                                       pred->field_name = NULL;
-                                       return -EINVAL;
-                               }
-                       } else {
-                               val_str = tok;
-                               break; /* done */
+
+               if (ch == '(') {
+                       filter_opstack_push(ps, OP_OPEN_PAREN);
+                       continue;
+               }
+
+               if (ch == ')') {
+                       if (strlen(curr_operand(ps))) {
+                               postfix_append_operand(ps, curr_operand(ps));
+                               clear_operand_string(ps);
+                       }
+
+                       top_op = filter_opstack_pop(ps);
+                       while (top_op != OP_NONE) {
+                               if (top_op == OP_OPEN_PAREN)
+                                       break;
+                               postfix_append_op(ps, top_op);
+                               top_op = filter_opstack_pop(ps);
+                       }
+                       if (top_op == OP_NONE) {
+                               parse_error(ps, FILT_ERR_UNBALANCED_PAREN, 0);
+                               return -EINVAL;
                         }
-                       tok_n = 3;
                         continue;
                 }
-               if (tok_n == 3) {
-                       val_str = tok;
-                       break; /* done */
+parse_operand:
+               if (append_operand_char(ps, ch)) {
+                       parse_error(ps, FILT_ERR_OPERAND_TOO_LONG, 0);
+                       return -EINVAL;
                 }
         }
  
-       if (!val_str) {
-               pred->field_name = NULL;
-               return -EINVAL;
+       if (strlen(curr_operand(ps)))
+               postfix_append_operand(ps, curr_operand(ps));
+
+       while (!filter_opstack_empty(ps)) {
+               top_op = filter_opstack_pop(ps);
+               if (top_op == OP_NONE)
+                       break;
+               if (top_op == OP_OPEN_PAREN) {
+                       parse_error(ps, FILT_ERR_UNBALANCED_PAREN, 0);
+                       return -EINVAL;
+               }
+               postfix_append_op(ps, top_op);
         }
  
-       pred->field_name = kstrdup(pred->field_name, GFP_KERNEL);
-       if (!pred->field_name)
-               return -ENOMEM;
+       return 0;
+}
  
-       pred->val = simple_strtoull(val_str, &tmp, 0);
-       if (tmp == val_str) {
-               pred->str_val = kstrdup(val_str, GFP_KERNEL);
-               if (!pred->str_val)
-                       return -ENOMEM;
-       } else if (*tmp != '\0')
+static struct filter_pred *create_pred(int op, char *operand1, char *operand2)
+{
+       struct filter_pred *pred;
+
+       pred = kzalloc(sizeof(*pred), GFP_KERNEL);
+       if (!pred)
+               return NULL;
+
+       pred->field_name = kstrdup(operand1, GFP_KERNEL);
+       if (!pred->field_name) {
+               kfree(pred);
+               return NULL;
+       }
+
+       strcpy(pred->str_val, operand2);
+       pred->str_len = strlen(operand2);
+
+       pred->op = op;
+
+       return pred;
+}
+
+static struct filter_pred *create_logical_pred(int op)
+{
+       struct filter_pred *pred;
+
+       pred = kzalloc(sizeof(*pred), GFP_KERNEL);
+       if (!pred)
+               return NULL;
+
+       pred->op = op;
+
+       return pred;
+}
+
+static int check_preds(struct filter_parse_state *ps)
+{
+       int n_normal_preds = 0, n_logical_preds = 0;
+       struct postfix_elt *elt;
+
+       list_for_each_entry(elt, &ps->postfix, list) {
+               if (elt->op == OP_NONE)
+                       continue;
+
+               if (elt->op == OP_AND || elt->op == OP_OR) {
+                       n_logical_preds++;
+                       continue;
+               }
+               n_normal_preds++;
+       }
+
+       if (!n_normal_preds || n_logical_preds >= n_normal_preds) {
+               parse_error(ps, FILT_ERR_INVALID_FILTER, 0);
                 return -EINVAL;
+       }
  
         return 0;
  }
  
+static int replace_preds(struct event_subsystem *system,
+                        struct ftrace_event_call *call,
+                        struct filter_parse_state *ps,
+                        char *filter_string)
+{
+       char *operand1 = NULL, *operand2 = NULL;
+       struct filter_pred *pred;
+       struct postfix_elt *elt;
+       int err;
+
+       err = check_preds(ps);
+       if (err)
+               return err;
+
+       list_for_each_entry(elt, &ps->postfix, list) {
+               if (elt->op == OP_NONE) {
+                       if (!operand1)
+                               operand1 = elt->operand;
+                       else if (!operand2)
+                               operand2 = elt->operand;
+                       else {
+                               parse_error(ps, FILT_ERR_TOO_MANY_OPERANDS, 0);
+                               return -EINVAL;
+                       }
+                       continue;
+               }
+
+               if (elt->op == OP_AND || elt->op == OP_OR) {
+                       pred = create_logical_pred(elt->op);
+                       if (call) {
+                               err = filter_add_pred(ps, call, pred);
+                               filter_free_pred(pred);
+                       } else
+                               err = filter_add_subsystem_pred(ps, system,
+                                                       pred, filter_string);
+                       if (err)
+                               return err;
+
+                       operand1 = operand2 = NULL;
+                       continue;
+               }
+
+               if (!operand1 || !operand2) {
+                       parse_error(ps, FILT_ERR_MISSING_FIELD, 0);
+                       return -EINVAL;
+               }
+
+               pred = create_pred(elt->op, operand1, operand2);
+               if (call) {
+                       err = filter_add_pred(ps, call, pred);
+                       filter_free_pred(pred);
+               } else
+                       err = filter_add_subsystem_pred(ps, system, pred,
+                                                       filter_string);
+               if (err)
+                       return err;
+
+               operand1 = operand2 = NULL;
+       }
+
+       return 0;
+}
+
+int apply_event_filter(struct ftrace_event_call *call, char *filter_string)
+{
+       int err;
+
+       struct filter_parse_state *ps;
+
+       mutex_lock(&filter_mutex);
+
+       if (!strcmp(strstrip(filter_string), "0")) {
+               filter_disable_preds(call);
+               remove_filter_string(call->filter);
+               mutex_unlock(&filter_mutex);
+               return 0;
+       }
+
+       err = -ENOMEM;
+       ps = kzalloc(sizeof(*ps), GFP_KERNEL);
+       if (!ps)
+               goto out_unlock;
+
+       filter_disable_preds(call);
+       replace_filter_string(call->filter, filter_string);
+
+       parse_init(ps, filter_ops, filter_string);
+       err = filter_parse(ps);
+       if (err) {
+               append_filter_err(ps, call->filter);
+               goto out;
+       }
+
+       err = replace_preds(NULL, call, ps, filter_string);
+       if (err)
+               append_filter_err(ps, call->filter);
+
+out:
+       filter_opstack_clear(ps);
+       postfix_clear(ps);
+       kfree(ps);
+out_unlock:
+       mutex_unlock(&filter_mutex);
+
+       return err;
+}
+
+int apply_subsystem_event_filter(struct event_subsystem *system,
+                                char *filter_string)
+{
+       int err;
+
+       struct filter_parse_state *ps;
+
+       mutex_lock(&filter_mutex);
+
+       if (!strcmp(strstrip(filter_string), "0")) {
+               filter_free_subsystem_preds(system);
+               remove_filter_string(system->filter);
+               mutex_unlock(&filter_mutex);
+               return 0;
+       }
+
+       err = -ENOMEM;
+       ps = kzalloc(sizeof(*ps), GFP_KERNEL);
+       if (!ps)
+               goto out_unlock;
+
+       filter_free_subsystem_preds(system);
+       replace_filter_string(system->filter, filter_string);
+
+       parse_init(ps, filter_ops, filter_string);
+       err = filter_parse(ps);
+       if (err) {
+               append_filter_err(ps, system->filter);
+               goto out;
+       }
+
+       err = replace_preds(system, NULL, ps, filter_string);
+       if (err)
+               append_filter_err(ps, system->filter);
+
+out:
+       filter_opstack_clear(ps);
+       postfix_clear(ps);
+       kfree(ps);
+out_unlock:
+       mutex_unlock(&filter_mutex);
+
+       return err;
+}
  
diff --git a/kernel/trace/trace_events_stage_1.h b/kernel/trace/trace_events_stage_1.h

deleted file mode 100644 (file)

index 38985f9..0000000
--- a/kernel/trace/trace_events_stage_1.h
+++ /dev/null
@@ -1,39 +0,0 @@
-/*
- * Stage 1 of the trace events.
- *
- * Override the macros in <trace/trace_event_types.h> to include the following:
- *
- * struct ftrace_raw_<call> {
- *     struct trace_entry              ent;
- *     <type>                          <item>;
- *     <type2>                         <item2>[<len>];
- *     [...]
- * };
- *
- * The <type> <item> is created by the __field(type, item) macro or
- * the __array(type2, item2, len) macro.
- * We simply do "type item;", and that will create the fields
- * in the structure.
- */
-
-#undef TRACE_FORMAT
-#define TRACE_FORMAT(call, proto, args, fmt)
-
-#undef __array
-#define __array(type, item, len)       type    item[len];
-
-#undef __field
-#define __field(type, item)            type    item;
-
-#undef TP_STRUCT__entry
-#define TP_STRUCT__entry(args...) args
-
-#undef TRACE_EVENT
-#define TRACE_EVENT(name, proto, args, tstruct, assign, print) \
-       struct ftrace_raw_##name {                              \
-               struct trace_entry      ent;                    \
-               tstruct                                         \
-       };                                                      \
-       static struct ftrace_event_call event_##name
-
-#include <trace/trace_event_types.h>
diff --git a/kernel/trace/trace_events_stage_2.h b/kernel/trace/trace_events_stage_2.h

deleted file mode 100644 (file)

index d363c66..0000000
--- a/kernel/trace/trace_events_stage_2.h
+++ /dev/null
@@ -1,176 +0,0 @@
-/*
- * Stage 2 of the trace events.
- *
- * Override the macros in <trace/trace_event_types.h> to include the following:
- *
- * enum print_line_t
- * ftrace_raw_output_<call>(struct trace_iterator *iter, int flags)
- * {
- *     struct trace_seq *s = &iter->seq;
- *     struct ftrace_raw_<call> *field; <-- defined in stage 1
- *     struct trace_entry *entry;
- *     int ret;
- *
- *     entry = iter->ent;
- *
- *     if (entry->type != event_<call>.id) {
- *             WARN_ON_ONCE(1);
- *             return TRACE_TYPE_UNHANDLED;
- *     }
- *
- *     field = (typeof(field))entry;
- *
- *     ret = trace_seq_printf(s, <TP_printk> "\n");
- *     if (!ret)
- *             return TRACE_TYPE_PARTIAL_LINE;
- *
- *     return TRACE_TYPE_HANDLED;
- * }
- *
- * This is the method used to print the raw event to the trace
- * output format. Note, this is not needed if the data is read
- * in binary.
- */
-
-#undef __entry
-#define __entry field
-
-#undef TP_printk
-#define TP_printk(fmt, args...) fmt "\n", args
-
-#undef TRACE_EVENT
-#define TRACE_EVENT(call, proto, args, tstruct, assign, print)         \
-enum print_line_t                                                      \
-ftrace_raw_output_##call(struct trace_iterator *iter, int flags)       \
-{                                                                      \
-       struct trace_seq *s = &iter->seq;                               \
-       struct ftrace_raw_##call *field;                                \
-       struct trace_entry *entry;                                      \
-       int ret;                                                        \
-                                                                       \
-       entry = iter->ent;                                              \
-                                                                       \
-       if (entry->type != event_##call.id) {                           \
-               WARN_ON_ONCE(1);                                        \
-               return TRACE_TYPE_UNHANDLED;                            \
-       }                                                               \
-                                                                       \
-       field = (typeof(field))entry;                                   \
-                                                                       \
-       ret = trace_seq_printf(s, #call ": " print);                    \
-       if (!ret)                                                       \
-               return TRACE_TYPE_PARTIAL_LINE;                         \
-                                                                       \
-       return TRACE_TYPE_HANDLED;                                      \
-}
-       
-#include <trace/trace_event_types.h>
-
-/*
- * Setup the showing format of trace point.
- *
- * int
- * ftrace_format_##call(struct trace_seq *s)
- * {
- *     struct ftrace_raw_##call field;
- *     int ret;
- *
- *     ret = trace_seq_printf(s, #type " " #item ";"
- *                            " offset:%u; size:%u;\n",
- *                            offsetof(struct ftrace_raw_##call, item),
- *                            sizeof(field.type));
- *
- * }
- */
-
-#undef TP_STRUCT__entry
-#define TP_STRUCT__entry(args...) args
-
-#undef __field
-#define __field(type, item)                                    \
-       ret = trace_seq_printf(s, "\tfield:" #type " " #item ";\t"      \
-                              "offset:%u;\tsize:%u;\n",                \
-                              (unsigned int)offsetof(typeof(field), item), \
-                              (unsigned int)sizeof(field.item));       \
-       if (!ret)                                                       \
-               return 0;
-
-#undef __array
-#define __array(type, item, len)                                               \
-       ret = trace_seq_printf(s, "\tfield:" #type " " #item "[" #len "];\t"    \
-                              "offset:%u;\tsize:%u;\n",                \
-                              (unsigned int)offsetof(typeof(field), item), \
-                              (unsigned int)sizeof(field.item));       \
-       if (!ret)                                                       \
-               return 0;
-
-#undef __entry
-#define __entry REC
-
-#undef TP_printk
-#define TP_printk(fmt, args...) "%s, %s\n", #fmt, __stringify(args)
-
-#undef TP_fast_assign
-#define TP_fast_assign(args...) args
-
-#undef TRACE_EVENT
-#define TRACE_EVENT(call, proto, args, tstruct, func, print)           \
-static int                                                             \
-ftrace_format_##call(struct trace_seq *s)                              \
-{                                                                      \
-       struct ftrace_raw_##call field;                                 \
-       int ret;                                                        \
-                                                                       \
-       tstruct;                                                        \
-                                                                       \
-       trace_seq_printf(s, "\nprint fmt: " print);                     \
-                                                                       \
-       return ret;                                                     \
-}
-
-#include <trace/trace_event_types.h>
-
-#undef __field
-#define __field(type, item)                                            \
-       ret = trace_define_field(event_call, #type, #item,              \
-                                offsetof(typeof(field), item),         \
-                                sizeof(field.item));                   \
-       if (ret)                                                        \
-               return ret;
-
-#undef __array
-#define __array(type, item, len)                                       \
-       ret = trace_define_field(event_call, #type "[" #len "]", #item, \
-                                offsetof(typeof(field), item),         \
-                                sizeof(field.item));                   \
-       if (ret)                                                        \
-               return ret;
-
-#define __common_field(type, item)                                     \
-       ret = trace_define_field(event_call, #type, "common_" #item,    \
-                                offsetof(typeof(field.ent), item),     \
-                                sizeof(field.ent.item));               \
-       if (ret)                                                        \
-               return ret;
-
-#undef TRACE_EVENT
-#define TRACE_EVENT(call, proto, args, tstruct, func, print)           \
-int                                                                    \
-ftrace_define_fields_##call(void)                                      \
-{                                                                      \
-       struct ftrace_raw_##call field;                                 \
-       struct ftrace_event_call *event_call = &event_##call;           \
-       int ret;                                                        \
-                                                                       \
-       __common_field(unsigned char, type);                            \
-       __common_field(unsigned char, flags);                           \
-       __common_field(unsigned char, preempt_count);                   \
-       __common_field(int, pid);                                       \
-       __common_field(int, tgid);                                      \
-                                                                       \
-       tstruct;                                                        \
-                                                                       \
-       return ret;                                                     \
-}
-
-#include <trace/trace_event_types.h>
diff --git a/kernel/trace/trace_events_stage_3.h b/kernel/trace/trace_events_stage_3.h

deleted file mode 100644 (file)

index 9d2fa78..0000000
--- a/kernel/trace/trace_events_stage_3.h
+++ /dev/null
@@ -1,281 +0,0 @@
-/*
- * Stage 3 of the trace events.
- *
- * Override the macros in <trace/trace_event_types.h> to include the following:
- *
- * static void ftrace_event_<call>(proto)
- * {
- *     event_trace_printk(_RET_IP_, "<call>: " <fmt>);
- * }
- *
- * static int ftrace_reg_event_<call>(void)
- * {
- *     int ret;
- *
- *     ret = register_trace_<call>(ftrace_event_<call>);
- *     if (!ret)
- *             pr_info("event trace: Could not activate trace point "
- *                     "probe to  <call>");
- *     return ret;
- * }
- *
- * static void ftrace_unreg_event_<call>(void)
- * {
- *     unregister_trace_<call>(ftrace_event_<call>);
- * }
- *
- * For those macros defined with TRACE_FORMAT:
- *
- * static struct ftrace_event_call __used
- * __attribute__((__aligned__(4)))
- * __attribute__((section("_ftrace_events"))) event_<call> = {
- *     .name                   = "<call>",
- *     .regfunc                = ftrace_reg_event_<call>,
- *     .unregfunc              = ftrace_unreg_event_<call>,
- * }
- *
- *
- * For those macros defined with TRACE_EVENT:
- *
- * static struct ftrace_event_call event_<call>;
- *
- * static void ftrace_raw_event_<call>(proto)
- * {
- *     struct ring_buffer_event *event;
- *     struct ftrace_raw_<call> *entry; <-- defined in stage 1
- *     unsigned long irq_flags;
- *     int pc;
- *
- *     local_save_flags(irq_flags);
- *     pc = preempt_count();
- *
- *     event = trace_current_buffer_lock_reserve(event_<call>.id,
- *                               sizeof(struct ftrace_raw_<call>),
- *                               irq_flags, pc);
- *     if (!event)
- *             return;
- *     entry   = ring_buffer_event_data(event);
- *
- *     <assign>;  <-- Here we assign the entries by the __field and
- *                     __array macros.
- *
- *     trace_current_buffer_unlock_commit(event, irq_flags, pc);
- * }
- *
- * static int ftrace_raw_reg_event_<call>(void)
- * {
- *     int ret;
- *
- *     ret = register_trace_<call>(ftrace_raw_event_<call>);
- *     if (!ret)
- *             pr_info("event trace: Could not activate trace point "
- *                     "probe to <call>");
- *     return ret;
- * }
- *
- * static void ftrace_unreg_event_<call>(void)
- * {
- *     unregister_trace_<call>(ftrace_raw_event_<call>);
- * }
- *
- * static struct trace_event ftrace_event_type_<call> = {
- *     .trace                  = ftrace_raw_output_<call>, <-- stage 2
- * };
- *
- * static int ftrace_raw_init_event_<call>(void)
- * {
- *     int id;
- *
- *     id = register_ftrace_event(&ftrace_event_type_<call>);
- *     if (!id)
- *             return -ENODEV;
- *     event_<call>.id = id;
- *     return 0;
- * }
- *
- * static struct ftrace_event_call __used
- * __attribute__((__aligned__(4)))
- * __attribute__((section("_ftrace_events"))) event_<call> = {
- *     .name                   = "<call>",
- *     .system                 = "<system>",
- *     .raw_init               = ftrace_raw_init_event_<call>,
- *     .regfunc                = ftrace_reg_event_<call>,
- *     .unregfunc              = ftrace_unreg_event_<call>,
- *     .show_format            = ftrace_format_<call>,
- * }
- *
- */
-
-#undef TP_FMT
-#define TP_FMT(fmt, args...)   fmt "\n", ##args
-
-#ifdef CONFIG_EVENT_PROFILE
-#define _TRACE_PROFILE(call, proto, args)                              \
-static void ftrace_profile_##call(proto)                               \
-{                                                                      \
-       extern void perf_tpcounter_event(int);                          \
-       perf_tpcounter_event(event_##call.id);                          \
-}                                                                      \
-                                                                       \
-static int ftrace_profile_enable_##call(struct ftrace_event_call *call) \
-{                                                                      \
-       int ret = 0;                                                    \
-                                                                       \
-       if (!atomic_inc_return(&call->profile_count))                   \
-               ret = register_trace_##call(ftrace_profile_##call);     \
-                                                                       \
-       return ret;                                                     \
-}                                                                      \
-                                                                       \
-static void ftrace_profile_disable_##call(struct ftrace_event_call *call) \
-{                                                                      \
-       if (atomic_add_negative(-1, &call->profile_count))              \
-               unregister_trace_##call(ftrace_profile_##call);         \
-}
-
-#define _TRACE_PROFILE_INIT(call)                                      \
-       .profile_count = ATOMIC_INIT(-1),                               \
-       .profile_enable = ftrace_profile_enable_##call,                 \
-       .profile_disable = ftrace_profile_disable_##call,
-
-#else
-#define _TRACE_PROFILE(call, proto, args)
-#define _TRACE_PROFILE_INIT(call)
-#endif
-
-#define _TRACE_FORMAT(call, proto, args, fmt)                          \
-static void ftrace_event_##call(proto)                                 \
-{                                                                      \
-       event_trace_printk(_RET_IP_, #call ": " fmt);                   \
-}                                                                      \
-                                                                       \
-static int ftrace_reg_event_##call(void)                               \
-{                                                                      \
-       int ret;                                                        \
-                                                                       \
-       ret = register_trace_##call(ftrace_event_##call);               \
-       if (ret)                                                        \
-               pr_info("event trace: Could not activate trace point "  \
-                       "probe to " #call "\n");                        \
-       return ret;                                                     \
-}                                                                      \
-                                                                       \
-static void ftrace_unreg_event_##call(void)                            \
-{                                                                      \
-       unregister_trace_##call(ftrace_event_##call);                   \
-}                                                                      \
-                                                                       \
-static struct ftrace_event_call event_##call;                          \
-                                                                       \
-static int ftrace_init_event_##call(void)                              \
-{                                                                      \
-       int id;                                                         \
-                                                                       \
-       id = register_ftrace_event(NULL);                               \
-       if (!id)                                                        \
-               return -ENODEV;                                         \
-       event_##call.id = id;                                           \
-       return 0;                                                       \
-}
-
-#undef TRACE_FORMAT
-#define TRACE_FORMAT(call, proto, args, fmt)                           \
-_TRACE_FORMAT(call, PARAMS(proto), PARAMS(args), PARAMS(fmt))          \
-_TRACE_PROFILE(call, PARAMS(proto), PARAMS(args))                      \
-static struct ftrace_event_call __used                                 \
-__attribute__((__aligned__(4)))                                                \
-__attribute__((section("_ftrace_events"))) event_##call = {            \
-       .name                   = #call,                                \
-       .system                 = __stringify(TRACE_SYSTEM),            \
-       .raw_init               = ftrace_init_event_##call,             \
-       .regfunc                = ftrace_reg_event_##call,              \
-       .unregfunc              = ftrace_unreg_event_##call,            \
-       _TRACE_PROFILE_INIT(call)                                       \
-}
-
-#undef __entry
-#define __entry entry
-
-#undef TRACE_EVENT
-#define TRACE_EVENT(call, proto, args, tstruct, assign, print)         \
-_TRACE_PROFILE(call, PARAMS(proto), PARAMS(args))                      \
-                                                                       \
-static struct ftrace_event_call event_##call;                          \
-                                                                       \
-static void ftrace_raw_event_##call(proto)                             \
-{                                                                      \
-       struct ftrace_event_call *call = &event_##call;                 \
-       struct ring_buffer_event *event;                                \
-       struct ftrace_raw_##call *entry;                                \
-       unsigned long irq_flags;                                        \
-       int pc;                                                         \
-                                                                       \
-       local_save_flags(irq_flags);                                    \
-       pc = preempt_count();                                           \
-                                                                       \
-       event = trace_current_buffer_lock_reserve(event_##call.id,      \
-                                 sizeof(struct ftrace_raw_##call),     \
-                                 irq_flags, pc);                       \
-       if (!event)                                                     \
-               return;                                                 \
-       entry   = ring_buffer_event_data(event);                        \
-                                                                       \
-       assign;                                                         \
-                                                                       \
-       if (call->preds && !filter_match_preds(call, entry))            \
-               ring_buffer_event_discard(event);                       \
-                                                                       \
-       trace_nowake_buffer_unlock_commit(event, irq_flags, pc);        \
-                                                                       \
-}                                                                      \
-                                                                       \
-static int ftrace_raw_reg_event_##call(void)                           \
-{                                                                      \
-       int ret;                                                        \
-                                                                       \
-       ret = register_trace_##call(ftrace_raw_event_##call);           \
-       if (ret)                                                        \
-               pr_info("event trace: Could not activate trace point "  \
-                       "probe to " #call "\n");                        \
-       return ret;                                                     \
-}                                                                      \
-                                                                       \
-static void ftrace_raw_unreg_event_##call(void)                                \
-{                                                                      \
-       unregister_trace_##call(ftrace_raw_event_##call);               \
-}                                                                      \
-                                                                       \
-static struct trace_event ftrace_event_type_##call = {                 \
-       .trace                  = ftrace_raw_output_##call,             \
-};                                                                     \
-                                                                       \
-static int ftrace_raw_init_event_##call(void)                          \
-{                                                                      \
-       int id;                                                         \
-                                                                       \
-       id = register_ftrace_event(&ftrace_event_type_##call);          \
-       if (!id)                                                        \
-               return -ENODEV;                                         \
-       event_##call.id = id;                                           \
-       INIT_LIST_HEAD(&event_##call.fields);                           \
-       return 0;                                                       \
-}                                                                      \
-                                                                       \
-static struct ftrace_event_call __used                                 \
-__attribute__((__aligned__(4)))                                                \
-__attribute__((section("_ftrace_events"))) event_##call = {            \
-       .name                   = #call,                                \
-       .system                 = __stringify(TRACE_SYSTEM),            \
-       .raw_init               = ftrace_raw_init_event_##call,         \
-       .regfunc                = ftrace_raw_reg_event_##call,          \
-       .unregfunc              = ftrace_raw_unreg_event_##call,        \
-       .show_format            = ftrace_format_##call,                 \
-       .define_fields          = ftrace_define_fields_##call,          \
-       _TRACE_PROFILE_INIT(call)                                       \
-}
-
-#include <trace/trace_event_types.h>
-
-#undef _TRACE_PROFILE
-#undef _TRACE_PROFILE_INIT
-
diff --git a/kernel/trace/trace_export.c b/kernel/trace/trace_export.c

index 07a22c33ebf3c31b4d07b7125a9062118ca90490..d06cf898dc86aeb012a5f2e5bb1abed214aa8102 100644 (file)
--- a/kernel/trace/trace_export.c
+++ b/kernel/trace/trace_export.c
@@ -19,8 +19,12 @@
  #undef TRACE_STRUCT
  #define TRACE_STRUCT(args...) args
  
+extern void __bad_type_size(void);
+
  #undef TRACE_FIELD
  #define TRACE_FIELD(type, item, assign)                                        \
+       if (sizeof(type) != sizeof(field.item))                         \
+               __bad_type_size();                                      \
         ret = trace_seq_printf(s, "\tfield:" #type " " #item ";\t"      \
                                "offset:%u;\tsize:%u;\n",                \
                                (unsigned int)offsetof(typeof(field), item), \
@@ -30,7 +34,7 @@
  
  
  #undef TRACE_FIELD_SPECIAL
-#define TRACE_FIELD_SPECIAL(type_item, item, cmd)                      \
+#define TRACE_FIELD_SPECIAL(type_item, item, len, cmd)                 \
         ret = trace_seq_printf(s, "\tfield special:" #type_item ";\t"   \
                                "offset:%u;\tsize:%u;\n",                \
                                (unsigned int)offsetof(typeof(field), item), \
@@ -46,6 +50,9 @@
         if (!ret)                                                       \
                 return 0;
  
+#undef TRACE_FIELD_SIGN
+#define TRACE_FIELD_SIGN(type, item, assign, is_signed)        \
+       TRACE_FIELD(type, item, assign)
  
  #undef TP_RAW_FMT
  #define TP_RAW_FMT(args...) args
@@ -65,6 +72,22 @@ ftrace_format_##call(struct trace_seq *s)                            \
         return ret;                                                     \
  }
  
+#undef TRACE_EVENT_FORMAT_NOFILTER
+#define TRACE_EVENT_FORMAT_NOFILTER(call, proto, args, fmt, tstruct,   \
+                                   tpfmt)                              \
+static int                                                             \
+ftrace_format_##call(struct trace_seq *s)                              \
+{                                                                      \
+       struct args field;                                              \
+       int ret;                                                        \
+                                                                       \
+       tstruct;                                                        \
+                                                                       \
+       trace_seq_printf(s, "\nprint fmt: \"%s\"\n", tpfmt);            \
+                                                                       \
+       return ret;                                                     \
+}
+
  #include "trace_event_types.h"
  
  #undef TRACE_ZERO_CHAR
@@ -78,6 +101,10 @@ ftrace_format_##call(struct trace_seq *s)                           \
  #define TRACE_FIELD(type, item, assign)\
         entry->item = assign;
  
+#undef TRACE_FIELD_SIGN
+#define TRACE_FIELD_SIGN(type, item, assign, is_signed)        \
+       TRACE_FIELD(type, item, assign)
+
  #undef TP_CMD
  #define TP_CMD(cmd...) cmd
  
@@ -85,18 +112,95 @@ ftrace_format_##call(struct trace_seq *s)                          \
  #define TRACE_ENTRY    entry
  
  #undef TRACE_FIELD_SPECIAL
-#define TRACE_FIELD_SPECIAL(type_item, item, cmd) \
+#define TRACE_FIELD_SPECIAL(type_item, item, len, cmd) \
         cmd;
  
  #undef TRACE_EVENT_FORMAT
  #define TRACE_EVENT_FORMAT(call, proto, args, fmt, tstruct, tpfmt)     \
+int ftrace_define_fields_##call(void);                                 \
+static int ftrace_raw_init_event_##call(void);                         \
+                                                                       \
+struct ftrace_event_call __used                                                \
+__attribute__((__aligned__(4)))                                                \
+__attribute__((section("_ftrace_events"))) event_##call = {            \
+       .name                   = #call,                                \
+       .id                     = proto,                                \
+       .system                 = __stringify(TRACE_SYSTEM),            \
+       .raw_init               = ftrace_raw_init_event_##call,         \
+       .show_format            = ftrace_format_##call,                 \
+       .define_fields          = ftrace_define_fields_##call,          \
+};                                                                     \
+static int ftrace_raw_init_event_##call(void)                          \
+{                                                                      \
+       INIT_LIST_HEAD(&event_##call.fields);                           \
+       init_preds(&event_##call);                                      \
+       return 0;                                                       \
+}                                                                      \
+
+#undef TRACE_EVENT_FORMAT_NOFILTER
+#define TRACE_EVENT_FORMAT_NOFILTER(call, proto, args, fmt, tstruct,   \
+                                   tpfmt)                              \
                                                                         \
-static struct ftrace_event_call __used                                 \
+struct ftrace_event_call __used                                                \
  __attribute__((__aligned__(4)))                                                \
  __attribute__((section("_ftrace_events"))) event_##call = {            \
         .name                   = #call,                                \
         .id                     = proto,                                \
         .system                 = __stringify(TRACE_SYSTEM),            \
         .show_format            = ftrace_format_##call,                 \
+};
+
+#include "trace_event_types.h"
+
+#undef TRACE_FIELD
+#define TRACE_FIELD(type, item, assign)                                        \
+       ret = trace_define_field(event_call, #type, #item,              \
+                                offsetof(typeof(field), item),         \
+                                sizeof(field.item), is_signed_type(type));     \
+       if (ret)                                                        \
+               return ret;
+
+#undef TRACE_FIELD_SPECIAL
+#define TRACE_FIELD_SPECIAL(type, item, len, cmd)                      \
+       ret = trace_define_field(event_call, #type "[" #len "]", #item, \
+                                offsetof(typeof(field), item),         \
+                                sizeof(field.item), 0);                \
+       if (ret)                                                        \
+               return ret;
+
+#undef TRACE_FIELD_SIGN
+#define TRACE_FIELD_SIGN(type, item, assign, is_signed)                        \
+       ret = trace_define_field(event_call, #type, #item,              \
+                                offsetof(typeof(field), item),         \
+                                sizeof(field.item), is_signed);        \
+       if (ret)                                                        \
+               return ret;
+
+#undef TRACE_FIELD_ZERO_CHAR
+#define TRACE_FIELD_ZERO_CHAR(item)
+
+#undef TRACE_EVENT_FORMAT
+#define TRACE_EVENT_FORMAT(call, proto, args, fmt, tstruct, tpfmt)     \
+int                                                                    \
+ftrace_define_fields_##call(void)                                      \
+{                                                                      \
+       struct ftrace_event_call *event_call = &event_##call;           \
+       struct args field;                                              \
+       int ret;                                                        \
+                                                                       \
+       __common_field(unsigned char, type, 0);                         \
+       __common_field(unsigned char, flags, 0);                        \
+       __common_field(unsigned char, preempt_count, 0);                \
+       __common_field(int, pid, 1);                                    \
+       __common_field(int, tgid, 1);                                   \
+                                                                       \
+       tstruct;                                                        \
+                                                                       \
+       return ret;                                                     \
  }
+
+#undef TRACE_EVENT_FORMAT_NOFILTER
+#define TRACE_EVENT_FORMAT_NOFILTER(call, proto, args, fmt, tstruct,   \
+                                   tpfmt)
+
  #include "trace_event_types.h"
diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c

index d28687e7b3a7b36859f1ac48fa1ac62d5e52f456..8b592418d8b28953a28e40c2202a50cb2b40dd37 100644 (file)
--- a/kernel/trace/trace_functions_graph.c
+++ b/kernel/trace/trace_functions_graph.c
@@ -65,6 +65,12 @@ ftrace_push_return_trace(unsigned long ret, unsigned long func, int *depth)
         if (!current->ret_stack)
                 return -EBUSY;
  
+       /*
+        * We must make sure the ret_stack is tested before we read
+        * anything else.
+        */
+       smp_rmb();
+
         /* The return trace stack is full */
         if (current->curr_ret_stack == FTRACE_RETFUNC_DEPTH - 1) {
                 atomic_inc(&current->trace_overrun);
@@ -78,13 +84,14 @@ ftrace_push_return_trace(unsigned long ret, unsigned long func, int *depth)
         current->ret_stack[index].ret = ret;
         current->ret_stack[index].func = func;
         current->ret_stack[index].calltime = calltime;
+       current->ret_stack[index].subtime = 0;
         *depth = index;
  
         return 0;
  }
  
  /* Retrieve a function return address to the trace stack on thread info.*/
-void
+static void
  ftrace_pop_return_trace(struct ftrace_graph_ret *trace, unsigned long *ret)
  {
         int index;
@@ -104,9 +111,6 @@ ftrace_pop_return_trace(struct ftrace_graph_ret *trace, unsigned long *ret)
         trace->calltime = current->ret_stack[index].calltime;
         trace->overrun = atomic_read(&current->trace_overrun);
         trace->depth = index;
-       barrier();
-       current->curr_ret_stack--;
-
  }
  
  /*
@@ -121,6 +125,8 @@ unsigned long ftrace_return_to_handler(void)
         ftrace_pop_return_trace(&trace, &ret);
         trace.rettime = trace_clock_local();
         ftrace_graph_return(&trace);
+       barrier();
+       current->curr_ret_stack--;
  
         if (unlikely(!ret)) {
                 ftrace_graph_stop();
@@ -426,8 +432,8 @@ print_graph_irq(struct trace_iterator *iter, unsigned long addr,
         return TRACE_TYPE_HANDLED;
  }
  
-static enum print_line_t
-print_graph_duration(unsigned long long duration, struct trace_seq *s)
+enum print_line_t
+trace_print_graph_duration(unsigned long long duration, struct trace_seq *s)
  {
         unsigned long nsecs_rem = do_div(duration, 1000);
         /* log10(ULONG_MAX) + '\0' */
@@ -464,12 +470,23 @@ print_graph_duration(unsigned long long duration, struct trace_seq *s)
                 if (!ret)
                         return TRACE_TYPE_PARTIAL_LINE;
         }
+       return TRACE_TYPE_HANDLED;
+}
+
+static enum print_line_t
+print_graph_duration(unsigned long long duration, struct trace_seq *s)
+{
+       int ret;
+
+       ret = trace_print_graph_duration(duration, s);
+       if (ret != TRACE_TYPE_HANDLED)
+               return ret;
  
         ret = trace_seq_printf(s, "|  ");
         if (!ret)
                 return TRACE_TYPE_PARTIAL_LINE;
-       return TRACE_TYPE_HANDLED;
  
+       return TRACE_TYPE_HANDLED;
  }
  
  /* Case of a leaf function on its call entry */
diff --git a/kernel/trace/trace_hw_branches.c b/kernel/trace/trace_hw_branches.c

index 7bfdf4c2347f38251acb30e6d3ef3ad588c89e30..ca7d7c4d0c2aef41b74585996001d993ff4505e2 100644 (file)
--- a/kernel/trace/trace_hw_branches.c
+++ b/kernel/trace/trace_hw_branches.c
@@ -1,10 +1,9 @@
  /*
- * h/w branch tracer for x86 based on bts
+ * h/w branch tracer for x86 based on BTS
   *
   * Copyright (C) 2008-2009 Intel Corporation.
   * Markus Metzger <markus.t.metzger@gmail.com>, 2008-2009
   */
-#include <linux/spinlock.h>
  #include <linux/kallsyms.h>
  #include <linux/debugfs.h>
  #include <linux/ftrace.h>
@@ -15,110 +14,119 @@
  
  #include <asm/ds.h>
  
-#include "trace.h"
  #include "trace_output.h"
+#include "trace.h"
  
  
-#define SIZEOF_BTS (1 << 13)
+#define BTS_BUFFER_SIZE (1 << 13)
  
-/*
- * The tracer lock protects the below per-cpu tracer array.
- * It needs to be held to:
- * - start tracing on all cpus
- * - stop tracing on all cpus
- * - start tracing on a single hotplug cpu
- * - stop tracing on a single hotplug cpu
- * - read the trace from all cpus
- * - read the trace from a single cpu
- */
-static DEFINE_SPINLOCK(bts_tracer_lock);
  static DEFINE_PER_CPU(struct bts_tracer *, tracer);
-static DEFINE_PER_CPU(unsigned char[SIZEOF_BTS], buffer);
+static DEFINE_PER_CPU(unsigned char[BTS_BUFFER_SIZE], buffer);
  
  #define this_tracer per_cpu(tracer, smp_processor_id())
-#define this_buffer per_cpu(buffer, smp_processor_id())
  
-static int __read_mostly trace_hw_branches_enabled;
+static int trace_hw_branches_enabled __read_mostly;
+static int trace_hw_branches_suspended __read_mostly;
  static struct trace_array *hw_branch_trace __read_mostly;
  
  
-/*
- * Start tracing on the current cpu.
- * The argument is ignored.
- *
- * pre: bts_tracer_lock must be locked.
- */
-static void bts_trace_start_cpu(void *arg)
+static void bts_trace_init_cpu(int cpu)
  {
-       if (this_tracer)
-               ds_release_bts(this_tracer);
-
-       this_tracer =
-               ds_request_bts(/* task = */ NULL, this_buffer, SIZEOF_BTS,
-                              /* ovfl = */ NULL, /* th = */ (size_t)-1,
-                              BTS_KERNEL);
-       if (IS_ERR(this_tracer)) {
-               this_tracer = NULL;
-               return;
-       }
+       per_cpu(tracer, cpu) =
+               ds_request_bts_cpu(cpu, per_cpu(buffer, cpu), BTS_BUFFER_SIZE,
+                                  NULL, (size_t)-1, BTS_KERNEL);
+
+       if (IS_ERR(per_cpu(tracer, cpu)))
+               per_cpu(tracer, cpu) = NULL;
  }
  
-static void bts_trace_start(struct trace_array *tr)
+static int bts_trace_init(struct trace_array *tr)
  {
-       spin_lock(&bts_tracer_lock);
+       int cpu;
+
+       hw_branch_trace = tr;
+       trace_hw_branches_enabled = 0;
  
-       on_each_cpu(bts_trace_start_cpu, NULL, 1);
-       trace_hw_branches_enabled = 1;
+       get_online_cpus();
+       for_each_online_cpu(cpu) {
+               bts_trace_init_cpu(cpu);
  
-       spin_unlock(&bts_tracer_lock);
+               if (likely(per_cpu(tracer, cpu)))
+                       trace_hw_branches_enabled = 1;
+       }
+       trace_hw_branches_suspended = 0;
+       put_online_cpus();
+
+       /* If we could not enable tracing on a single cpu, we fail. */
+       return trace_hw_branches_enabled ? 0 : -EOPNOTSUPP;
  }
  
-/*
- * Stop tracing on the current cpu.
- * The argument is ignored.
- *
- * pre: bts_tracer_lock must be locked.
- */
-static void bts_trace_stop_cpu(void *arg)
+static void bts_trace_reset(struct trace_array *tr)
  {
-       if (this_tracer) {
-               ds_release_bts(this_tracer);
-               this_tracer = NULL;
+       int cpu;
+
+       get_online_cpus();
+       for_each_online_cpu(cpu) {
+               if (likely(per_cpu(tracer, cpu))) {
+                       ds_release_bts(per_cpu(tracer, cpu));
+                       per_cpu(tracer, cpu) = NULL;
+               }
         }
+       trace_hw_branches_enabled = 0;
+       trace_hw_branches_suspended = 0;
+       put_online_cpus();
  }
  
-static void bts_trace_stop(struct trace_array *tr)
+static void bts_trace_start(struct trace_array *tr)
  {
-       spin_lock(&bts_tracer_lock);
+       int cpu;
  
-       trace_hw_branches_enabled = 0;
-       on_each_cpu(bts_trace_stop_cpu, NULL, 1);
+       get_online_cpus();
+       for_each_online_cpu(cpu)
+               if (likely(per_cpu(tracer, cpu)))
+                       ds_resume_bts(per_cpu(tracer, cpu));
+       trace_hw_branches_suspended = 0;
+       put_online_cpus();
+}
  
-       spin_unlock(&bts_tracer_lock);
+static void bts_trace_stop(struct trace_array *tr)
+{
+       int cpu;
+
+       get_online_cpus();
+       for_each_online_cpu(cpu)
+               if (likely(per_cpu(tracer, cpu)))
+                       ds_suspend_bts(per_cpu(tracer, cpu));
+       trace_hw_branches_suspended = 1;
+       put_online_cpus();
  }
  
  static int __cpuinit bts_hotcpu_handler(struct notifier_block *nfb,
                                      unsigned long action, void *hcpu)
  {
-       unsigned int cpu = (unsigned long)hcpu;
-
-       spin_lock(&bts_tracer_lock);
-
-       if (!trace_hw_branches_enabled)
-               goto out;
+       int cpu = (long)hcpu;
  
         switch (action) {
         case CPU_ONLINE:
         case CPU_DOWN_FAILED:
-               smp_call_function_single(cpu, bts_trace_start_cpu, NULL, 1);
+               /* The notification is sent with interrupts enabled. */
+               if (trace_hw_branches_enabled) {
+                       bts_trace_init_cpu(cpu);
+
+                       if (trace_hw_branches_suspended &&
+                           likely(per_cpu(tracer, cpu)))
+                               ds_suspend_bts(per_cpu(tracer, cpu));
+               }
                 break;
+
         case CPU_DOWN_PREPARE:
-               smp_call_function_single(cpu, bts_trace_stop_cpu, NULL, 1);
-               break;
+               /* The notification is sent with interrupts enabled. */
+               if (likely(per_cpu(tracer, cpu))) {
+                       ds_release_bts(per_cpu(tracer, cpu));
+                       per_cpu(tracer, cpu) = NULL;
+               }
         }
  
- out:
-       spin_unlock(&bts_tracer_lock);
         return NOTIFY_DONE;
  }
  
@@ -126,20 +134,6 @@ static struct notifier_block bts_hotcpu_notifier __cpuinitdata = {
         .notifier_call = bts_hotcpu_handler
  };
  
-static int bts_trace_init(struct trace_array *tr)
-{
-       hw_branch_trace = tr;
-
-       bts_trace_start(tr);
-
-       return 0;
-}
-
-static void bts_trace_reset(struct trace_array *tr)
-{
-       bts_trace_stop(tr);
-}
-
  static void bts_trace_print_header(struct seq_file *m)
  {
         seq_puts(m, "# CPU#        TO  <-  FROM\n");
@@ -147,10 +141,10 @@ static void bts_trace_print_header(struct seq_file *m)
  
  static enum print_line_t bts_trace_print_line(struct trace_iterator *iter)
  {
+       unsigned long symflags = TRACE_ITER_SYM_OFFSET;
         struct trace_entry *entry = iter->ent;
         struct trace_seq *seq = &iter->seq;
         struct hw_branch_entry *it;
-       unsigned long symflags = TRACE_ITER_SYM_OFFSET;
  
         trace_assign_type(it, entry);
  
@@ -168,6 +162,7 @@ static enum print_line_t bts_trace_print_line(struct trace_iterator *iter)
  
  void trace_hw_branch(u64 from, u64 to)
  {
+       struct ftrace_event_call *call = &event_hw_branch;
         struct trace_array *tr = hw_branch_trace;
         struct ring_buffer_event *event;
         struct hw_branch_entry *entry;
@@ -194,7 +189,8 @@ void trace_hw_branch(u64 from, u64 to)
         entry->ent.type = TRACE_HW_BRANCHES;
         entry->from = from;
         entry->to   = to;
-       trace_buffer_unlock_commit(tr, event, 0, 0);
+       if (!filter_check_discard(call, entry, tr->buffer, event))
+               trace_buffer_unlock_commit(tr, event, 0, 0);
  
   out:
         atomic_dec(&tr->data[cpu]->disabled);
@@ -224,11 +220,11 @@ static void trace_bts_at(const struct bts_trace *trace, void *at)
  /*
   * Collect the trace on the current cpu and write it into the ftrace buffer.
   *
- * pre: bts_tracer_lock must be locked
+ * pre: tracing must be suspended on the current cpu
   */
  static void trace_bts_cpu(void *arg)
  {
-       struct trace_array *tr = (struct trace_array *) arg;
+       struct trace_array *tr = (struct trace_array *)arg;
         const struct bts_trace *trace;
         unsigned char *at;
  
@@ -241,10 +237,9 @@ static void trace_bts_cpu(void *arg)
         if (unlikely(!this_tracer))
                 return;
  
-       ds_suspend_bts(this_tracer);
         trace = ds_read_bts(this_tracer);
         if (!trace)
-               goto out;
+               return;
  
         for (at = trace->ds.top; (void *)at < trace->ds.end;
              at += trace->ds.size)
@@ -253,18 +248,27 @@ static void trace_bts_cpu(void *arg)
         for (at = trace->ds.begin; (void *)at < trace->ds.top;
              at += trace->ds.size)
                 trace_bts_at(trace, at);
-
-out:
-       ds_resume_bts(this_tracer);
  }
  
  static void trace_bts_prepare(struct trace_iterator *iter)
  {
-       spin_lock(&bts_tracer_lock);
+       int cpu;
  
+       get_online_cpus();
+       for_each_online_cpu(cpu)
+               if (likely(per_cpu(tracer, cpu)))
+                       ds_suspend_bts(per_cpu(tracer, cpu));
+       /*
+        * We need to collect the trace on the respective cpu since ftrace
+        * implicitly adds the record for the current cpu.
+        * Once that is more flexible, we could collect the data from any cpu.
+        */
         on_each_cpu(trace_bts_cpu, iter->tr, 1);
  
-       spin_unlock(&bts_tracer_lock);
+       for_each_online_cpu(cpu)
+               if (likely(per_cpu(tracer, cpu)))
+                       ds_resume_bts(per_cpu(tracer, cpu));
+       put_online_cpus();
  }
  
  static void trace_bts_close(struct trace_iterator *iter)
@@ -274,11 +278,11 @@ static void trace_bts_close(struct trace_iterator *iter)
  
  void trace_hw_branch_oops(void)
  {
-       spin_lock(&bts_tracer_lock);
-
-       trace_bts_cpu(hw_branch_trace);
-
-       spin_unlock(&bts_tracer_lock);
+       if (this_tracer) {
+               ds_suspend_bts_noirq(this_tracer);
+               trace_bts_cpu(hw_branch_trace);
+               ds_resume_bts_noirq(this_tracer);
+       }
  }
  
  struct tracer bts_tracer __read_mostly =
@@ -291,7 +295,10 @@ struct tracer bts_tracer __read_mostly =
         .start          = bts_trace_start,
         .stop           = bts_trace_stop,
         .open           = trace_bts_prepare,
-       .close          = trace_bts_close
+       .close          = trace_bts_close,
+#ifdef CONFIG_FTRACE_SELFTEST
+       .selftest       = trace_selftest_startup_hw_branches,
+#endif /* CONFIG_FTRACE_SELFTEST */
  };
  
  __init static int init_bts_trace(void)
diff --git a/kernel/trace/trace_mmiotrace.c b/kernel/trace/trace_mmiotrace.c

index 8e37fcddd8b49fdaf46e7cc145b4de0382b69b1e..d53b45ed080622933b659ca85774f6b13ae0538e 100644 (file)
--- a/kernel/trace/trace_mmiotrace.c
+++ b/kernel/trace/trace_mmiotrace.c
@@ -9,6 +9,8 @@
  #include <linux/kernel.h>
  #include <linux/mmiotrace.h>
  #include <linux/pci.h>
+#include <linux/time.h>
+
  #include <asm/atomic.h>
  
  #include "trace.h"
@@ -174,7 +176,7 @@ static enum print_line_t mmio_print_rw(struct trace_iterator *iter)
         struct mmiotrace_rw *rw;
         struct trace_seq *s     = &iter->seq;
         unsigned long long t    = ns2usecs(iter->ts);
-       unsigned long usec_rem  = do_div(t, 1000000ULL);
+       unsigned long usec_rem  = do_div(t, USEC_PER_SEC);
         unsigned secs           = (unsigned long)t;
         int ret = 1;
  
@@ -221,7 +223,7 @@ static enum print_line_t mmio_print_map(struct trace_iterator *iter)
         struct mmiotrace_map *m;
         struct trace_seq *s     = &iter->seq;
         unsigned long long t    = ns2usecs(iter->ts);
-       unsigned long usec_rem  = do_div(t, 1000000ULL);
+       unsigned long usec_rem  = do_div(t, USEC_PER_SEC);
         unsigned secs           = (unsigned long)t;
         int ret;
  
diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c

index 64b54a59c55b585ff3b979f216ba0451273ee6e4..7938f3ae93e3dbe9d898e037c47027c4908a8d94 100644 (file)
--- a/kernel/trace/trace_output.c
+++ b/kernel/trace/trace_output.c
@@ -14,11 +14,25 @@
  /* must be a power of 2 */
  #define EVENT_HASHSIZE 128
  
-static DEFINE_MUTEX(trace_event_mutex);
+DECLARE_RWSEM(trace_event_mutex);
+
+DEFINE_PER_CPU(struct trace_seq, ftrace_event_seq);
+EXPORT_PER_CPU_SYMBOL(ftrace_event_seq);
+
  static struct hlist_head event_hash[EVENT_HASHSIZE] __read_mostly;
  
  static int next_event_type = __TRACE_LAST_TYPE + 1;
  
+void trace_print_seq(struct seq_file *m, struct trace_seq *s)
+{
+       int len = s->len >= PAGE_SIZE ? PAGE_SIZE - 1 : s->len;
+
+       s->buffer[len] = 0;
+       seq_puts(m, s->buffer);
+
+       trace_seq_init(s);
+}
+
  enum print_line_t trace_print_bprintk_msg_only(struct trace_iterator *iter)
  {
         struct trace_seq *s = &iter->seq;
@@ -84,6 +98,39 @@ trace_seq_printf(struct trace_seq *s, const char *fmt, ...)
  
         return len;
  }
+EXPORT_SYMBOL_GPL(trace_seq_printf);
+
+/**
+ * trace_seq_vprintf - sequence printing of trace information
+ * @s: trace sequence descriptor
+ * @fmt: printf format string
+ *
+ * The tracer may use either sequence operations or its own
+ * copy to user routines. To simplify formating of a trace
+ * trace_seq_printf is used to store strings into a special
+ * buffer (@s). Then the output may be either used by
+ * the sequencer or pulled into another buffer.
+ */
+int
+trace_seq_vprintf(struct trace_seq *s, const char *fmt, va_list args)
+{
+       int len = (PAGE_SIZE - 1) - s->len;
+       int ret;
+
+       if (!len)
+               return 0;
+
+       ret = vsnprintf(s->buffer + s->len, len, fmt, args);
+
+       /* If we can't write it all, don't bother writing anything */
+       if (ret >= len)
+               return 0;
+
+       s->len += ret;
+
+       return len;
+}
+EXPORT_SYMBOL_GPL(trace_seq_vprintf);
  
  int trace_seq_bprintf(struct trace_seq *s, const char *fmt, const u32 *binary)
  {
@@ -201,6 +248,67 @@ int trace_seq_path(struct trace_seq *s, struct path *path)
         return 0;
  }
  
+const char *
+ftrace_print_flags_seq(struct trace_seq *p, const char *delim,
+                      unsigned long flags,
+                      const struct trace_print_flags *flag_array)
+{
+       unsigned long mask;
+       const char *str;
+       const char *ret = p->buffer + p->len;
+       int i;
+
+       for (i = 0;  flag_array[i].name && flags; i++) {
+
+               mask = flag_array[i].mask;
+               if ((flags & mask) != mask)
+                       continue;
+
+               str = flag_array[i].name;
+               flags &= ~mask;
+               if (p->len && delim)
+                       trace_seq_puts(p, delim);
+               trace_seq_puts(p, str);
+       }
+
+       /* check for left over flags */
+       if (flags) {
+               if (p->len && delim)
+                       trace_seq_puts(p, delim);
+               trace_seq_printf(p, "0x%lx", flags);
+       }
+
+       trace_seq_putc(p, 0);
+
+       return ret;
+}
+EXPORT_SYMBOL(ftrace_print_flags_seq);
+
+const char *
+ftrace_print_symbols_seq(struct trace_seq *p, unsigned long val,
+                        const struct trace_print_flags *symbol_array)
+{
+       int i;
+       const char *ret = p->buffer + p->len;
+
+       for (i = 0;  symbol_array[i].name; i++) {
+
+               if (val != symbol_array[i].mask)
+                       continue;
+
+               trace_seq_puts(p, symbol_array[i].name);
+               break;
+       }
+
+       if (!p->len)
+               trace_seq_printf(p, "0x%lx", val);
+               
+       trace_seq_putc(p, 0);
+
+       return ret;
+}
+EXPORT_SYMBOL(ftrace_print_symbols_seq);
+
  #ifdef CONFIG_KRETPROBES
  static inline const char *kretprobed(const char *name)
  {
@@ -311,17 +419,20 @@ seq_print_userip_objs(const struct userstack_entry *entry, struct trace_seq *s,
  
                 if (ip == ULONG_MAX || !ret)
                         break;
-               if (i && ret)
-                       ret = trace_seq_puts(s, " <- ");
+               if (ret)
+                       ret = trace_seq_puts(s, " => ");
                 if (!ip) {
                         if (ret)
                                 ret = trace_seq_puts(s, "??");
+                       if (ret)
+                               ret = trace_seq_puts(s, "\n");
                         continue;
                 }
                 if (!ret)
                         break;
                 if (ret)
                         ret = seq_print_user_ip(s, mm, ip, sym_flags);
+               ret = trace_seq_puts(s, "\n");
         }
  
         if (mm)
@@ -455,6 +566,7 @@ static int task_state_char(unsigned long state)
   * @type: the type of event to look for
   *
   * Returns an event of type @type otherwise NULL
+ * Called with trace_event_read_lock() held.
   */
  struct trace_event *ftrace_find_event(int type)
  {
@@ -464,7 +576,7 @@ struct trace_event *ftrace_find_event(int type)
  
         key = type & (EVENT_HASHSIZE - 1);
  
-       hlist_for_each_entry_rcu(event, n, &event_hash[key], node) {
+       hlist_for_each_entry(event, n, &event_hash[key], node) {
                 if (event->type == type)
                         return event;
         }
@@ -472,6 +584,46 @@ struct trace_event *ftrace_find_event(int type)
         return NULL;
  }
  
+static LIST_HEAD(ftrace_event_list);
+
+static int trace_search_list(struct list_head **list)
+{
+       struct trace_event *e;
+       int last = __TRACE_LAST_TYPE;
+
+       if (list_empty(&ftrace_event_list)) {
+               *list = &ftrace_event_list;
+               return last + 1;
+       }
+
+       /*
+        * We used up all possible max events,
+        * lets see if somebody freed one.
+        */
+       list_for_each_entry(e, &ftrace_event_list, list) {
+               if (e->type != last + 1)
+                       break;
+               last++;
+       }
+
+       /* Did we used up all 65 thousand events??? */
+       if ((last + 1) > FTRACE_MAX_EVENT)
+               return 0;
+
+       *list = &e->list;
+       return last + 1;
+}
+
+void trace_event_read_lock(void)
+{
+       down_read(&trace_event_mutex);
+}
+
+void trace_event_read_unlock(void)
+{
+       up_read(&trace_event_mutex);
+}
+
  /**
   * register_ftrace_event - register output for an event type
   * @event: the event type to register
@@ -492,22 +644,42 @@ int register_ftrace_event(struct trace_event *event)
         unsigned key;
         int ret = 0;
  
-       mutex_lock(&trace_event_mutex);
+       down_write(&trace_event_mutex);
  
-       if (!event) {
-               ret = next_event_type++;
+       if (WARN_ON(!event))
                 goto out;
-       }
  
-       if (!event->type)
-               event->type = next_event_type++;
-       else if (event->type > __TRACE_LAST_TYPE) {
+       INIT_LIST_HEAD(&event->list);
+
+       if (!event->type) {
+               struct list_head *list = NULL;
+
+               if (next_event_type > FTRACE_MAX_EVENT) {
+
+                       event->type = trace_search_list(&list);
+                       if (!event->type)
+                               goto out;
+
+               } else {
+                       
+                       event->type = next_event_type++;
+                       list = &ftrace_event_list;
+               }
+
+               if (WARN_ON(ftrace_find_event(event->type)))
+                       goto out;
+
+               list_add_tail(&event->list, list);
+
+       } else if (event->type > __TRACE_LAST_TYPE) {
                 printk(KERN_WARNING "Need to add type to trace.h\n");
                 WARN_ON(1);
-       }
-
-       if (ftrace_find_event(event->type))
                 goto out;
+       } else {
+               /* Is this event already used */
+               if (ftrace_find_event(event->type))
+                       goto out;
+       }
  
         if (event->trace == NULL)
                 event->trace = trace_nop_print;
@@ -520,14 +692,25 @@ int register_ftrace_event(struct trace_event *event)
  
         key = event->type & (EVENT_HASHSIZE - 1);
  
-       hlist_add_head_rcu(&event->node, &event_hash[key]);
+       hlist_add_head(&event->node, &event_hash[key]);
  
         ret = event->type;
   out:
-       mutex_unlock(&trace_event_mutex);
+       up_write(&trace_event_mutex);
  
         return ret;
  }
+EXPORT_SYMBOL_GPL(register_ftrace_event);
+
+/*
+ * Used by module code with the trace_event_mutex held for write.
+ */
+int __unregister_ftrace_event(struct trace_event *event)
+{
+       hlist_del(&event->node);
+       list_del(&event->list);
+       return 0;
+}
  
  /**
   * unregister_ftrace_event - remove a no longer used event
@@ -535,12 +718,13 @@ int register_ftrace_event(struct trace_event *event)
   */
  int unregister_ftrace_event(struct trace_event *event)
  {
-       mutex_lock(&trace_event_mutex);
-       hlist_del(&event->node);
-       mutex_unlock(&trace_event_mutex);
+       down_write(&trace_event_mutex);
+       __unregister_ftrace_event(event);
+       up_write(&trace_event_mutex);
  
         return 0;
  }
+EXPORT_SYMBOL_GPL(unregister_ftrace_event);
  
  /*
   * Standard events
@@ -833,14 +1017,16 @@ static enum print_line_t trace_stack_print(struct trace_iterator *iter,
  
         trace_assign_type(field, iter->ent);
  
+       if (!trace_seq_puts(s, "<stack trace>\n"))
+               goto partial;
         for (i = 0; i < FTRACE_STACK_ENTRIES; i++) {
-               if (i) {
-                       if (!trace_seq_puts(s, " <= "))
-                               goto partial;
+               if (!field->caller[i] || (field->caller[i] == ULONG_MAX))
+                       break;
+               if (!trace_seq_puts(s, " => "))
+                       goto partial;
  
-                       if (!seq_print_ip_sym(s, field->caller[i], flags))
-                               goto partial;
-               }
+               if (!seq_print_ip_sym(s, field->caller[i], flags))
+                       goto partial;
                 if (!trace_seq_puts(s, "\n"))
                         goto partial;
         }
@@ -868,10 +1054,10 @@ static enum print_line_t trace_user_stack_print(struct trace_iterator *iter,
  
         trace_assign_type(field, iter->ent);
  
-       if (!seq_print_userip_objs(field, s, flags))
+       if (!trace_seq_puts(s, "<user stack trace>\n"))
                 goto partial;
  
-       if (!trace_seq_putc(s, '\n'))
+       if (!seq_print_userip_objs(field, s, flags))
                 goto partial;
  
         return TRACE_TYPE_HANDLED;
diff --git a/kernel/trace/trace_output.h b/kernel/trace/trace_output.h

index e0bde39c2dd9b00ab8ac5388aa0b5168fe5f37d2..d38bec4a9c3081e52bca0dec51e04e94827cba67 100644 (file)
--- a/kernel/trace/trace_output.h
+++ b/kernel/trace/trace_output.h
@@ -1,41 +1,17 @@
  #ifndef __TRACE_EVENTS_H
  #define __TRACE_EVENTS_H
  
+#include <linux/trace_seq.h>
  #include "trace.h"
  
-typedef enum print_line_t (*trace_print_func)(struct trace_iterator *iter,
-                                             int flags);
-
-struct trace_event {
-       struct hlist_node       node;
-       int                     type;
-       trace_print_func        trace;
-       trace_print_func        raw;
-       trace_print_func        hex;
-       trace_print_func        binary;
-};
-
  extern enum print_line_t
  trace_print_bprintk_msg_only(struct trace_iterator *iter);
  extern enum print_line_t
  trace_print_printk_msg_only(struct trace_iterator *iter);
  
-extern int trace_seq_printf(struct trace_seq *s, const char *fmt, ...)
-       __attribute__ ((format (printf, 2, 3)));
-extern int
-trace_seq_bprintf(struct trace_seq *s, const char *fmt, const u32 *binary);
  extern int
  seq_print_ip_sym(struct trace_seq *s, unsigned long ip,
                 unsigned long sym_flags);
-extern ssize_t trace_seq_to_user(struct trace_seq *s, char __user *ubuf,
-                                size_t cnt);
-extern int trace_seq_puts(struct trace_seq *s, const char *str);
-extern int trace_seq_putc(struct trace_seq *s, unsigned char c);
-extern int trace_seq_putmem(struct trace_seq *s, const void *mem, size_t len);
-extern int trace_seq_putmem_hex(struct trace_seq *s, const void *mem,
-                               size_t len);
-extern void *trace_seq_reserve(struct trace_seq *s, size_t len);
-extern int trace_seq_path(struct trace_seq *s, struct path *path);
  extern int seq_print_userip_objs(const struct userstack_entry *entry,
                                  struct trace_seq *s, unsigned long sym_flags);
  extern int seq_print_user_ip(struct trace_seq *s, struct mm_struct *mm,
@@ -44,13 +20,17 @@ extern int seq_print_user_ip(struct trace_seq *s, struct mm_struct *mm,
  extern int trace_print_context(struct trace_iterator *iter);
  extern int trace_print_lat_context(struct trace_iterator *iter);
  
+extern void trace_event_read_lock(void);
+extern void trace_event_read_unlock(void);
  extern struct trace_event *ftrace_find_event(int type);
-extern int register_ftrace_event(struct trace_event *event);
-extern int unregister_ftrace_event(struct trace_event *event);
  
  extern enum print_line_t trace_nop_print(struct trace_iterator *iter,
                                          int flags);
  
+/* used by module unregistering */
+extern int __unregister_ftrace_event(struct trace_event *event);
+extern struct rw_semaphore trace_event_mutex;
+
  #define MAX_MEMHEX_BYTES       8
  #define HEX_CHARS              (MAX_MEMHEX_BYTES*2 + 1)
  
diff --git a/kernel/trace/trace_power.c b/kernel/trace/trace_power.c

index 118439709fb771f4fa2ad576454e0b64d1274144..8a30d9874cd430d507c3f9e997610430f1bcb894 100644 (file)
--- a/kernel/trace/trace_power.c
+++ b/kernel/trace/trace_power.c
@@ -36,6 +36,7 @@ static void probe_power_start(struct power_trace *it, unsigned int type,
  
  static void probe_power_end(struct power_trace *it)
  {
+       struct ftrace_event_call *call = &event_power;
         struct ring_buffer_event *event;
         struct trace_power *entry;
         struct trace_array_cpu *data;
@@ -54,7 +55,8 @@ static void probe_power_end(struct power_trace *it)
                 goto out;
         entry   = ring_buffer_event_data(event);
         entry->state_data = *it;
-       trace_buffer_unlock_commit(tr, event, 0, 0);
+       if (!filter_check_discard(call, entry, tr->buffer, event))
+               trace_buffer_unlock_commit(tr, event, 0, 0);
   out:
         preempt_enable();
  }
@@ -62,6 +64,7 @@ static void probe_power_end(struct power_trace *it)
  static void probe_power_mark(struct power_trace *it, unsigned int type,
                                 unsigned int level)
  {
+       struct ftrace_event_call *call = &event_power;
         struct ring_buffer_event *event;
         struct trace_power *entry;
         struct trace_array_cpu *data;
@@ -84,7 +87,8 @@ static void probe_power_mark(struct power_trace *it, unsigned int type,
                 goto out;
         entry   = ring_buffer_event_data(event);
         entry->state_data = *it;
-       trace_buffer_unlock_commit(tr, event, 0, 0);
+       if (!filter_check_discard(call, entry, tr->buffer, event))
+               trace_buffer_unlock_commit(tr, event, 0, 0);
   out:
         preempt_enable();
  }
diff --git a/kernel/trace/trace_printk.c b/kernel/trace/trace_printk.c

index eb81556107fec71dc8ba53278df9264862df0f3b..9bece9687b62a8d2ab6b59199017ef0ad7898b98 100644 (file)
--- a/kernel/trace/trace_printk.c
+++ b/kernel/trace/trace_printk.c
@@ -245,17 +245,13 @@ static const struct file_operations ftrace_formats_fops = {
  static __init int init_trace_printk_function_export(void)
  {
         struct dentry *d_tracer;
-       struct dentry *entry;
  
         d_tracer = tracing_init_dentry();
         if (!d_tracer)
                 return 0;
  
-       entry = debugfs_create_file("printk_formats", 0444, d_tracer,
+       trace_create_file("printk_formats", 0444, d_tracer,
                                     NULL, &ftrace_formats_fops);
-       if (!entry)
-               pr_warning("Could not create debugfs "
-                          "'printk_formats' entry\n");
  
         return 0;
  }
diff --git a/kernel/trace/trace_sched_switch.c b/kernel/trace/trace_sched_switch.c

index 9117cea6f1ae78f17f2be7fc9894aa9b0abe2075..a98106dd979cfd7bd305cfab5a506fec61e95e46 100644 (file)
--- a/kernel/trace/trace_sched_switch.c
+++ b/kernel/trace/trace_sched_switch.c
@@ -10,7 +10,7 @@
  #include <linux/kallsyms.h>
  #include <linux/uaccess.h>
  #include <linux/ftrace.h>
-#include <trace/sched.h>
+#include <trace/events/sched.h>
  
  #include "trace.h"
  
@@ -29,13 +29,13 @@ probe_sched_switch(struct rq *__rq, struct task_struct *prev,
         int cpu;
         int pc;
  
-       if (!sched_ref || sched_stopped)
+       if (unlikely(!sched_ref))
                 return;
  
         tracing_record_cmdline(prev);
         tracing_record_cmdline(next);
  
-       if (!tracer_enabled)
+       if (!tracer_enabled || sched_stopped)
                 return;
  
         pc = preempt_count();
@@ -56,15 +56,15 @@ probe_sched_wakeup(struct rq *__rq, struct task_struct *wakee, int success)
         unsigned long flags;
         int cpu, pc;
  
-       if (!likely(tracer_enabled))
+       if (unlikely(!sched_ref))
                 return;
  
-       pc = preempt_count();
         tracing_record_cmdline(current);
  
-       if (sched_stopped)
+       if (!tracer_enabled || sched_stopped)
                 return;
  
+       pc = preempt_count();
         local_irq_save(flags);
         cpu = raw_smp_processor_id();
         data = ctx_trace->data[cpu];
diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c

index 5bc00e8f153ebb8682589caa37b553b05800f6b0..eacb272251736276335a02d04a83b856ec550dfb 100644 (file)
--- a/kernel/trace/trace_sched_wakeup.c
+++ b/kernel/trace/trace_sched_wakeup.c
@@ -15,7 +15,7 @@
  #include <linux/kallsyms.h>
  #include <linux/uaccess.h>
  #include <linux/ftrace.h>
-#include <trace/sched.h>
+#include <trace/events/sched.h>
  
  #include "trace.h"
  
@@ -138,9 +138,6 @@ probe_wakeup_sched_switch(struct rq *rq, struct task_struct *prev,
  
         pc = preempt_count();
  
-       /* The task we are waiting for is waking up */
-       data = wakeup_trace->data[wakeup_cpu];
-
         /* disable local data, not wakeup_cpu data */
         cpu = raw_smp_processor_id();
         disabled = atomic_inc_return(&wakeup_trace->data[cpu]->disabled);
@@ -154,6 +151,9 @@ probe_wakeup_sched_switch(struct rq *rq, struct task_struct *prev,
         if (unlikely(!tracer_enabled || next != wakeup_task))
                 goto out_unlock;
  
+       /* The task we are waiting for is waking up */
+       data = wakeup_trace->data[wakeup_cpu];
+
         trace_function(wakeup_trace, CALLER_ADDR0, CALLER_ADDR1, flags, pc);
         tracing_sched_switch_trace(wakeup_trace, prev, next, flags, pc);
  
diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c

index 08f4eb2763d141bbae33c518a4de1e15d1c3614a..00dd6485bdd7e7abf390a6fcbe1c27d626a8531e 100644 (file)
--- a/kernel/trace/trace_selftest.c
+++ b/kernel/trace/trace_selftest.c
@@ -16,6 +16,7 @@ static inline int trace_valid_entry(struct trace_entry *entry)
         case TRACE_BRANCH:
         case TRACE_GRAPH_ENT:
         case TRACE_GRAPH_RET:
+       case TRACE_HW_BRANCHES:
                 return 1;
         }
         return 0;
@@ -188,6 +189,7 @@ int trace_selftest_startup_dynamic_tracing(struct tracer *trace,
  #else
  # define trace_selftest_startup_dynamic_tracing(trace, tr, func) ({ 0; })
  #endif /* CONFIG_DYNAMIC_FTRACE */
+
  /*
   * Simple verification test of ftrace function tracer.
   * Enable ftrace, sleep 1/10 second, and then read the trace
@@ -749,3 +751,59 @@ trace_selftest_startup_branch(struct tracer *trace, struct trace_array *tr)
         return ret;
  }
  #endif /* CONFIG_BRANCH_TRACER */
+
+#ifdef CONFIG_HW_BRANCH_TRACER
+int
+trace_selftest_startup_hw_branches(struct tracer *trace,
+                                  struct trace_array *tr)
+{
+       struct trace_iterator *iter;
+       struct tracer tracer;
+       unsigned long count;
+       int ret;
+
+       if (!trace->open) {
+               printk(KERN_CONT "missing open function...");
+               return -1;
+       }
+
+       ret = tracer_init(trace, tr);
+       if (ret) {
+               warn_failed_init_tracer(trace, ret);
+               return ret;
+       }
+
+       /*
+        * The hw-branch tracer needs to collect the trace from the various
+        * cpu trace buffers - before tracing is stopped.
+        */
+       iter = kzalloc(sizeof(*iter), GFP_KERNEL);
+       if (!iter)
+               return -ENOMEM;
+
+       memcpy(&tracer, trace, sizeof(tracer));
+
+       iter->trace = &tracer;
+       iter->tr = tr;
+       iter->pos = -1;
+       mutex_init(&iter->mutex);
+
+       trace->open(iter);
+
+       mutex_destroy(&iter->mutex);
+       kfree(iter);
+
+       tracing_stop();
+
+       ret = trace_test_buffer(tr, &count);
+       trace->reset(tr);
+       tracing_start();
+
+       if (!ret && !count) {
+               printk(KERN_CONT "no entries found..");
+               ret = -1;
+       }
+
+       return ret;
+}
+#endif /* CONFIG_HW_BRANCH_TRACER */
diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c

index c750f65f9661531273ee16553623b405268fc758..2d7aebd71dbd4e3489b8bf59fd29585b422be8e9 100644 (file)
--- a/kernel/trace/trace_stack.c
+++ b/kernel/trace/trace_stack.c
@@ -265,7 +265,7 @@ static int t_show(struct seq_file *m, void *v)
                 seq_printf(m, "        Depth    Size   Location"
                            "    (%d entries)\n"
                            "        -----    ----   --------\n",
-                          max_stack_trace.nr_entries);
+                          max_stack_trace.nr_entries - 1);
  
                 if (!stack_tracer_enabled && !max_stack_size)
                         print_disabled(m);
@@ -352,19 +352,14 @@ __setup("stacktrace", enable_stacktrace);
  static __init int stack_trace_init(void)
  {
         struct dentry *d_tracer;
-       struct dentry *entry;
  
         d_tracer = tracing_init_dentry();
  
-       entry = debugfs_create_file("stack_max_size", 0644, d_tracer,
-                                   &max_stack_size, &stack_max_size_fops);
-       if (!entry)
-               pr_warning("Could not create debugfs 'stack_max_size' entry\n");
+       trace_create_file("stack_max_size", 0644, d_tracer,
+                       &max_stack_size, &stack_max_size_fops);
  
-       entry = debugfs_create_file("stack_trace", 0444, d_tracer,
-                                   NULL, &stack_trace_fops);
-       if (!entry)
-               pr_warning("Could not create debugfs 'stack_trace' entry\n");
+       trace_create_file("stack_trace", 0444, d_tracer,
+                       NULL, &stack_trace_fops);
  
         if (stack_tracer_enabled)
                 register_ftrace_function(&trace_ops);
diff --git a/kernel/trace/trace_stat.c b/kernel/trace/trace_stat.c

index acdebd771a93b9623e57b8ed922c988524254ba7..c00643733f4ccca7be15b22ef311ec03711edc48 100644 (file)
--- a/kernel/trace/trace_stat.c
+++ b/kernel/trace/trace_stat.c
@@ -1,7 +1,7 @@
  /*
   * Infrastructure for statistic tracing (histogram output).
   *
- * Copyright (C) 2008 Frederic Weisbecker <fweisbec@gmail.com>
+ * Copyright (C) 2008-2009 Frederic Weisbecker <fweisbec@gmail.com>
   *
   * Based on the code from trace_branch.c which is
   * Copyright (C) 2008 Steven Rostedt <srostedt@redhat.com>
@@ -10,22 +10,27 @@
  
  
  #include <linux/list.h>
+#include <linux/rbtree.h>
  #include <linux/debugfs.h>
  #include "trace_stat.h"
  #include "trace.h"
  
  
-/* List of stat entries from a tracer */
-struct trace_stat_list {
-       struct list_head        list;
+/*
+ * List of stat red-black nodes from a tracer
+ * We use a such tree to sort quickly the stat
+ * entries from the tracer.
+ */
+struct stat_node {
+       struct rb_node          node;
         void                    *stat;
  };
  
  /* A stat session is the stats output in one file */
-struct tracer_stat_session {
+struct stat_session {
         struct list_head        session_list;
         struct tracer_stat      *ts;
-       struct list_head        stat_list;
+       struct rb_root          stat_root;
         struct mutex            stat_mutex;
         struct dentry           *file;
  };
@@ -37,18 +42,48 @@ static DEFINE_MUTEX(all_stat_sessions_mutex);
  /* The root directory for all stat files */
  static struct dentry           *stat_dir;
  
+/*
+ * Iterate through the rbtree using a post order traversal path
+ * to release the next node.
+ * It won't necessary release one at each iteration
+ * but it will at least advance closer to the next one
+ * to be released.
+ */
+static struct rb_node *release_next(struct rb_node *node)
+{
+       struct stat_node *snode;
+       struct rb_node *parent = rb_parent(node);
+
+       if (node->rb_left)
+               return node->rb_left;
+       else if (node->rb_right)
+               return node->rb_right;
+       else {
+               if (!parent)
+                       ;
+               else if (parent->rb_left == node)
+                       parent->rb_left = NULL;
+               else
+                       parent->rb_right = NULL;
+
+               snode = container_of(node, struct stat_node, node);
+               kfree(snode);
+
+               return parent;
+       }
+}
  
-static void reset_stat_session(struct tracer_stat_session *session)
+static void reset_stat_session(struct stat_session *session)
  {
-       struct trace_stat_list *node, *next;
+       struct rb_node *node = session->stat_root.rb_node;
  
-       list_for_each_entry_safe(node, next, &session->stat_list, list)
-               kfree(node);
+       while (node)
+               node = release_next(node);
  
-       INIT_LIST_HEAD(&session->stat_list);
+       session->stat_root = RB_ROOT;
  }
  
-static void destroy_session(struct tracer_stat_session *session)
+static void destroy_session(struct stat_session *session)
  {
         debugfs_remove(session->file);
         reset_stat_session(session);
@@ -56,25 +91,60 @@ static void destroy_session(struct tracer_stat_session *session)
         kfree(session);
  }
  
+typedef int (*cmp_stat_t)(void *, void *);
+
+static int insert_stat(struct rb_root *root, void *stat, cmp_stat_t cmp)
+{
+       struct rb_node **new = &(root->rb_node), *parent = NULL;
+       struct stat_node *data;
+
+       data = kzalloc(sizeof(*data), GFP_KERNEL);
+       if (!data)
+               return -ENOMEM;
+       data->stat = stat;
+
+       /*
+        * Figure out where to put new node
+        * This is a descendent sorting
+        */
+       while (*new) {
+               struct stat_node *this;
+               int result;
+
+               this = container_of(*new, struct stat_node, node);
+               result = cmp(data->stat, this->stat);
+
+               parent = *new;
+               if (result >= 0)
+                       new = &((*new)->rb_left);
+               else
+                       new = &((*new)->rb_right);
+       }
+
+       rb_link_node(&data->node, parent, new);
+       rb_insert_color(&data->node, root);
+       return 0;
+}
+
  /*
   * For tracers that don't provide a stat_cmp callback.
- * This one will force an immediate insertion on tail of
- * the list.
+ * This one will force an insertion as right-most node
+ * in the rbtree.
   */
  static int dummy_cmp(void *p1, void *p2)
  {
-       return 1;
+       return -1;
  }
  
  /*
- * Initialize the stat list at each trace_stat file opening.
+ * Initialize the stat rbtree at each trace_stat file opening.
   * All of these copies and sorting are required on all opening
   * since the stats could have changed between two file sessions.
   */
-static int stat_seq_init(struct tracer_stat_session *session)
+static int stat_seq_init(struct stat_session *session)
  {
-       struct trace_stat_list *iter_entry, *new_entry;
         struct tracer_stat *ts = session->ts;
+       struct rb_root *root = &session->stat_root;
         void *stat;
         int ret = 0;
         int i;
@@ -85,29 +155,16 @@ static int stat_seq_init(struct tracer_stat_session *session)
         if (!ts->stat_cmp)
                 ts->stat_cmp = dummy_cmp;
  
-       stat = ts->stat_start();
+       stat = ts->stat_start(ts);
         if (!stat)
                 goto exit;
  
-       /*
-        * The first entry. Actually this is the second, but the first
-        * one (the stat_list head) is pointless.
-        */
-       new_entry = kmalloc(sizeof(struct trace_stat_list), GFP_KERNEL);
-       if (!new_entry) {
-               ret = -ENOMEM;
+       ret = insert_stat(root, stat, ts->stat_cmp);
+       if (ret)
                 goto exit;
-       }
-
-       INIT_LIST_HEAD(&new_entry->list);
-
-       list_add(&new_entry->list, &session->stat_list);
-
-       new_entry->stat = stat;
  
         /*
-        * Iterate over the tracer stat entries and store them in a sorted
-        * list.
+        * Iterate over the tracer stat entries and store them in an rbtree.
          */
         for (i = 1; ; i++) {
                 stat = ts->stat_next(stat, i);
@@ -116,36 +173,16 @@ static int stat_seq_init(struct tracer_stat_session *session)
                 if (!stat)
                         break;
  
-               new_entry = kmalloc(sizeof(struct trace_stat_list), GFP_KERNEL);
-               if (!new_entry) {
-                       ret = -ENOMEM;
-                       goto exit_free_list;
-               }
-
-               INIT_LIST_HEAD(&new_entry->list);
-               new_entry->stat = stat;
-
-               list_for_each_entry_reverse(iter_entry, &session->stat_list,
-                               list) {
-
-                       /* Insertion with a descendent sorting */
-                       if (ts->stat_cmp(iter_entry->stat,
-                                       new_entry->stat) >= 0) {
-
-                               list_add(&new_entry->list, &iter_entry->list);
-                               break;
-                       }
-               }
-
-               /* The current larger value */
-               if (list_empty(&new_entry->list))
-                       list_add(&new_entry->list, &session->stat_list);
+               ret = insert_stat(root, stat, ts->stat_cmp);
+               if (ret)
+                       goto exit_free_rbtree;
         }
+
  exit:
         mutex_unlock(&session->stat_mutex);
         return ret;
  
-exit_free_list:
+exit_free_rbtree:
         reset_stat_session(session);
         mutex_unlock(&session->stat_mutex);
         return ret;
@@ -154,38 +191,51 @@ exit_free_list:
  
  static void *stat_seq_start(struct seq_file *s, loff_t *pos)
  {
-       struct tracer_stat_session *session = s->private;
+       struct stat_session *session = s->private;
+       struct rb_node *node;
+       int i;
  
-       /* Prevent from tracer switch or stat_list modification */
+       /* Prevent from tracer switch or rbtree modification */
         mutex_lock(&session->stat_mutex);
  
         /* If we are in the beginning of the file, print the headers */
-       if (!*pos && session->ts->stat_headers)
+       if (!*pos && session->ts->stat_headers) {
+               (*pos)++;
                 return SEQ_START_TOKEN;
+       }
  
-       return seq_list_start(&session->stat_list, *pos);
+       node = rb_first(&session->stat_root);
+       for (i = 0; node && i < *pos; i++)
+               node = rb_next(node);
+
+       (*pos)++;
+
+       return node;
  }
  
  static void *stat_seq_next(struct seq_file *s, void *p, loff_t *pos)
  {
-       struct tracer_stat_session *session = s->private;
+       struct stat_session *session = s->private;
+       struct rb_node *node = p;
+
+       (*pos)++;
  
         if (p == SEQ_START_TOKEN)
-               return seq_list_start(&session->stat_list, *pos);
+               return rb_first(&session->stat_root);
  
-       return seq_list_next(p, &session->stat_list, pos);
+       return rb_next(node);
  }
  
  static void stat_seq_stop(struct seq_file *s, void *p)
  {
-       struct tracer_stat_session *session = s->private;
+       struct stat_session *session = s->private;
         mutex_unlock(&session->stat_mutex);
  }
  
  static int stat_seq_show(struct seq_file *s, void *v)
  {
-       struct tracer_stat_session *session = s->private;
-       struct trace_stat_list *l = list_entry(v, struct trace_stat_list, list);
+       struct stat_session *session = s->private;
+       struct stat_node *l = container_of(v, struct stat_node, node);
  
         if (v == SEQ_START_TOKEN)
                 return session->ts->stat_headers(s);
@@ -205,7 +255,7 @@ static int tracing_stat_open(struct inode *inode, struct file *file)
  {
         int ret;
  
-       struct tracer_stat_session *session = inode->i_private;
+       struct stat_session *session = inode->i_private;
  
         ret = seq_open(file, &trace_stat_seq_ops);
         if (!ret) {
@@ -218,11 +268,11 @@ static int tracing_stat_open(struct inode *inode, struct file *file)
  }
  
  /*
- * Avoid consuming memory with our now useless list.
+ * Avoid consuming memory with our now useless rbtree.
   */
  static int tracing_stat_release(struct inode *i, struct file *f)
  {
-       struct tracer_stat_session *session = i->i_private;
+       struct stat_session *session = i->i_private;
  
         mutex_lock(&session->stat_mutex);
         reset_stat_session(session);
@@ -251,7 +301,7 @@ static int tracing_stat_init(void)
         return 0;
  }
  
-static int init_stat_file(struct tracer_stat_session *session)
+static int init_stat_file(struct stat_session *session)
  {
         if (!stat_dir && tracing_stat_init())
                 return -ENODEV;
@@ -266,7 +316,7 @@ static int init_stat_file(struct tracer_stat_session *session)
  
  int register_stat_tracer(struct tracer_stat *trace)
  {
-       struct tracer_stat_session *session, *node, *tmp;
+       struct stat_session *session, *node;
         int ret;
  
         if (!trace)
@@ -277,7 +327,7 @@ int register_stat_tracer(struct tracer_stat *trace)
  
         /* Already registered? */
         mutex_lock(&all_stat_sessions_mutex);
-       list_for_each_entry_safe(node, tmp, &all_stat_sessions, session_list) {
+       list_for_each_entry(node, &all_stat_sessions, session_list) {
                 if (node->ts == trace) {
                         mutex_unlock(&all_stat_sessions_mutex);
                         return -EINVAL;
@@ -286,15 +336,13 @@ int register_stat_tracer(struct tracer_stat *trace)
         mutex_unlock(&all_stat_sessions_mutex);
  
         /* Init the session */
-       session = kmalloc(sizeof(struct tracer_stat_session), GFP_KERNEL);
+       session = kzalloc(sizeof(*session), GFP_KERNEL);
         if (!session)
                 return -ENOMEM;
  
         session->ts = trace;
         INIT_LIST_HEAD(&session->session_list);
-       INIT_LIST_HEAD(&session->stat_list);
         mutex_init(&session->stat_mutex);
-       session->file = NULL;
  
         ret = init_stat_file(session);
         if (ret) {
@@ -312,7 +360,7 @@ int register_stat_tracer(struct tracer_stat *trace)
  
  void unregister_stat_tracer(struct tracer_stat *trace)
  {
-       struct tracer_stat_session *node, *tmp;
+       struct stat_session *node, *tmp;
  
         mutex_lock(&all_stat_sessions_mutex);
         list_for_each_entry_safe(node, tmp, &all_stat_sessions, session_list) {
diff --git a/kernel/trace/trace_stat.h b/kernel/trace/trace_stat.h

index 202274cf7f3d06d1145437ab8cd27d43eaf90e04..f3546a2cd826bc6ae0ecb85c3c631bd411884aab 100644 (file)
--- a/kernel/trace/trace_stat.h
+++ b/kernel/trace/trace_stat.h
@@ -12,7 +12,7 @@ struct tracer_stat {
         /* The name of your stat file */
         const char              *name;
         /* Iteration over statistic entries */
-       void                    *(*stat_start)(void);
+       void                    *(*stat_start)(struct tracer_stat *trace);
         void                    *(*stat_next)(void *prev, int idx);
         /* Compare two entries for stats sorting */
         int                     (*stat_cmp)(void *p1, void *p2);
diff --git a/kernel/trace/trace_sysprof.c b/kernel/trace/trace_sysprof.c

index 91fd19c2149f5c57e37e808286bc4a3f1929510e..e04b76cc238a69816cd96b57146afbc97cc36da1 100644 (file)
--- a/kernel/trace/trace_sysprof.c
+++ b/kernel/trace/trace_sysprof.c
@@ -321,11 +321,7 @@ static const struct file_operations sysprof_sample_fops = {
  
  void init_tracer_sysprof_debugfs(struct dentry *d_tracer)
  {
-       struct dentry *entry;
  
-       entry = debugfs_create_file("sysprof_sample_period", 0644,
+       trace_create_file("sysprof_sample_period", 0644,
                         d_tracer, NULL, &sysprof_sample_fops);
-       if (entry)
-               return;
-       pr_warning("Could not create debugfs 'sysprof_sample_period' entry\n");
  }
diff --git a/kernel/trace/trace_workqueue.c b/kernel/trace/trace_workqueue.c

index 797201e4a1376af5987bbce519398b14de3aeada..97fcea4acce156228e29490345373938d92277d8 100644 (file)
--- a/kernel/trace/trace_workqueue.c
+++ b/kernel/trace/trace_workqueue.c
@@ -6,7 +6,7 @@
   */
  
  
-#include <trace/workqueue.h>
+#include <trace/events/workqueue.h>
  #include <linux/list.h>
  #include <linux/percpu.h>
  #include "trace_stat.h"
@@ -16,8 +16,6 @@
  /* A cpu workqueue thread */
  struct cpu_workqueue_stats {
         struct list_head            list;
-/* Useful to know if we print the cpu headers */
-       bool                        first_entry;
         int                         cpu;
         pid_t                       pid;
  /* Can be inserted from interrupt or user context, need to be atomic */
@@ -47,12 +45,11 @@ probe_workqueue_insertion(struct task_struct *wq_thread,
                           struct work_struct *work)
  {
         int cpu = cpumask_first(&wq_thread->cpus_allowed);
-       struct cpu_workqueue_stats *node, *next;
+       struct cpu_workqueue_stats *node;
         unsigned long flags;
  
         spin_lock_irqsave(&workqueue_cpu_stat(cpu)->lock, flags);
-       list_for_each_entry_safe(node, next, &workqueue_cpu_stat(cpu)->list,
-                                                       list) {
+       list_for_each_entry(node, &workqueue_cpu_stat(cpu)->list, list) {
                 if (node->pid == wq_thread->pid) {
                         atomic_inc(&node->inserted);
                         goto found;
@@ -69,12 +66,11 @@ probe_workqueue_execution(struct task_struct *wq_thread,
                           struct work_struct *work)
  {
         int cpu = cpumask_first(&wq_thread->cpus_allowed);
-       struct cpu_workqueue_stats *node, *next;
+       struct cpu_workqueue_stats *node;
         unsigned long flags;
  
         spin_lock_irqsave(&workqueue_cpu_stat(cpu)->lock, flags);
-       list_for_each_entry_safe(node, next, &workqueue_cpu_stat(cpu)->list,
-                                                       list) {
+       list_for_each_entry(node, &workqueue_cpu_stat(cpu)->list, list) {
                 if (node->pid == wq_thread->pid) {
                         node->executed++;
                         goto found;
@@ -105,8 +101,6 @@ static void probe_workqueue_creation(struct task_struct *wq_thread, int cpu)
         cws->pid = wq_thread->pid;
  
         spin_lock_irqsave(&workqueue_cpu_stat(cpu)->lock, flags);
-       if (list_empty(&workqueue_cpu_stat(cpu)->list))
-               cws->first_entry = true;
         list_add_tail(&cws->list, &workqueue_cpu_stat(cpu)->list);
         spin_unlock_irqrestore(&workqueue_cpu_stat(cpu)->lock, flags);
  }
@@ -152,7 +146,7 @@ static struct cpu_workqueue_stats *workqueue_stat_start_cpu(int cpu)
         return ret;
  }
  
-static void *workqueue_stat_start(void)
+static void *workqueue_stat_start(struct tracer_stat *trace)
  {
         int cpu;
         void *ret = NULL;
@@ -191,16 +185,9 @@ static void *workqueue_stat_next(void *prev, int idx)
  static int workqueue_stat_show(struct seq_file *s, void *p)
  {
         struct cpu_workqueue_stats *cws = p;
-       unsigned long flags;
-       int cpu = cws->cpu;
         struct pid *pid;
         struct task_struct *tsk;
  
-       spin_lock_irqsave(&workqueue_cpu_stat(cpu)->lock, flags);
-       if (&cws->list == workqueue_cpu_stat(cpu)->list.next)
-               seq_printf(s, "\n");
-       spin_unlock_irqrestore(&workqueue_cpu_stat(cpu)->lock, flags);
-
         pid = find_get_pid(cws->pid);
         if (pid) {
                 tsk = get_pid_task(pid, PIDTYPE_PID);
diff --git a/kernel/wait.c b/kernel/wait.c

index 42a2dbc181c89ca708360d84ee53a5ca21640b44..ea7c3b4275cf362f75cee939aa7861f7673b95cc 100644 (file)
--- a/kernel/wait.c
+++ b/kernel/wait.c
@@ -154,7 +154,7 @@ void abort_exclusive_wait(wait_queue_head_t *q, wait_queue_t *wait,
         if (!list_empty(&wait->task_list))
                 list_del_init(&wait->task_list);
         else if (waitqueue_active(q))
-               __wake_up_common(q, mode, 1, 0, key);
+               __wake_up_locked_key(q, mode, key);
         spin_unlock_irqrestore(&q->lock, flags);
  }
  EXPORT_SYMBOL(abort_exclusive_wait);
diff --git a/kernel/workqueue.c b/kernel/workqueue.c

index f71fb2a089503534eec8d8790f82d9702a3545cc..0668795d8818477683d184edff057f6fe19144ad 100644 (file)
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -33,7 +33,8 @@
  #include <linux/kallsyms.h>
  #include <linux/debug_locks.h>
  #include <linux/lockdep.h>
-#include <trace/workqueue.h>
+#define CREATE_TRACE_POINTS
+#include <trace/events/workqueue.h>
  
  /*
   * The per-CPU workqueue (if single thread, we always use the first
@@ -124,8 +125,6 @@ struct cpu_workqueue_struct *get_wq_data(struct work_struct *work)
         return (void *) (atomic_long_read(&work->data) & WORK_STRUCT_WQ_DATA_MASK);
  }
  
-DEFINE_TRACE(workqueue_insertion);
-
  static void insert_work(struct cpu_workqueue_struct *cwq,
                         struct work_struct *work, struct list_head *head)
  {
@@ -262,8 +261,6 @@ int queue_delayed_work_on(int cpu, struct workqueue_struct *wq,
  }
  EXPORT_SYMBOL_GPL(queue_delayed_work_on);
  
-DEFINE_TRACE(workqueue_execution);
-
  static void run_workqueue(struct cpu_workqueue_struct *cwq)
  {
         spin_lock_irq(&cwq->lock);
@@ -753,8 +750,6 @@ init_cpu_workqueue(struct workqueue_struct *wq, int cpu)
         return cwq;
  }
  
-DEFINE_TRACE(workqueue_creation);
-
  static int create_workqueue_thread(struct cpu_workqueue_struct *cwq, int cpu)
  {
         struct sched_param param = { .sched_priority = MAX_RT_PRIO-1 };
@@ -860,8 +855,6 @@ struct workqueue_struct *__create_workqueue_key(const char *name,
  }
  EXPORT_SYMBOL_GPL(__create_workqueue_key);
  
-DEFINE_TRACE(workqueue_destruction);
-
  static void cleanup_workqueue_thread(struct cpu_workqueue_struct *cwq)
  {
         /*
diff --git a/lib/dma-debug.c b/lib/dma-debug.c

index 69da09a085a1943ea6a70f4871fd2df1683110fb..ad65fc0317d93b6c1dd20171e625e7e619631d21 100644 (file)
--- a/lib/dma-debug.c
+++ b/lib/dma-debug.c
@@ -23,9 +23,11 @@
  #include <linux/dma-debug.h>
  #include <linux/spinlock.h>
  #include <linux/debugfs.h>
+#include <linux/uaccess.h>
  #include <linux/device.h>
  #include <linux/types.h>
  #include <linux/sched.h>
+#include <linux/ctype.h>
  #include <linux/list.h>
  #include <linux/slab.h>
  
@@ -85,6 +87,7 @@ static u32 show_num_errors = 1;
  
  static u32 num_free_entries;
  static u32 min_free_entries;
+static u32 nr_total_entries;
  
  /* number of preallocated entries requested by kernel cmdline */
  static u32 req_entries;
@@ -97,6 +100,16 @@ static struct dentry *show_all_errors_dent  __read_mostly;
  static struct dentry *show_num_errors_dent  __read_mostly;
  static struct dentry *num_free_entries_dent __read_mostly;
  static struct dentry *min_free_entries_dent __read_mostly;
+static struct dentry *filter_dent           __read_mostly;
+
+/* per-driver filter related state */
+
+#define NAME_MAX_LEN   64
+
+static char                  current_driver_name[NAME_MAX_LEN] __read_mostly;
+static struct device_driver *current_driver                    __read_mostly;
+
+static DEFINE_RWLOCK(driver_name_lock);
  
  static const char *type2name[4] = { "single", "page",
                                     "scather-gather", "coherent" };
@@ -104,6 +117,11 @@ static const char *type2name[4] = { "single", "page",
  static const char *dir2name[4] = { "DMA_BIDIRECTIONAL", "DMA_TO_DEVICE",
                                    "DMA_FROM_DEVICE", "DMA_NONE" };
  
+/* little merge helper - remove it after the merge window */
+#ifndef BUS_NOTIFY_UNBOUND_DRIVER
+#define BUS_NOTIFY_UNBOUND_DRIVER 0x0005
+#endif
+
  /*
   * The access to some variables in this macro is racy. We can't use atomic_t
   * here because all these variables are exported to debugfs. Some of them even
@@ -121,15 +139,54 @@ static inline void dump_entry_trace(struct dma_debug_entry *entry)
  {
  #ifdef CONFIG_STACKTRACE
         if (entry) {
-               printk(KERN_WARNING "Mapped at:\n");
+               pr_warning("Mapped at:\n");
                 print_stack_trace(&entry->stacktrace, 0);
         }
  #endif
  }
  
+static bool driver_filter(struct device *dev)
+{
+       struct device_driver *drv;
+       unsigned long flags;
+       bool ret;
+
+       /* driver filter off */
+       if (likely(!current_driver_name[0]))
+               return true;
+
+       /* driver filter on and initialized */
+       if (current_driver && dev->driver == current_driver)
+               return true;
+
+       if (current_driver || !current_driver_name[0])
+               return false;
+
+       /* driver filter on but not yet initialized */
+       drv = get_driver(dev->driver);
+       if (!drv)
+               return false;
+
+       /* lock to protect against change of current_driver_name */
+       read_lock_irqsave(&driver_name_lock, flags);
+
+       ret = false;
+       if (drv->name &&
+           strncmp(current_driver_name, drv->name, NAME_MAX_LEN - 1) == 0) {
+               current_driver = drv;
+               ret = true;
+       }
+
+       read_unlock_irqrestore(&driver_name_lock, flags);
+       put_driver(drv);
+
+       return ret;
+}
+
  #define err_printk(dev, entry, format, arg...) do {            \
                 error_count += 1;                               \
-               if (show_all_errors || show_num_errors > 0) {   \
+               if (driver_filter(dev) &&                       \
+                   (show_all_errors || show_num_errors > 0)) { \
                         WARN(1, "%s %s: " format,               \
                              dev_driver_string(dev),            \
                              dev_name(dev) , ## arg);           \
@@ -185,15 +242,50 @@ static void put_hash_bucket(struct hash_bucket *bucket,
  static struct dma_debug_entry *hash_bucket_find(struct hash_bucket *bucket,
                                                 struct dma_debug_entry *ref)
  {
-       struct dma_debug_entry *entry;
+       struct dma_debug_entry *entry, *ret = NULL;
+       int matches = 0, match_lvl, last_lvl = 0;
  
         list_for_each_entry(entry, &bucket->list, list) {
-               if ((entry->dev_addr == ref->dev_addr) &&
-                   (entry->dev == ref->dev))
+               if ((entry->dev_addr != ref->dev_addr) ||
+                   (entry->dev != ref->dev))
+                       continue;
+
+               /*
+                * Some drivers map the same physical address multiple
+                * times. Without a hardware IOMMU this results in the
+                * same device addresses being put into the dma-debug
+                * hash multiple times too. This can result in false
+                * positives being reported. Therfore we implement a
+                * best-fit algorithm here which returns the entry from
+                * the hash which fits best to the reference value
+                * instead of the first-fit.
+                */
+               matches += 1;
+               match_lvl = 0;
+               entry->size      == ref->size      ? ++match_lvl : match_lvl;
+               entry->type      == ref->type      ? ++match_lvl : match_lvl;
+               entry->direction == ref->direction ? ++match_lvl : match_lvl;
+
+               if (match_lvl == 3) {
+                       /* perfect-fit - return the result */
                         return entry;
+               } else if (match_lvl > last_lvl) {
+                       /*
+                        * We found an entry that fits better then the
+                        * previous one
+                        */
+                       last_lvl = match_lvl;
+                       ret      = entry;
+               }
         }
  
-       return NULL;
+       /*
+        * If we have multiple matches but no perfect-fit, just return
+        * NULL.
+        */
+       ret = (matches == 1) ? ret : NULL;
+
+       return ret;
  }
  
  /*
@@ -257,6 +349,21 @@ static void add_dma_entry(struct dma_debug_entry *entry)
         put_hash_bucket(bucket, &flags);
  }
  
+static struct dma_debug_entry *__dma_entry_alloc(void)
+{
+       struct dma_debug_entry *entry;
+
+       entry = list_entry(free_entries.next, struct dma_debug_entry, list);
+       list_del(&entry->list);
+       memset(entry, 0, sizeof(*entry));
+
+       num_free_entries -= 1;
+       if (num_free_entries < min_free_entries)
+               min_free_entries = num_free_entries;
+
+       return entry;
+}
+
  /* struct dma_entry allocator
   *
   * The next two functions implement the allocator for
@@ -270,15 +377,12 @@ static struct dma_debug_entry *dma_entry_alloc(void)
         spin_lock_irqsave(&free_entries_lock, flags);
  
         if (list_empty(&free_entries)) {
-               printk(KERN_ERR "DMA-API: debugging out of memory "
-                               "- disabling\n");
+               pr_err("DMA-API: debugging out of memory - disabling\n");
                 global_disable = true;
                 goto out;
         }
  
-       entry = list_entry(free_entries.next, struct dma_debug_entry, list);
-       list_del(&entry->list);
-       memset(entry, 0, sizeof(*entry));
+       entry = __dma_entry_alloc();
  
  #ifdef CONFIG_STACKTRACE
         entry->stacktrace.max_entries = DMA_DEBUG_STACKTRACE_ENTRIES;
@@ -286,9 +390,6 @@ static struct dma_debug_entry *dma_entry_alloc(void)
         entry->stacktrace.skip = 2;
         save_stack_trace(&entry->stacktrace);
  #endif
-       num_free_entries -= 1;
-       if (num_free_entries < min_free_entries)
-               min_free_entries = num_free_entries;
  
  out:
         spin_unlock_irqrestore(&free_entries_lock, flags);
@@ -310,6 +411,53 @@ static void dma_entry_free(struct dma_debug_entry *entry)
         spin_unlock_irqrestore(&free_entries_lock, flags);
  }
  
+int dma_debug_resize_entries(u32 num_entries)
+{
+       int i, delta, ret = 0;
+       unsigned long flags;
+       struct dma_debug_entry *entry;
+       LIST_HEAD(tmp);
+
+       spin_lock_irqsave(&free_entries_lock, flags);
+
+       if (nr_total_entries < num_entries) {
+               delta = num_entries - nr_total_entries;
+
+               spin_unlock_irqrestore(&free_entries_lock, flags);
+
+               for (i = 0; i < delta; i++) {
+                       entry = kzalloc(sizeof(*entry), GFP_KERNEL);
+                       if (!entry)
+                               break;
+
+                       list_add_tail(&entry->list, &tmp);
+               }
+
+               spin_lock_irqsave(&free_entries_lock, flags);
+
+               list_splice(&tmp, &free_entries);
+               nr_total_entries += i;
+               num_free_entries += i;
+       } else {
+               delta = nr_total_entries - num_entries;
+
+               for (i = 0; i < delta && !list_empty(&free_entries); i++) {
+                       entry = __dma_entry_alloc();
+                       kfree(entry);
+               }
+
+               nr_total_entries -= i;
+       }
+
+       if (nr_total_entries != num_entries)
+               ret = 1;
+
+       spin_unlock_irqrestore(&free_entries_lock, flags);
+
+       return ret;
+}
+EXPORT_SYMBOL(dma_debug_resize_entries);
+
  /*
   * DMA-API debugging init code
   *
@@ -334,8 +482,7 @@ static int prealloc_memory(u32 num_entries)
         num_free_entries = num_entries;
         min_free_entries = num_entries;
  
-       printk(KERN_INFO "DMA-API: preallocated %d debug entries\n",
-                       num_entries);
+       pr_info("DMA-API: preallocated %d debug entries\n", num_entries);
  
         return 0;
  
@@ -349,11 +496,102 @@ out_err:
         return -ENOMEM;
  }
  
+static ssize_t filter_read(struct file *file, char __user *user_buf,
+                          size_t count, loff_t *ppos)
+{
+       char buf[NAME_MAX_LEN + 1];
+       unsigned long flags;
+       int len;
+
+       if (!current_driver_name[0])
+               return 0;
+
+       /*
+        * We can't copy to userspace directly because current_driver_name can
+        * only be read under the driver_name_lock with irqs disabled. So
+        * create a temporary copy first.
+        */
+       read_lock_irqsave(&driver_name_lock, flags);
+       len = scnprintf(buf, NAME_MAX_LEN + 1, "%s\n", current_driver_name);
+       read_unlock_irqrestore(&driver_name_lock, flags);
+
+       return simple_read_from_buffer(user_buf, count, ppos, buf, len);
+}
+
+static ssize_t filter_write(struct file *file, const char __user *userbuf,
+                           size_t count, loff_t *ppos)
+{
+       char buf[NAME_MAX_LEN];
+       unsigned long flags;
+       size_t len;
+       int i;
+
+       /*
+        * We can't copy from userspace directly. Access to
+        * current_driver_name is protected with a write_lock with irqs
+        * disabled. Since copy_from_user can fault and may sleep we
+        * need to copy to temporary buffer first
+        */
+       len = min(count, (size_t)(NAME_MAX_LEN - 1));
+       if (copy_from_user(buf, userbuf, len))
+               return -EFAULT;
+
+       buf[len] = 0;
+
+       write_lock_irqsave(&driver_name_lock, flags);
+
+       /*
+        * Now handle the string we got from userspace very carefully.
+        * The rules are:
+        *         - only use the first token we got
+        *         - token delimiter is everything looking like a space
+        *           character (' ', '\n', '\t' ...)
+        *
+        */
+       if (!isalnum(buf[0])) {
+               /*
+                * If the first character userspace gave us is not
+                * alphanumerical then assume the filter should be
+                * switched off.
+                */
+               if (current_driver_name[0])
+                       pr_info("DMA-API: switching off dma-debug driver filter\n");
+               current_driver_name[0] = 0;
+               current_driver = NULL;
+               goto out_unlock;
+       }
+
+       /*
+        * Now parse out the first token and use it as the name for the
+        * driver to filter for.
+        */
+       for (i = 0; i < NAME_MAX_LEN; ++i) {
+               current_driver_name[i] = buf[i];
+               if (isspace(buf[i]) || buf[i] == ' ' || buf[i] == 0)
+                       break;
+       }
+       current_driver_name[i] = 0;
+       current_driver = NULL;
+
+       pr_info("DMA-API: enable driver filter for driver [%s]\n",
+               current_driver_name);
+
+out_unlock:
+       write_unlock_irqrestore(&driver_name_lock, flags);
+
+       return count;
+}
+
+const struct file_operations filter_fops = {
+       .read  = filter_read,
+       .write = filter_write,
+};
+
  static int dma_debug_fs_init(void)
  {
         dma_debug_dent = debugfs_create_dir("dma-api", NULL);
         if (!dma_debug_dent) {
-               printk(KERN_ERR "DMA-API: can not create debugfs directory\n");
+               pr_err("DMA-API: can not create debugfs directory\n");
                 return -ENOMEM;
         }
  
@@ -392,6 +630,11 @@ static int dma_debug_fs_init(void)
         if (!min_free_entries_dent)
                 goto out_err;
  
+       filter_dent = debugfs_create_file("driver_filter", 0644,
+                                         dma_debug_dent, NULL, &filter_fops);
+       if (!filter_dent)
+               goto out_err;
+
         return 0;
  
  out_err:
@@ -400,9 +643,64 @@ out_err:
         return -ENOMEM;
  }
  
+static int device_dma_allocations(struct device *dev)
+{
+       struct dma_debug_entry *entry;
+       unsigned long flags;
+       int count = 0, i;
+
+       local_irq_save(flags);
+
+       for (i = 0; i < HASH_SIZE; ++i) {
+               spin_lock(&dma_entry_hash[i].lock);
+               list_for_each_entry(entry, &dma_entry_hash[i].list, list) {
+                       if (entry->dev == dev)
+                               count += 1;
+               }
+               spin_unlock(&dma_entry_hash[i].lock);
+       }
+
+       local_irq_restore(flags);
+
+       return count;
+}
+
+static int dma_debug_device_change(struct notifier_block *nb,
+                                   unsigned long action, void *data)
+{
+       struct device *dev = data;
+       int count;
+
+
+       switch (action) {
+       case BUS_NOTIFY_UNBOUND_DRIVER:
+               count = device_dma_allocations(dev);
+               if (count == 0)
+                       break;
+               err_printk(dev, NULL, "DMA-API: device driver has pending "
+                               "DMA allocations while released from device "
+                               "[count=%d]\n", count);
+               break;
+       default:
+               break;
+       }
+
+       return 0;
+}
+
  void dma_debug_add_bus(struct bus_type *bus)
  {
-       /* FIXME: register notifier */
+       struct notifier_block *nb;
+
+       nb = kzalloc(sizeof(struct notifier_block), GFP_KERNEL);
+       if (nb == NULL) {
+               pr_err("dma_debug_add_bus: out of memory\n");
+               return;
+       }
+
+       nb->notifier_call = dma_debug_device_change;
+
+       bus_register_notifier(bus, nb);
  }
  
  /*
@@ -421,8 +719,7 @@ void dma_debug_init(u32 num_entries)
         }
  
         if (dma_debug_fs_init() != 0) {
-               printk(KERN_ERR "DMA-API: error creating debugfs entries "
-                               "- disabling\n");
+               pr_err("DMA-API: error creating debugfs entries - disabling\n");
                 global_disable = true;
  
                 return;
@@ -432,14 +729,15 @@ void dma_debug_init(u32 num_entries)
                 num_entries = req_entries;
  
         if (prealloc_memory(num_entries) != 0) {
-               printk(KERN_ERR "DMA-API: debugging out of memory error "
-                               "- disabled\n");
+               pr_err("DMA-API: debugging out of memory error - disabled\n");
                 global_disable = true;
  
                 return;
         }
  
-       printk(KERN_INFO "DMA-API: debugging enabled by kernel config\n");
+       nr_total_entries = num_free_entries;
+
+       pr_info("DMA-API: debugging enabled by kernel config\n");
  }
  
  static __init int dma_debug_cmdline(char *str)
@@ -448,8 +746,7 @@ static __init int dma_debug_cmdline(char *str)
                 return -EINVAL;
  
         if (strncmp(str, "off", 3) == 0) {
-               printk(KERN_INFO "DMA-API: debugging disabled on kernel "
-                                "command line\n");
+               pr_info("DMA-API: debugging disabled on kernel command line\n");
                 global_disable = true;
         }
  
@@ -723,15 +1020,15 @@ void debug_dma_map_sg(struct device *dev, struct scatterlist *sg,
                 entry->type           = dma_debug_sg;
                 entry->dev            = dev;
                 entry->paddr          = sg_phys(s);
-               entry->size           = s->length;
-               entry->dev_addr       = s->dma_address;
+               entry->size           = sg_dma_len(s);
+               entry->dev_addr       = sg_dma_address(s);
                 entry->direction      = direction;
                 entry->sg_call_ents   = nents;
                 entry->sg_mapped_ents = mapped_ents;
  
                 if (!PageHighMem(sg_page(s))) {
                         check_for_stack(dev, sg_virt(s));
-                       check_for_illegal_area(dev, sg_virt(s), s->length);
+                       check_for_illegal_area(dev, sg_virt(s), sg_dma_len(s));
                 }
  
                 add_dma_entry(entry);
@@ -739,13 +1036,33 @@ void debug_dma_map_sg(struct device *dev, struct scatterlist *sg,
  }
  EXPORT_SYMBOL(debug_dma_map_sg);
  
+static int get_nr_mapped_entries(struct device *dev, struct scatterlist *s)
+{
+       struct dma_debug_entry *entry, ref;
+       struct hash_bucket *bucket;
+       unsigned long flags;
+       int mapped_ents;
+
+       ref.dev      = dev;
+       ref.dev_addr = sg_dma_address(s);
+       ref.size     = sg_dma_len(s),
+
+       bucket       = get_hash_bucket(&ref, &flags);
+       entry        = hash_bucket_find(bucket, &ref);
+       mapped_ents  = 0;
+
+       if (entry)
+               mapped_ents = entry->sg_mapped_ents;
+       put_hash_bucket(bucket, &flags);
+
+       return mapped_ents;
+}
+
  void debug_dma_unmap_sg(struct device *dev, struct scatterlist *sglist,
                         int nelems, int dir)
  {
-       struct dma_debug_entry *entry;
         struct scatterlist *s;
         int mapped_ents = 0, i;
-       unsigned long flags;
  
         if (unlikely(global_disable))
                 return;
@@ -756,8 +1073,8 @@ void debug_dma_unmap_sg(struct device *dev, struct scatterlist *sglist,
                         .type           = dma_debug_sg,
                         .dev            = dev,
                         .paddr          = sg_phys(s),
-                       .dev_addr       = s->dma_address,
-                       .size           = s->length,
+                       .dev_addr       = sg_dma_address(s),
+                       .size           = sg_dma_len(s),
                         .direction      = dir,
                         .sg_call_ents   = 0,
                 };
@@ -765,14 +1082,9 @@ void debug_dma_unmap_sg(struct device *dev, struct scatterlist *sglist,
                 if (mapped_ents && i >= mapped_ents)
                         break;
  
-               if (mapped_ents == 0) {
-                       struct hash_bucket *bucket;
+               if (!i) {
                         ref.sg_call_ents = nelems;
-                       bucket = get_hash_bucket(&ref, &flags);
-                       entry = hash_bucket_find(bucket, &ref);
-                       if (entry)
-                               mapped_ents = entry->sg_mapped_ents;
-                       put_hash_bucket(bucket, &flags);
+                       mapped_ents = get_nr_mapped_entries(dev, s);
                 }
  
                 check_unmap(&ref);
@@ -874,14 +1186,20 @@ void debug_dma_sync_sg_for_cpu(struct device *dev, struct scatterlist *sg,
                                int nelems, int direction)
  {
         struct scatterlist *s;
-       int i;
+       int mapped_ents = 0, i;
  
         if (unlikely(global_disable))
                 return;
  
         for_each_sg(sg, s, nelems, i) {
-               check_sync(dev, s->dma_address, s->dma_length, 0,
-                               direction, true);
+               if (!i)
+                       mapped_ents = get_nr_mapped_entries(dev, s);
+
+               if (i >= mapped_ents)
+                       break;
+
+               check_sync(dev, sg_dma_address(s), sg_dma_len(s), 0,
+                          direction, true);
         }
  }
  EXPORT_SYMBOL(debug_dma_sync_sg_for_cpu);
@@ -890,15 +1208,39 @@ void debug_dma_sync_sg_for_device(struct device *dev, struct scatterlist *sg,
                                   int nelems, int direction)
  {
         struct scatterlist *s;
-       int i;
+       int mapped_ents = 0, i;
  
         if (unlikely(global_disable))
                 return;
  
         for_each_sg(sg, s, nelems, i) {
-               check_sync(dev, s->dma_address, s->dma_length, 0,
-                               direction, false);
+               if (!i)
+                       mapped_ents = get_nr_mapped_entries(dev, s);
+
+               if (i >= mapped_ents)
+                       break;
+
+               check_sync(dev, sg_dma_address(s), sg_dma_len(s), 0,
+                          direction, false);
         }
  }
  EXPORT_SYMBOL(debug_dma_sync_sg_for_device);
  
+static int __init dma_debug_driver_setup(char *str)
+{
+       int i;
+
+       for (i = 0; i < NAME_MAX_LEN - 1; ++i, ++str) {
+               current_driver_name[i] = *str;
+               if (*str == 0)
+                       break;
+       }
+
+       if (current_driver_name[0])
+               pr_info("DMA-API: enable driver filter for driver [%s]\n",
+                       current_driver_name);
+
+
+       return 1;
+}
+__setup("dma_debug_driver=", dma_debug_driver_setup);
diff --git a/lib/swiotlb.c b/lib/swiotlb.c

index 2b0b5a7d2ced165b1e2b83e2df888c237349f23f..bffe6d7ef9d9a01a52db11e12e54207bc9b41a0c 100644 (file)
--- a/lib/swiotlb.c
+++ b/lib/swiotlb.c
@@ -60,8 +60,8 @@ enum dma_sync_target {
  int swiotlb_force;
  
  /*
- * Used to do a quick range check in swiotlb_unmap_single and
- * swiotlb_sync_single_*, to see if the memory was in fact allocated by this
+ * Used to do a quick range check in unmap_single and
+ * sync_single_*, to see if the memory was in fact allocated by this
   * API.
   */
  static char *io_tlb_start, *io_tlb_end;
@@ -129,7 +129,7 @@ dma_addr_t __weak swiotlb_phys_to_bus(struct device *hwdev, phys_addr_t paddr)
         return paddr;
  }
  
-phys_addr_t __weak swiotlb_bus_to_phys(dma_addr_t baddr)
+phys_addr_t __weak swiotlb_bus_to_phys(struct device *hwdev, dma_addr_t baddr)
  {
         return baddr;
  }
@@ -140,9 +140,15 @@ static dma_addr_t swiotlb_virt_to_bus(struct device *hwdev,
         return swiotlb_phys_to_bus(hwdev, virt_to_phys(address));
  }
  
-static void *swiotlb_bus_to_virt(dma_addr_t address)
+void * __weak swiotlb_bus_to_virt(struct device *hwdev, dma_addr_t address)
  {
-       return phys_to_virt(swiotlb_bus_to_phys(address));
+       return phys_to_virt(swiotlb_bus_to_phys(hwdev, address));
+}
+
+int __weak swiotlb_arch_address_needs_mapping(struct device *hwdev,
+                                              dma_addr_t addr, size_t size)
+{
+       return !is_buffer_dma_capable(dma_get_mask(hwdev), addr, size);
  }
  
  int __weak swiotlb_arch_range_needs_mapping(phys_addr_t paddr, size_t size)
@@ -309,10 +315,10 @@ cleanup1:
         return -ENOMEM;
  }
  
-static int
+static inline int
  address_needs_mapping(struct device *hwdev, dma_addr_t addr, size_t size)
  {
-       return !is_buffer_dma_capable(dma_get_mask(hwdev), addr, size);
+       return swiotlb_arch_address_needs_mapping(hwdev, addr, size);
  }
  
  static inline int range_needs_mapping(phys_addr_t paddr, size_t size)
@@ -341,7 +347,7 @@ static void swiotlb_bounce(phys_addr_t phys, char *dma_addr, size_t size,
                 unsigned long flags;
  
                 while (size) {
-                       sz = min(PAGE_SIZE - offset, size);
+                       sz = min_t(size_t, PAGE_SIZE - offset, size);
  
                         local_irq_save(flags);
                         buffer = kmap_atomic(pfn_to_page(pfn),
@@ -476,7 +482,7 @@ found:
   * dma_addr is the kernel virtual address of the bounce buffer to unmap.
   */
  static void
-unmap_single(struct device *hwdev, char *dma_addr, size_t size, int dir)
+do_unmap_single(struct device *hwdev, char *dma_addr, size_t size, int dir)
  {
         unsigned long flags;
         int i, count, nslots = ALIGN(size, 1 << IO_TLB_SHIFT) >> IO_TLB_SHIFT;
@@ -560,7 +566,6 @@ swiotlb_alloc_coherent(struct device *hwdev, size_t size,
                                    size)) {
                 /*
                  * The allocated memory isn't reachable by the device.
-                * Fall back on swiotlb_map_single().
                  */
                 free_pages((unsigned long) ret, order);
                 ret = NULL;
@@ -568,9 +573,8 @@ swiotlb_alloc_coherent(struct device *hwdev, size_t size,
         if (!ret) {
                 /*
                  * We are either out of memory or the device can't DMA
-                * to GFP_DMA memory; fall back on
-                * swiotlb_map_single(), which will grab memory from
-                * the lowest available address range.
+                * to GFP_DMA memory; fall back on map_single(), which
+                * will grab memory from the lowest available address range.
                  */
                 ret = map_single(hwdev, 0, size, DMA_FROM_DEVICE);
                 if (!ret)
@@ -587,7 +591,7 @@ swiotlb_alloc_coherent(struct device *hwdev, size_t size,
                        (unsigned long long)dev_addr);
  
                 /* DMA_TO_DEVICE to avoid memcpy in unmap_single */
-               unmap_single(hwdev, ret, size, DMA_TO_DEVICE);
+               do_unmap_single(hwdev, ret, size, DMA_TO_DEVICE);
                 return NULL;
         }
         *dma_handle = dev_addr;
@@ -604,7 +608,7 @@ swiotlb_free_coherent(struct device *hwdev, size_t size, void *vaddr,
                 free_pages((unsigned long) vaddr, get_order(size));
         else
                 /* DMA_TO_DEVICE to avoid memcpy in unmap_single */
-               unmap_single(hwdev, vaddr, size, DMA_TO_DEVICE);
+               do_unmap_single(hwdev, vaddr, size, DMA_TO_DEVICE);
  }
  EXPORT_SYMBOL(swiotlb_free_coherent);
  
@@ -634,7 +638,7 @@ swiotlb_full(struct device *dev, size_t size, int dir, int do_panic)
   * physical address to use is returned.
   *
   * Once the device is given the dma address, the device owns this memory until
- * either swiotlb_unmap_single or swiotlb_dma_sync_single is performed.
+ * either swiotlb_unmap_page or swiotlb_dma_sync_single is performed.
   */
  dma_addr_t swiotlb_map_page(struct device *dev, struct page *page,
                             unsigned long offset, size_t size,
@@ -642,18 +646,17 @@ dma_addr_t swiotlb_map_page(struct device *dev, struct page *page,
                             struct dma_attrs *attrs)
  {
         phys_addr_t phys = page_to_phys(page) + offset;
-       void *ptr = page_address(page) + offset;
         dma_addr_t dev_addr = swiotlb_phys_to_bus(dev, phys);
         void *map;
  
         BUG_ON(dir == DMA_NONE);
         /*
-        * If the pointer passed in happens to be in the device's DMA window,
+        * If the address happens to be in the device's DMA window,
          * we can safely return the device addr and not worry about bounce
          * buffering it.
          */
         if (!address_needs_mapping(dev, dev_addr, size) &&
-           !range_needs_mapping(virt_to_phys(ptr), size))
+           !range_needs_mapping(phys, size))
                 return dev_addr;
  
         /*
@@ -679,23 +682,35 @@ EXPORT_SYMBOL_GPL(swiotlb_map_page);
  
  /*
   * Unmap a single streaming mode DMA translation.  The dma_addr and size must
- * match what was provided for in a previous swiotlb_map_single call.  All
+ * match what was provided for in a previous swiotlb_map_page call.  All
   * other usages are undefined.
   *
   * After this call, reads by the cpu to the buffer are guaranteed to see
   * whatever the device wrote there.
   */
+static void unmap_single(struct device *hwdev, dma_addr_t dev_addr,
+                        size_t size, int dir)
+{
+       char *dma_addr = swiotlb_bus_to_virt(hwdev, dev_addr);
+
+       BUG_ON(dir == DMA_NONE);
+
+       if (is_swiotlb_buffer(dma_addr)) {
+               do_unmap_single(hwdev, dma_addr, size, dir);
+               return;
+       }
+
+       if (dir != DMA_FROM_DEVICE)
+               return;
+
+       dma_mark_clean(dma_addr, size);
+}
+
  void swiotlb_unmap_page(struct device *hwdev, dma_addr_t dev_addr,
                         size_t size, enum dma_data_direction dir,
                         struct dma_attrs *attrs)
  {
-       char *dma_addr = swiotlb_bus_to_virt(dev_addr);
-
-       BUG_ON(dir == DMA_NONE);
-       if (is_swiotlb_buffer(dma_addr))
-               unmap_single(hwdev, dma_addr, size, dir);
-       else if (dir == DMA_FROM_DEVICE)
-               dma_mark_clean(dma_addr, size);
+       unmap_single(hwdev, dev_addr, size, dir);
  }
  EXPORT_SYMBOL_GPL(swiotlb_unmap_page);
  
@@ -703,7 +718,7 @@ EXPORT_SYMBOL_GPL(swiotlb_unmap_page);
   * Make physical memory consistent for a single streaming mode DMA translation
   * after a transfer.
   *
- * If you perform a swiotlb_map_single() but wish to interrogate the buffer
+ * If you perform a swiotlb_map_page() but wish to interrogate the buffer
   * using the cpu, yet do not wish to teardown the dma mapping, you must
   * call this function before doing so.  At the next point you give the dma
   * address back to the card, you must first perform a
@@ -713,13 +728,19 @@ static void
  swiotlb_sync_single(struct device *hwdev, dma_addr_t dev_addr,
                     size_t size, int dir, int target)
  {
-       char *dma_addr = swiotlb_bus_to_virt(dev_addr);
+       char *dma_addr = swiotlb_bus_to_virt(hwdev, dev_addr);
  
         BUG_ON(dir == DMA_NONE);
-       if (is_swiotlb_buffer(dma_addr))
+
+       if (is_swiotlb_buffer(dma_addr)) {
                 sync_single(hwdev, dma_addr, size, dir, target);
-       else if (dir == DMA_FROM_DEVICE)
-               dma_mark_clean(dma_addr, size);
+               return;
+       }
+
+       if (dir != DMA_FROM_DEVICE)
+               return;
+
+       dma_mark_clean(dma_addr, size);
  }
  
  void
@@ -746,13 +767,7 @@ swiotlb_sync_single_range(struct device *hwdev, dma_addr_t dev_addr,
                           unsigned long offset, size_t size,
                           int dir, int target)
  {
-       char *dma_addr = swiotlb_bus_to_virt(dev_addr) + offset;
-
-       BUG_ON(dir == DMA_NONE);
-       if (is_swiotlb_buffer(dma_addr))
-               sync_single(hwdev, dma_addr, size, dir, target);
-       else if (dir == DMA_FROM_DEVICE)
-               dma_mark_clean(dma_addr, size);
+       swiotlb_sync_single(hwdev, dev_addr + offset, size, dir, target);
  }
  
  void
@@ -777,7 +792,7 @@ EXPORT_SYMBOL_GPL(swiotlb_sync_single_range_for_device);
  
  /*
   * Map a set of buffers described by scatterlist in streaming mode for DMA.
- * This is the scatter-gather version of the above swiotlb_map_single
+ * This is the scatter-gather version of the above swiotlb_map_page
   * interface.  Here the scatter gather list elements are each tagged with the
   * appropriate dma address and length.  They are obtained via
   * sg_dma_{address,length}(SG).
@@ -788,7 +803,7 @@ EXPORT_SYMBOL_GPL(swiotlb_sync_single_range_for_device);
   *       The routine returns the number of addr/length pairs actually
   *       used, at most nents.
   *
- * Device ownership issues as mentioned above for swiotlb_map_single are the
+ * Device ownership issues as mentioned above for swiotlb_map_page are the
   * same here.
   */
  int
@@ -836,7 +851,7 @@ EXPORT_SYMBOL(swiotlb_map_sg);
  
  /*
   * Unmap a set of streaming mode DMA translations.  Again, cpu read rules
- * concerning calls here are the same as for swiotlb_unmap_single() above.
+ * concerning calls here are the same as for swiotlb_unmap_page() above.
   */
  void
  swiotlb_unmap_sg_attrs(struct device *hwdev, struct scatterlist *sgl,
@@ -847,13 +862,9 @@ swiotlb_unmap_sg_attrs(struct device *hwdev, struct scatterlist *sgl,
  
         BUG_ON(dir == DMA_NONE);
  
-       for_each_sg(sgl, sg, nelems, i) {
-               if (sg->dma_address != swiotlb_phys_to_bus(hwdev, sg_phys(sg)))
-                       unmap_single(hwdev, swiotlb_bus_to_virt(sg->dma_address),
-                                    sg->dma_length, dir);
-               else if (dir == DMA_FROM_DEVICE)
-                       dma_mark_clean(swiotlb_bus_to_virt(sg->dma_address), sg->dma_length);
-       }
+       for_each_sg(sgl, sg, nelems, i)
+               unmap_single(hwdev, sg->dma_address, sg->dma_length, dir);
+
  }
  EXPORT_SYMBOL(swiotlb_unmap_sg_attrs);
  
@@ -879,15 +890,9 @@ swiotlb_sync_sg(struct device *hwdev, struct scatterlist *sgl,
         struct scatterlist *sg;
         int i;
  
-       BUG_ON(dir == DMA_NONE);
-
-       for_each_sg(sgl, sg, nelems, i) {
-               if (sg->dma_address != swiotlb_phys_to_bus(hwdev, sg_phys(sg)))
-                       sync_single(hwdev, swiotlb_bus_to_virt(sg->dma_address),
+       for_each_sg(sgl, sg, nelems, i)
+               swiotlb_sync_single(hwdev, sg->dma_address,
                                     sg->dma_length, dir, target);
-               else if (dir == DMA_FROM_DEVICE)
-                       dma_mark_clean(swiotlb_bus_to_virt(sg->dma_address), sg->dma_length);
-       }
  }
  
  void
diff --git a/lib/vsprintf.c b/lib/vsprintf.c

index 7536acea135ba4069c1dc04ad36d5024ac747d8c..756ccafa9cec04ada5c940f7031e6dd56aeb7837 100644 (file)
--- a/lib/vsprintf.c
+++ b/lib/vsprintf.c
@@ -408,6 +408,8 @@ enum format_type {
         FORMAT_TYPE_LONG_LONG,
         FORMAT_TYPE_ULONG,
         FORMAT_TYPE_LONG,
+       FORMAT_TYPE_UBYTE,
+       FORMAT_TYPE_BYTE,
         FORMAT_TYPE_USHORT,
         FORMAT_TYPE_SHORT,
         FORMAT_TYPE_UINT,
@@ -573,12 +575,15 @@ static char *string(char *buf, char *end, char *s, struct printf_spec spec)
  }
  
  static char *symbol_string(char *buf, char *end, void *ptr,
-                               struct printf_spec spec)
+                               struct printf_spec spec, char ext)
  {
         unsigned long value = (unsigned long) ptr;
  #ifdef CONFIG_KALLSYMS
         char sym[KSYM_SYMBOL_LEN];
-       sprint_symbol(sym, value);
+       if (ext != 'f')
+               sprint_symbol(sym, value);
+       else
+               kallsyms_lookup(value, NULL, NULL, NULL, sym);
         return string(buf, end, sym, spec);
  #else
         spec.field_width = 2*sizeof(void *);
@@ -690,7 +695,8 @@ static char *ip4_addr_string(char *buf, char *end, u8 *addr,
   *
   * Right now we handle:
   *
- * - 'F' For symbolic function descriptor pointers
+ * - 'F' For symbolic function descriptor pointers with offset
+ * - 'f' For simple symbolic function names without offset
   * - 'S' For symbolic direct pointers
   * - 'R' For a struct resource pointer, it prints the range of
   *       addresses (not the name nor the flags)
@@ -713,10 +719,11 @@ static char *pointer(const char *fmt, char *buf, char *end, void *ptr,
  
         switch (*fmt) {
         case 'F':
+       case 'f':
                 ptr = dereference_function_descriptor(ptr);
                 /* Fallthrough */
         case 'S':
-               return symbol_string(buf, end, ptr, spec);
+               return symbol_string(buf, end, ptr, spec, *fmt);
         case 'R':
                 return resource_string(buf, end, ptr, spec);
         case 'm':
@@ -853,11 +860,15 @@ qualifier:
         spec->qualifier = -1;
         if (*fmt == 'h' || *fmt == 'l' || *fmt == 'L' ||
             *fmt == 'Z' || *fmt == 'z' || *fmt == 't') {
-               spec->qualifier = *fmt;
-               ++fmt;
-               if (spec->qualifier == 'l' && *fmt == 'l') {
-                       spec->qualifier = 'L';
-                       ++fmt;
+               spec->qualifier = *fmt++;
+               if (unlikely(spec->qualifier == *fmt)) {
+                       if (spec->qualifier == 'l') {
+                               spec->qualifier = 'L';
+                               ++fmt;
+                       } else if (spec->qualifier == 'h') {
+                               spec->qualifier = 'H';
+                               ++fmt;
+                       }
                 }
         }
  
@@ -919,6 +930,11 @@ qualifier:
                 spec->type = FORMAT_TYPE_SIZE_T;
         } else if (spec->qualifier == 't') {
                 spec->type = FORMAT_TYPE_PTRDIFF;
+       } else if (spec->qualifier == 'H') {
+               if (spec->flags & SIGN)
+                       spec->type = FORMAT_TYPE_BYTE;
+               else
+                       spec->type = FORMAT_TYPE_UBYTE;
         } else if (spec->qualifier == 'h') {
                 if (spec->flags & SIGN)
                         spec->type = FORMAT_TYPE_SHORT;
@@ -943,7 +959,8 @@ qualifier:
   *
   * This function follows C99 vsnprintf, but has some extensions:
   * %pS output the name of a text symbol
- * %pF output the name of a function pointer
+ * %pF output the name of a function pointer with its offset
+ * %pf output the name of a function pointer without its offset
   * %pR output the address range in a struct resource
   *
   * The return value is the number of characters which would
@@ -1087,6 +1104,12 @@ int vsnprintf(char *buf, size_t size, const char *fmt, va_list args)
                         case FORMAT_TYPE_PTRDIFF:
                                 num = va_arg(args, ptrdiff_t);
                                 break;
+                       case FORMAT_TYPE_UBYTE:
+                               num = (unsigned char) va_arg(args, int);
+                               break;
+                       case FORMAT_TYPE_BYTE:
+                               num = (signed char) va_arg(args, int);
+                               break;
                         case FORMAT_TYPE_USHORT:
                                 num = (unsigned short) va_arg(args, int);
                                 break;
@@ -1363,6 +1386,10 @@ do {                                                                     \
                         case FORMAT_TYPE_PTRDIFF:
                                 save_arg(ptrdiff_t);
                                 break;
+                       case FORMAT_TYPE_UBYTE:
+                       case FORMAT_TYPE_BYTE:
+                               save_arg(char);
+                               break;
                         case FORMAT_TYPE_USHORT:
                         case FORMAT_TYPE_SHORT:
                                 save_arg(short);
@@ -1391,7 +1418,8 @@ EXPORT_SYMBOL_GPL(vbin_printf);
   *
   * The format follows C99 vsnprintf, but has some extensions:
   * %pS output the name of a text symbol
- * %pF output the name of a function pointer
+ * %pF output the name of a function pointer with its offset
+ * %pf output the name of a function pointer without its offset
   * %pR output the address range in a struct resource
   * %n is ignored
   *
@@ -1538,6 +1566,12 @@ int bstr_printf(char *buf, size_t size, const char *fmt, const u32 *bin_buf)
                         case FORMAT_TYPE_PTRDIFF:
                                 num = get_arg(ptrdiff_t);
                                 break;
+                       case FORMAT_TYPE_UBYTE:
+                               num = get_arg(unsigned char);
+                               break;
+                       case FORMAT_TYPE_BYTE:
+                               num = get_arg(signed char);
+                               break;
                         case FORMAT_TYPE_USHORT:
                                 num = get_arg(unsigned short);
                                 break;
diff --git a/mm/bounce.c b/mm/bounce.c

index e590272fe7a8f3e40acb21059bb0082f74354f26..65f5e17e411aaf78913a41129d8d63a8a7503a81 100644 (file)
--- a/mm/bounce.c
+++ b/mm/bounce.c
@@ -14,16 +14,15 @@
  #include <linux/hash.h>
  #include <linux/highmem.h>
  #include <linux/blktrace_api.h>
-#include <trace/block.h>
  #include <asm/tlbflush.h>
  
+#include <trace/events/block.h>
+
  #define POOL_SIZE      64
  #define ISA_POOL_SIZE  16
  
  static mempool_t *page_pool, *isa_page_pool;
  
-DEFINE_TRACE(block_bio_bounce);
-
  #ifdef CONFIG_HIGHMEM
  static __init int init_emergency_pool(void)
  {
diff --git a/mm/mlock.c b/mm/mlock.c

index cbe9e0581b75dcaf06335ccc017a68789d6247d2..ac130433c7d35da275b1ad081bfddd9fbf017173 100644 (file)
--- a/mm/mlock.c
+++ b/mm/mlock.c
@@ -629,52 +629,43 @@ void user_shm_unlock(size_t size, struct user_struct *user)
         free_uid(user);
  }
  
-void *alloc_locked_buffer(size_t size)
+int account_locked_memory(struct mm_struct *mm, struct rlimit *rlim,
+                         size_t size)
  {
-       unsigned long rlim, vm, pgsz;
-       void *buffer = NULL;
+       unsigned long lim, vm, pgsz;
+       int error = -ENOMEM;
  
         pgsz = PAGE_ALIGN(size) >> PAGE_SHIFT;
  
-       down_write(&current->mm->mmap_sem);
-
-       rlim = current->signal->rlim[RLIMIT_AS].rlim_cur >> PAGE_SHIFT;
-       vm   = current->mm->total_vm + pgsz;
-       if (rlim < vm)
-               goto out;
+       down_write(&mm->mmap_sem);
  
-       rlim = current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur >> PAGE_SHIFT;
-       vm   = current->mm->locked_vm + pgsz;
-       if (rlim < vm)
+       lim = rlim[RLIMIT_AS].rlim_cur >> PAGE_SHIFT;
+       vm   = mm->total_vm + pgsz;
+       if (lim < vm)
                 goto out;
  
-       buffer = kzalloc(size, GFP_KERNEL);
-       if (!buffer)
+       lim = rlim[RLIMIT_MEMLOCK].rlim_cur >> PAGE_SHIFT;
+       vm   = mm->locked_vm + pgsz;
+       if (lim < vm)
                 goto out;
  
-       current->mm->total_vm  += pgsz;
-       current->mm->locked_vm += pgsz;
+       mm->total_vm  += pgsz;
+       mm->locked_vm += pgsz;
  
+       error = 0;
   out:
-       up_write(&current->mm->mmap_sem);
-       return buffer;
+       up_write(&mm->mmap_sem);
+       return error;
  }
  
-void release_locked_buffer(void *buffer, size_t size)
+void refund_locked_memory(struct mm_struct *mm, size_t size)
  {
         unsigned long pgsz = PAGE_ALIGN(size) >> PAGE_SHIFT;
  
-       down_write(&current->mm->mmap_sem);
-
-       current->mm->total_vm  -= pgsz;
-       current->mm->locked_vm -= pgsz;
-
-       up_write(&current->mm->mmap_sem);
-}
+       down_write(&mm->mmap_sem);
  
-void free_locked_buffer(void *buffer, size_t size)
-{
-       release_locked_buffer(buffer, size);
+       mm->total_vm  -= pgsz;
+       mm->locked_vm -= pgsz;
  
-       kfree(buffer);
+       up_write(&mm->mmap_sem);
  }
diff --git a/mm/mmap.c b/mm/mmap.c

index 6b7b1a95944bf267bd21cd981703280fe25a39e9..8101de490c73941ab8815733a3899c614cdb8cb7 100644 (file)
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -28,6 +28,7 @@
  #include <linux/mempolicy.h>
  #include <linux/rmap.h>
  #include <linux/mmu_notifier.h>
+#include <linux/perf_counter.h>
  
  #include <asm/uaccess.h>
  #include <asm/cacheflush.h>
@@ -1219,6 +1220,8 @@ munmap_back:
         if (correct_wcount)
                 atomic_inc(&inode->i_writecount);
  out:
+       perf_counter_mmap(vma);
+
         mm->total_vm += len >> PAGE_SHIFT;
         vm_stat_account(mm, vm_flags, file, len >> PAGE_SHIFT);
         if (vm_flags & VM_LOCKED) {
@@ -2305,6 +2308,8 @@ int install_special_mapping(struct mm_struct *mm,
  
         mm->total_vm += len >> PAGE_SHIFT;
  
+       perf_counter_mmap(vma);
+
         return 0;
  }
  
diff --git a/mm/mprotect.c b/mm/mprotect.c

index 258197b76fb4142d01b81d7871fddbca80510114..d80311baeb2d27f535ebfac2dd2976c60e839708 100644 (file)
--- a/mm/mprotect.c
+++ b/mm/mprotect.c
@@ -23,6 +23,7 @@
  #include <linux/swapops.h>
  #include <linux/mmu_notifier.h>
  #include <linux/migrate.h>
+#include <linux/perf_counter.h>
  #include <asm/uaccess.h>
  #include <asm/pgtable.h>
  #include <asm/cacheflush.h>
@@ -299,6 +300,7 @@ SYSCALL_DEFINE3(mprotect, unsigned long, start, size_t, len,
                 error = mprotect_fixup(vma, &prev, nstart, tmp, newflags);
                 if (error)
                         goto out;
+               perf_counter_mmap(vma);
                 nstart = tmp;
  
                 if (nstart < prev->vm_end)
diff --git a/mm/page_alloc.c b/mm/page_alloc.c

index fe753ecf2aa5fd234dedb6916c333055bcfe8b36..474c7e9dd51ac66f97b027bf5b384a1bacd42da7 100644 (file)
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -149,10 +149,6 @@ static unsigned long __meminitdata dma_reserve;
    static int __meminitdata nr_nodemap_entries;
    static unsigned long __meminitdata arch_zone_lowest_possible_pfn[MAX_NR_ZONES];
    static unsigned long __meminitdata arch_zone_highest_possible_pfn[MAX_NR_ZONES];
-#ifdef CONFIG_MEMORY_HOTPLUG_RESERVE
-  static unsigned long __meminitdata node_boundary_start_pfn[MAX_NUMNODES];
-  static unsigned long __meminitdata node_boundary_end_pfn[MAX_NUMNODES];
-#endif /* CONFIG_MEMORY_HOTPLUG_RESERVE */
    static unsigned long __initdata required_kernelcore;
    static unsigned long __initdata required_movablecore;
    static unsigned long __meminitdata zone_movable_pfn[MAX_NUMNODES];
@@ -3102,64 +3098,6 @@ void __init sparse_memory_present_with_active_regions(int nid)
                                 early_node_map[i].end_pfn);
  }
  
-/**
- * push_node_boundaries - Push node boundaries to at least the requested boundary
- * @nid: The nid of the node to push the boundary for
- * @start_pfn: The start pfn of the node
- * @end_pfn: The end pfn of the node
- *
- * In reserve-based hot-add, mem_map is allocated that is unused until hotadd
- * time. Specifically, on x86_64, SRAT will report ranges that can potentially
- * be hotplugged even though no physical memory exists. This function allows
- * an arch to push out the node boundaries so mem_map is allocated that can
- * be used later.
- */
-#ifdef CONFIG_MEMORY_HOTPLUG_RESERVE
-void __init push_node_boundaries(unsigned int nid,
-               unsigned long start_pfn, unsigned long end_pfn)
-{
-       mminit_dprintk(MMINIT_TRACE, "zoneboundary",
-                       "Entering push_node_boundaries(%u, %lu, %lu)\n",
-                       nid, start_pfn, end_pfn);
-
-       /* Initialise the boundary for this node if necessary */
-       if (node_boundary_end_pfn[nid] == 0)
-               node_boundary_start_pfn[nid] = -1UL;
-
-       /* Update the boundaries */
-       if (node_boundary_start_pfn[nid] > start_pfn)
-               node_boundary_start_pfn[nid] = start_pfn;
-       if (node_boundary_end_pfn[nid] < end_pfn)
-               node_boundary_end_pfn[nid] = end_pfn;
-}
-
-/* If necessary, push the node boundary out for reserve hotadd */
-static void __meminit account_node_boundary(unsigned int nid,
-               unsigned long *start_pfn, unsigned long *end_pfn)
-{
-       mminit_dprintk(MMINIT_TRACE, "zoneboundary",
-                       "Entering account_node_boundary(%u, %lu, %lu)\n",
-                       nid, *start_pfn, *end_pfn);
-
-       /* Return if boundary information has not been provided */
-       if (node_boundary_end_pfn[nid] == 0)
-               return;
-
-       /* Check the boundaries and update if necessary */
-       if (node_boundary_start_pfn[nid] < *start_pfn)
-               *start_pfn = node_boundary_start_pfn[nid];
-       if (node_boundary_end_pfn[nid] > *end_pfn)
-               *end_pfn = node_boundary_end_pfn[nid];
-}
-#else
-void __init push_node_boundaries(unsigned int nid,
-               unsigned long start_pfn, unsigned long end_pfn) {}
-
-static void __meminit account_node_boundary(unsigned int nid,
-               unsigned long *start_pfn, unsigned long *end_pfn) {}
-#endif
-
-
  /**
   * get_pfn_range_for_nid - Return the start and end page frames for a node
   * @nid: The nid to return the range for. If MAX_NUMNODES, the min and max PFN are returned.
@@ -3185,9 +3123,6 @@ void __meminit get_pfn_range_for_nid(unsigned int nid,
  
         if (*start_pfn == -1UL)
                 *start_pfn = 0;
-
-       /* Push the node boundaries out if requested */
-       account_node_boundary(nid, start_pfn, end_pfn);
  }
  
  /*
@@ -3793,10 +3728,6 @@ void __init remove_all_active_ranges(void)
  {
         memset(early_node_map, 0, sizeof(early_node_map));
         nr_nodemap_entries = 0;
-#ifdef CONFIG_MEMORY_HOTPLUG_RESERVE
-       memset(node_boundary_start_pfn, 0, sizeof(node_boundary_start_pfn));
-       memset(node_boundary_end_pfn, 0, sizeof(node_boundary_end_pfn));
-#endif /* CONFIG_MEMORY_HOTPLUG_RESERVE */
  }
  
  /* Compare two active node_active_regions */
diff --git a/mm/percpu.c b/mm/percpu.c

index 1aa5d8fbca121d434f3cb812e10569f1b8e05f82..c0b2c1a76e81c280398b27ed310b1e03500e3a23 100644 (file)
--- a/mm/percpu.c
+++ b/mm/percpu.c
@@ -23,7 +23,7 @@
   * Allocation is done in offset-size areas of single unit space.  Ie,
   * an area of 512 bytes at 6k in c1 occupies 512 bytes at 6k of c1:u0,
   * c1:u1, c1:u2 and c1:u3.  Percpu access can be done by configuring
- * percpu base registers UNIT_SIZE apart.
+ * percpu base registers pcpu_unit_size apart.
   *
   * There are usually many small percpu allocations many of them as
   * small as 4 bytes.  The allocator organizes chunks into lists
@@ -38,8 +38,8 @@
   * region and negative allocated.  Allocation inside a chunk is done
   * by scanning this map sequentially and serving the first matching
   * entry.  This is mostly copied from the percpu_modalloc() allocator.
- * Chunks are also linked into a rb tree to ease address to chunk
- * mapping during free.
+ * Chunks can be determined from the address using the index field
+ * in the page struct. The index field contains a pointer to the chunk.
   *
   * To use this allocator, arch code should do the followings.
   *
@@ -61,7 +61,6 @@
  #include <linux/mutex.h>
  #include <linux/percpu.h>
  #include <linux/pfn.h>
-#include <linux/rbtree.h>
  #include <linux/slab.h>
  #include <linux/spinlock.h>
  #include <linux/vmalloc.h>
@@ -88,7 +87,6 @@
  
  struct pcpu_chunk {
         struct list_head        list;           /* linked to pcpu_slot lists */
-       struct rb_node          rb_node;        /* key is chunk->vm->addr */
         int                     free_size;      /* free bytes in the chunk */
         int                     contig_hint;    /* max contiguous size hint */
         struct vm_struct        *vm;            /* mapped vmalloc region */
@@ -110,9 +108,21 @@ static size_t pcpu_chunk_struct_size __read_mostly;
  void *pcpu_base_addr __read_mostly;
  EXPORT_SYMBOL_GPL(pcpu_base_addr);
  
-/* optional reserved chunk, only accessible for reserved allocations */
+/*
+ * The first chunk which always exists.  Note that unlike other
+ * chunks, this one can be allocated and mapped in several different
+ * ways and thus often doesn't live in the vmalloc area.
+ */
+static struct pcpu_chunk *pcpu_first_chunk;
+
+/*
+ * Optional reserved chunk.  This chunk reserves part of the first
+ * chunk and serves it for reserved allocations.  The amount of
+ * reserved offset is in pcpu_reserved_chunk_limit.  When reserved
+ * area doesn't exist, the following variables contain NULL and 0
+ * respectively.
+ */
  static struct pcpu_chunk *pcpu_reserved_chunk;
-/* offset limit of the reserved chunk */
  static int pcpu_reserved_chunk_limit;
  
  /*
@@ -121,7 +131,7 @@ static int pcpu_reserved_chunk_limit;
   * There are two locks - pcpu_alloc_mutex and pcpu_lock.  The former
   * protects allocation/reclaim paths, chunks and chunk->page arrays.
   * The latter is a spinlock and protects the index data structures -
- * chunk slots, rbtree, chunks and area maps in chunks.
+ * chunk slots, chunks and area maps in chunks.
   *
   * During allocation, pcpu_alloc_mutex is kept locked all the time and
   * pcpu_lock is grabbed and released as necessary.  All actual memory
@@ -140,7 +150,6 @@ static DEFINE_MUTEX(pcpu_alloc_mutex);      /* protects whole alloc and reclaim */
  static DEFINE_SPINLOCK(pcpu_lock);     /* protects index data structures */
  
  static struct list_head *pcpu_slot __read_mostly; /* chunk list slots */
-static struct rb_root pcpu_addr_root = RB_ROOT;        /* chunks by address */
  
  /* reclaim work to release fully free chunks, scheduled from free path */
  static void pcpu_reclaim(struct work_struct *work);
@@ -191,6 +200,18 @@ static bool pcpu_chunk_page_occupied(struct pcpu_chunk *chunk,
         return *pcpu_chunk_pagep(chunk, 0, page_idx) != NULL;
  }
  
+/* set the pointer to a chunk in a page struct */
+static void pcpu_set_page_chunk(struct page *page, struct pcpu_chunk *pcpu)
+{
+       page->index = (unsigned long)pcpu;
+}
+
+/* obtain pointer to a chunk from a page struct */
+static struct pcpu_chunk *pcpu_get_page_chunk(struct page *page)
+{
+       return (struct pcpu_chunk *)page->index;
+}
+
  /**
   * pcpu_mem_alloc - allocate memory
   * @size: bytes to allocate
@@ -257,93 +278,26 @@ static void pcpu_chunk_relocate(struct pcpu_chunk *chunk, int oslot)
         }
  }
  
-static struct rb_node **pcpu_chunk_rb_search(void *addr,
-                                            struct rb_node **parentp)
-{
-       struct rb_node **p = &pcpu_addr_root.rb_node;
-       struct rb_node *parent = NULL;
-       struct pcpu_chunk *chunk;
-
-       while (*p) {
-               parent = *p;
-               chunk = rb_entry(parent, struct pcpu_chunk, rb_node);
-
-               if (addr < chunk->vm->addr)
-                       p = &(*p)->rb_left;
-               else if (addr > chunk->vm->addr)
-                       p = &(*p)->rb_right;
-               else
-                       break;
-       }
-
-       if (parentp)
-               *parentp = parent;
-       return p;
-}
-
  /**
- * pcpu_chunk_addr_search - search for chunk containing specified address
- * @addr: address to search for
- *
- * Look for chunk which might contain @addr.  More specifically, it
- * searchs for the chunk with the highest start address which isn't
- * beyond @addr.
- *
- * CONTEXT:
- * pcpu_lock.
+ * pcpu_chunk_addr_search - determine chunk containing specified address
+ * @addr: address for which the chunk needs to be determined.
   *
   * RETURNS:
   * The address of the found chunk.
   */
  static struct pcpu_chunk *pcpu_chunk_addr_search(void *addr)
  {
-       struct rb_node *n, *parent;
-       struct pcpu_chunk *chunk;
+       void *first_start = pcpu_first_chunk->vm->addr;
  
-       /* is it in the reserved chunk? */
-       if (pcpu_reserved_chunk) {
-               void *start = pcpu_reserved_chunk->vm->addr;
-
-               if (addr >= start && addr < start + pcpu_reserved_chunk_limit)
+       /* is it in the first chunk? */
+       if (addr >= first_start && addr < first_start + pcpu_chunk_size) {
+               /* is it in the reserved area? */
+               if (addr < first_start + pcpu_reserved_chunk_limit)
                         return pcpu_reserved_chunk;
+               return pcpu_first_chunk;
         }
  
-       /* nah... search the regular ones */
-       n = *pcpu_chunk_rb_search(addr, &parent);
-       if (!n) {
-               /* no exactly matching chunk, the parent is the closest */
-               n = parent;
-               BUG_ON(!n);
-       }
-       chunk = rb_entry(n, struct pcpu_chunk, rb_node);
-
-       if (addr < chunk->vm->addr) {
-               /* the parent was the next one, look for the previous one */
-               n = rb_prev(n);
-               BUG_ON(!n);
-               chunk = rb_entry(n, struct pcpu_chunk, rb_node);
-       }
-
-       return chunk;
-}
-
-/**
- * pcpu_chunk_addr_insert - insert chunk into address rb tree
- * @new: chunk to insert
- *
- * Insert @new into address rb tree.
- *
- * CONTEXT:
- * pcpu_lock.
- */
-static void pcpu_chunk_addr_insert(struct pcpu_chunk *new)
-{
-       struct rb_node **p, *parent;
-
-       p = pcpu_chunk_rb_search(new->vm->addr, &parent);
-       BUG_ON(*p);
-       rb_link_node(&new->rb_node, parent, p);
-       rb_insert_color(&new->rb_node, &pcpu_addr_root);
+       return pcpu_get_page_chunk(vmalloc_to_page(addr));
  }
  
  /**
@@ -755,6 +709,7 @@ static int pcpu_populate_chunk(struct pcpu_chunk *chunk, int off, int size)
                                                   alloc_mask, 0);
                         if (!*pagep)
                                 goto err;
+                       pcpu_set_page_chunk(*pagep, chunk);
                 }
         }
  
@@ -879,7 +834,6 @@ restart:
  
         spin_lock_irq(&pcpu_lock);
         pcpu_chunk_relocate(chunk, -1);
-       pcpu_chunk_addr_insert(chunk);
         goto restart;
  
  area_found:
@@ -968,7 +922,6 @@ static void pcpu_reclaim(struct work_struct *work)
                 if (chunk == list_first_entry(head, struct pcpu_chunk, list))
                         continue;
  
-               rb_erase(&chunk->rb_node, &pcpu_addr_root);
                 list_move(&chunk->list, &todo);
         }
  
@@ -1147,7 +1100,8 @@ size_t __init pcpu_setup_first_chunk(pcpu_get_page_fn_t get_page_fn,
  
         if (reserved_size) {
                 schunk->free_size = reserved_size;
-               pcpu_reserved_chunk = schunk;   /* not for dynamic alloc */
+               pcpu_reserved_chunk = schunk;
+               pcpu_reserved_chunk_limit = static_size + reserved_size;
         } else {
                 schunk->free_size = dyn_size;
                 dyn_size = 0;                   /* dynamic area covered */
@@ -1158,8 +1112,6 @@ size_t __init pcpu_setup_first_chunk(pcpu_get_page_fn_t get_page_fn,
         if (schunk->free_size)
                 schunk->map[schunk->map_used++] = schunk->free_size;
  
-       pcpu_reserved_chunk_limit = static_size + schunk->free_size;
-
         /* init dynamic chunk if necessary */
         if (dyn_size) {
                 dchunk = alloc_bootmem(sizeof(struct pcpu_chunk));
@@ -1226,13 +1178,8 @@ size_t __init pcpu_setup_first_chunk(pcpu_get_page_fn_t get_page_fn,
         }
  
         /* link the first chunk in */
-       if (!dchunk) {
-               pcpu_chunk_relocate(schunk, -1);
-               pcpu_chunk_addr_insert(schunk);
-       } else {
-               pcpu_chunk_relocate(dchunk, -1);
-               pcpu_chunk_addr_insert(dchunk);
-       }
+       pcpu_first_chunk = dchunk ?: schunk;
+       pcpu_chunk_relocate(pcpu_first_chunk, -1);
  
         /* we're done */
         pcpu_base_addr = (void *)pcpu_chunk_addr(schunk, 0, 0);
diff --git a/mm/slab.c b/mm/slab.c

index 9a90b00d2f9140e2aaa67ecb949e33d89431e9c5..f85831da9080d22e4abe0b2e2f2405cd067a0d85 100644 (file)
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -102,7 +102,7 @@
  #include       <linux/cpu.h>
  #include       <linux/sysctl.h>
  #include       <linux/module.h>
-#include       <trace/kmemtrace.h>
+#include       <linux/kmemtrace.h>
  #include       <linux/rcupdate.h>
  #include       <linux/string.h>
  #include       <linux/uaccess.h>
diff --git a/mm/slob.c b/mm/slob.c

index f92e66d558bd3608f5c758d7b7c39748cf936f51..9b1737b0787bf53740fd538e697df549e140620b 100644 (file)
--- a/mm/slob.c
+++ b/mm/slob.c
@@ -66,7 +66,7 @@
  #include <linux/module.h>
  #include <linux/rcupdate.h>
  #include <linux/list.h>
-#include <trace/kmemtrace.h>
+#include <linux/kmemtrace.h>
  #include <asm/atomic.h>
  
  /*
diff --git a/mm/slub.c b/mm/slub.c

index 65ffda5934b09b8220e9a00332945dc19ba88de6..5e805a6fe36c46a200bc9d1ded7224bed5fce3ba 100644 (file)
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -17,7 +17,7 @@
  #include <linux/slab.h>
  #include <linux/proc_fs.h>
  #include <linux/seq_file.h>
-#include <trace/kmemtrace.h>
+#include <linux/kmemtrace.h>
  #include <linux/cpu.h>
  #include <linux/cpuset.h>
  #include <linux/mempolicy.h>
diff --git a/mm/util.c b/mm/util.c

index 55bef160b9f1484c032220b506f4ebfaeffe9a47..abc65aa7cdfc7bcfc76817afe2451cce7a4ece3e 100644 (file)
--- a/mm/util.c
+++ b/mm/util.c
@@ -4,9 +4,11 @@
  #include <linux/module.h>
  #include <linux/err.h>
  #include <linux/sched.h>
-#include <linux/tracepoint.h>
  #include <asm/uaccess.h>
  
+#define CREATE_TRACE_POINTS
+#include <trace/events/kmem.h>
+
  /**
   * kstrdup - allocate space for and copy an existing string
   * @s: the string to duplicate
@@ -255,13 +257,6 @@ int __attribute__((weak)) get_user_pages_fast(unsigned long start,
  EXPORT_SYMBOL_GPL(get_user_pages_fast);
  
  /* Tracepoints definitions. */
-DEFINE_TRACE(kmalloc);
-DEFINE_TRACE(kmem_cache_alloc);
-DEFINE_TRACE(kmalloc_node);
-DEFINE_TRACE(kmem_cache_alloc_node);
-DEFINE_TRACE(kfree);
-DEFINE_TRACE(kmem_cache_free);
-
  EXPORT_TRACEPOINT_SYMBOL(kmalloc);
  EXPORT_TRACEPOINT_SYMBOL(kmem_cache_alloc);
  EXPORT_TRACEPOINT_SYMBOL(kmalloc_node);
diff --git a/net/core/drop_monitor.c b/net/core/drop_monitor.c

index 9fd0dc3cca99f5bad5af9fdbc189c528573fcc90..b75b6cea49dab47cbca9098a259ac44e4e51b4c8 100644 (file)
--- a/net/core/drop_monitor.c
+++ b/net/core/drop_monitor.c
@@ -23,7 +23,7 @@
  #include <linux/bitops.h>
  #include <net/genetlink.h>
  
-#include <trace/skb.h>
+#include <trace/events/skb.h>
  
  #include <asm/unaligned.h>
  
diff --git a/net/core/net-traces.c b/net/core/net-traces.c

index c8fb45665e4f4af4cf6ced67fe90fff64f74e3ed..499a67eaf3ae201262c4a2e53f20870721b8f184 100644 (file)
--- a/net/core/net-traces.c
+++ b/net/core/net-traces.c
@@ -19,11 +19,11 @@
  #include <linux/workqueue.h>
  #include <linux/netlink.h>
  #include <linux/net_dropmon.h>
-#include <trace/skb.h>
  
  #include <asm/unaligned.h>
  #include <asm/bitops.h>
  
+#define CREATE_TRACE_POINTS
+#include <trace/events/skb.h>
  
-DEFINE_TRACE(kfree_skb);
  EXPORT_TRACEPOINT_SYMBOL_GPL(kfree_skb);
diff --git a/net/core/skbuff.c b/net/core/skbuff.c

index e505b5392e1e511b278eda64e71914750ffc6577..c2e4fb8f3546c06a39b22b8c8538ecdf7187c3e7 100644 (file)
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -65,7 +65,7 @@
  
  #include <asm/uaccess.h>
  #include <asm/system.h>
-#include <trace/skb.h>
+#include <trace/events/skb.h>
  
  #include "kmap_skb.h"
  
diff --git a/samples/Kconfig b/samples/Kconfig

index 4b02f5a0e6560f4c886d413409d5507ffe6b0356..b75d28cba3f75fef74d90d01b2fb8990984a1c55 100644 (file)
--- a/samples/Kconfig
+++ b/samples/Kconfig
@@ -19,6 +19,12 @@ config SAMPLE_TRACEPOINTS
         help
           This build tracepoints example modules.
  
+config SAMPLE_TRACE_EVENTS
+       tristate "Build trace_events examples -- loadable modules only"
+       depends on EVENT_TRACING && m
+       help
+         This build trace event example modules.
+
  config SAMPLE_KOBJECT
         tristate "Build kobject examples"
         help
diff --git a/samples/Makefile b/samples/Makefile

index 10eaca89fe17913875f90cf0354a9478978db018..13e4b470b5399b41a8f140b935bade02511c688e 100644 (file)
--- a/samples/Makefile
+++ b/samples/Makefile
@@ -1,3 +1,3 @@
  # Makefile for Linux samples code
  
-obj-$(CONFIG_SAMPLES)  += markers/ kobject/ kprobes/ tracepoints/
+obj-$(CONFIG_SAMPLES)  += markers/ kobject/ kprobes/ tracepoints/ trace_events/
diff --git a/samples/trace_events/Makefile b/samples/trace_events/Makefile

new file mode 100644 (file)

index 0000000..0d428dc
--- /dev/null
+++ b/samples/trace_events/Makefile
@@ -0,0 +1,6 @@
+# builds the trace events example kernel modules;
+# then to use one (as root):  insmod <module_name.ko>
+
+CFLAGS_trace-events-sample.o := -I$(src)
+
+obj-$(CONFIG_SAMPLE_TRACE_EVENTS) += trace-events-sample.o
diff --git a/samples/trace_events/trace-events-sample.c b/samples/trace_events/trace-events-sample.c

new file mode 100644 (file)

index 0000000..aabc4e9
--- /dev/null
+++ b/samples/trace_events/trace-events-sample.c
@@ -0,0 +1,52 @@
+#include <linux/module.h>
+#include <linux/kthread.h>
+
+/*
+ * Any file that uses trace points, must include the header.
+ * But only one file, must include the header by defining
+ * CREATE_TRACE_POINTS first.  This will make the C code that
+ * creates the handles for the trace points.
+ */
+#define CREATE_TRACE_POINTS
+#include "trace-events-sample.h"
+
+
+static void simple_thread_func(int cnt)
+{
+       set_current_state(TASK_INTERRUPTIBLE);
+       schedule_timeout(HZ);
+       trace_foo_bar("hello", cnt);
+}
+
+static int simple_thread(void *arg)
+{
+       int cnt = 0;
+
+       while (!kthread_should_stop())
+               simple_thread_func(cnt++);
+
+       return 0;
+}
+
+static struct task_struct *simple_tsk;
+
+static int __init trace_event_init(void)
+{
+       simple_tsk = kthread_run(simple_thread, NULL, "event-sample");
+       if (IS_ERR(simple_tsk))
+               return -1;
+
+       return 0;
+}
+
+static void __exit trace_event_exit(void)
+{
+       kthread_stop(simple_tsk);
+}
+
+module_init(trace_event_init);
+module_exit(trace_event_exit);
+
+MODULE_AUTHOR("Steven Rostedt");
+MODULE_DESCRIPTION("trace-events-sample");
+MODULE_LICENSE("GPL");
diff --git a/samples/trace_events/trace-events-sample.h b/samples/trace_events/trace-events-sample.h

new file mode 100644 (file)

index 0000000..128a897
--- /dev/null
+++ b/samples/trace_events/trace-events-sample.h
@@ -0,0 +1,129 @@
+/*
+ * Notice that this file is not protected like a normal header.
+ * We also must allow for rereading of this file. The
+ *
+ *  || defined(TRACE_HEADER_MULTI_READ)
+ *
+ * serves this purpose.
+ */
+#if !defined(_TRACE_EVENT_SAMPLE_H) || defined(TRACE_HEADER_MULTI_READ)
+#define _TRACE_EVENT_SAMPLE_H
+
+/*
+ * All trace headers should include tracepoint.h, until we finally
+ * make it into a standard header.
+ */
+#include <linux/tracepoint.h>
+
+/*
+ * If TRACE_SYSTEM is defined, that will be the directory created
+ * in the ftrace directory under /debugfs/tracing/events/<system>
+ *
+ * The define_trace.h belowe will also look for a file name of
+ * TRACE_SYSTEM.h where TRACE_SYSTEM is what is defined here.
+ *
+ * If you want a different system than file name, you can override
+ * the header name by defining TRACE_INCLUDE_FILE
+ *
+ * If this file was called, goofy.h, then we would define:
+ *
+ * #define TRACE_INCLUDE_FILE goofy
+ *
+ */
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM sample
+
+/*
+ * The TRACE_EVENT macro is broken up into 5 parts.
+ *
+ * name: name of the trace point. This is also how to enable the tracepoint.
+ *   A function called trace_foo_bar() will be created.
+ *
+ * proto: the prototype of the function trace_foo_bar()
+ *   Here it is trace_foo_bar(char *foo, int bar).
+ *
+ * args:  must match the arguments in the prototype.
+ *    Here it is simply "foo, bar".
+ *
+ * struct:  This defines the way the data will be stored in the ring buffer.
+ *    There are currently two types of elements. __field and __array.
+ *    a __field is broken up into (type, name). Where type can be any
+ *    type but an array.
+ *    For an array. there are three fields. (type, name, size). The
+ *    type of elements in the array, the name of the field and the size
+ *    of the array.
+ *
+ *    __array( char, foo, 10) is the same as saying   char foo[10].
+ *
+ * fast_assign: This is a C like function that is used to store the items
+ *    into the ring buffer.
+ *
+ * printk: This is a way to print out the data in pretty print. This is
+ *    useful if the system crashes and you are logging via a serial line,
+ *    the data can be printed to the console using this "printk" method.
+ *
+ * Note, that for both the assign and the printk, __entry is the handler
+ * to the data structure in the ring buffer, and is defined by the
+ * TP_STRUCT__entry.
+ */
+TRACE_EVENT(foo_bar,
+
+       TP_PROTO(char *foo, int bar),
+
+       TP_ARGS(foo, bar),
+
+       TP_STRUCT__entry(
+               __array(        char,   foo,    10              )
+               __field(        int,    bar                     )
+       ),
+
+       TP_fast_assign(
+               strncpy(__entry->foo, foo, 10);
+               __entry->bar    = bar;
+       ),
+
+       TP_printk("foo %s %d", __entry->foo, __entry->bar)
+);
+#endif
+
+/***** NOTICE! The #if protection ends here. *****/
+
+
+/*
+ * There are several ways I could have done this. If I left out the
+ * TRACE_INCLUDE_PATH, then it would default to the kernel source
+ * include/trace/events directory.
+ *
+ * I could specify a path from the define_trace.h file back to this
+ * file.
+ *
+ * #define TRACE_INCLUDE_PATH ../../samples/trace_events
+ *
+ * But I chose to simply make it use the current directory and then in
+ * the Makefile I added:
+ *
+ * CFLAGS_trace-events-sample.o := -I$(PWD)/samples/trace_events/
+ *
+ * This will make sure the current path is part of the include
+ * structure for our file so that we can find it.
+ *
+ * I could have made only the top level directory the include:
+ *
+ * CFLAGS_trace-events-sample.o := -I$(PWD)
+ *
+ * And then let the path to this directory be the TRACE_INCLUDE_PATH:
+ *
+ * #define TRACE_INCLUDE_PATH samples/trace_events
+ *
+ * But then if something defines "samples" or "trace_events" then we
+ * could risk that being converted too, and give us an unexpected
+ * result.
+ */
+#undef TRACE_INCLUDE_PATH
+#undef TRACE_INCLUDE_FILE
+#define TRACE_INCLUDE_PATH .
+/*
+ * TRACE_INCLUDE_FILE is not needed if the filename and TRACE_SYSTEM are equal
+ */
+#define TRACE_INCLUDE_FILE trace-events-sample
+#include <trace/define_trace.h>
diff --git a/scripts/Makefile.lib b/scripts/Makefile.lib

index cba61ca403cacb644bf07fed6e11f340f90c8f50..2b706617c89a806c69b83a112f046f16046904b4 100644 (file)
--- a/scripts/Makefile.lib
+++ b/scripts/Makefile.lib
@@ -188,20 +188,34 @@ cmd_objcopy = $(OBJCOPY) $(OBJCOPYFLAGS) $(OBJCOPYFLAGS_$(@F)) $< $@
  # ---------------------------------------------------------------------------
  
  quiet_cmd_gzip = GZIP    $@
-cmd_gzip = gzip -f -9 < $< > $@
+cmd_gzip = (cat $(filter-out FORCE,$^) | gzip -f -9 > $@) || \
+       (rm -f $@ ; false)
  
  
  # Bzip2
  # ---------------------------------------------------------------------------
  
-# Bzip2 does not include size in file... so we have to fake that
-size_append=$(CONFIG_SHELL) $(srctree)/scripts/bin_size
-
-quiet_cmd_bzip2 = BZIP2    $@
-cmd_bzip2 = (bzip2 -9 < $< && $(size_append) $<) > $@ || (rm -f $@ ; false)
+# Bzip2 and LZMA do not include size in file... so we have to fake that;
+# append the size as a 32-bit littleendian number as gzip does.
+size_append = echo -ne $(shell                                         \
+dec_size=0;                                                            \
+for F in $1; do                                                                \
+       fsize=$$(stat -c "%s" $$F);                                     \
+       dec_size=$$(expr $$dec_size + $$fsize);                         \
+done;                                                                  \
+printf "%08x" $$dec_size |                                             \
+       sed 's/\(..\)\(..\)\(..\)\(..\)/\\\\x\4\\\\x\3\\\\x\2\\\\x\1/g' \
+)
+
+quiet_cmd_bzip2 = BZIP2   $@
+cmd_bzip2 = (cat $(filter-out FORCE,$^) | \
+       bzip2 -9 && $(call size_append, $(filter-out FORCE,$^))) > $@ || \
+       (rm -f $@ ; false)
  
  # Lzma
  # ---------------------------------------------------------------------------
  
  quiet_cmd_lzma = LZMA    $@
-cmd_lzma = (lzma -9 -c $< && $(size_append) $<) >$@ || (rm -f $@ ; false)
+cmd_lzma = (cat $(filter-out FORCE,$^) | \
+       lzma -9 && $(call size_append, $(filter-out FORCE,$^))) > $@ || \
+       (rm -f $@ ; false)
diff --git a/scripts/bin_size b/scripts/bin_size

deleted file mode 100644 (file)

index 43e1b36..0000000
--- a/scripts/bin_size
+++ /dev/null
@@ -1,10 +0,0 @@
-#!/bin/sh
-
-if [ $# = 0 ] ; then
-   echo Usage: $0 file
-fi
-
-size_dec=`stat -c "%s" $1`
-size_hex_echo_string=`printf "%08x" $size_dec |
-     sed 's/\(..\)\(..\)\(..\)\(..\)/\\\\x\4\\\\x\3\\\\x\2\\\\x\1/g'`
-/bin/echo -ne $size_hex_echo_string
diff --git a/scripts/kernel-doc b/scripts/kernel-doc

index 3208a3a7e7fe5e3f9ad07a20a5bb9827d099d5ae..acd8c4a8e3e0b00f24eaf3331b70156becbecf60 100755 (executable)
--- a/scripts/kernel-doc
+++ b/scripts/kernel-doc
@@ -1828,6 +1828,25 @@ sub reset_state {
      $state = 0;
  }
  
+sub tracepoint_munge($) {
+       my $file = shift;
+       my $tracepointname = 0;
+       my $tracepointargs = 0;
+
+       if($prototype =~ m/TRACE_EVENT\((.*?),/) {
+               $tracepointname = $1;
+       }
+       if($prototype =~ m/TP_PROTO\((.*?)\)/) {
+               $tracepointargs = $1;
+       }
+       if (($tracepointname eq 0) || ($tracepointargs eq 0)) {
+               print STDERR "Warning(${file}:$.): Unrecognized tracepoint format: \n".
+                            "$prototype\n";
+       } else {
+               $prototype = "static inline void trace_$tracepointname($tracepointargs)";
+       }
+}
+
  sub syscall_munge() {
         my $void = 0;
  
@@ -1882,6 +1901,9 @@ sub process_state3_function($$) {
         if ($prototype =~ /SYSCALL_DEFINE/) {
                 syscall_munge();
         }
+       if ($prototype =~ /TRACE_EVENT/) {
+               tracepoint_munge($file);
+       }
         dump_function($prototype, $file);
         reset_state();
      }
diff --git a/scripts/recordmcount.pl b/scripts/recordmcount.pl

index 409596eca124f7ada601417a59050eb12f55210a..0fae7da0529cac497a5337be4b6dd0e89cbf13d6 100755 (executable)
--- a/scripts/recordmcount.pl
+++ b/scripts/recordmcount.pl
@@ -26,7 +26,7 @@
  # which will also be the location of that section after final link.
  # e.g.
  #
-#  .section ".text.sched"
+#  .section ".sched.text", "ax"
  #  .globl my_func
  #  my_func:
  #        [...]
@@ -39,7 +39,7 @@
  #        [...]
  #
  # Both relocation offsets for the mcounts in the above example will be
-# offset from .text.sched. If we make another file called tmp.s with:
+# offset from .sched.text. If we make another file called tmp.s with:
  #
  #  .section __mcount_loc
  #  .quad  my_func + 0x5
@@ -51,7 +51,7 @@
  # But this gets hard if my_func is not globl (a static function).
  # In such a case we have:
  #
-#  .section ".text.sched"
+#  .section ".sched.text", "ax"
  #  my_func:
  #        [...]
  #        call mcount  (offset: 0x5)
diff --git a/security/integrity/ima/ima_fs.c b/security/integrity/ima/ima_fs.c

index ffbe259700b10b54852cc5d4b29ae91a754230e4..510186f0b72e4fa3070bb473c288d6ae8ed4b53e 100644 (file)
--- a/security/integrity/ima/ima_fs.c
+++ b/security/integrity/ima/ima_fs.c
@@ -84,8 +84,8 @@ static void *ima_measurements_next(struct seq_file *m, void *v, loff_t *pos)
          * against concurrent list-extension
          */
         rcu_read_lock();
-       qe = list_entry(rcu_dereference(qe->later.next),
-                       struct ima_queue_entry, later);
+       qe = list_entry_rcu(qe->later.next,
+                           struct ima_queue_entry, later);
         rcu_read_unlock();
         (*pos)++;
  
diff --git a/security/smack/smackfs.c b/security/smack/smackfs.c

index e03a7e19c73b3a0b49d7533db45ecb719d250f30..11d2cb19d7a6b39570d1df03c06a349a3b296525 100644 (file)
--- a/security/smack/smackfs.c
+++ b/security/smack/smackfs.c
@@ -734,8 +734,8 @@ static void smk_netlbladdr_insert(struct smk_netlbladdr *new)
                 return;
         }
  
-       m = list_entry(rcu_dereference(smk_netlbladdr_list.next),
-                        struct smk_netlbladdr, list);
+       m = list_entry_rcu(smk_netlbladdr_list.next,
+                          struct smk_netlbladdr, list);
  
         /* the comparison '>' is a bit hacky, but works */
         if (new->smk_mask.s_addr > m->smk_mask.s_addr) {
@@ -748,8 +748,8 @@ static void smk_netlbladdr_insert(struct smk_netlbladdr *new)
                         list_add_rcu(&new->list, &m->list);
                         return;
                 }
-               m_next = list_entry(rcu_dereference(m->list.next),
-                                struct smk_netlbladdr, list);
+               m_next = list_entry_rcu(m->list.next,
+                                       struct smk_netlbladdr, list);
                 if (new->smk_mask.s_addr > m_next->smk_mask.s_addr) {
                         list_add_rcu(&new->list, &m->list);
                         return;
diff --git a/tools/perf/.gitignore b/tools/perf/.gitignore

new file mode 100644 (file)

index 0000000..d69a759
--- /dev/null
+++ b/tools/perf/.gitignore
@@ -0,0 +1,16 @@
+PERF-BUILD-OPTIONS
+PERF-CFLAGS
+PERF-GUI-VARS
+PERF-VERSION-FILE
+perf
+perf-help
+perf-record
+perf-report
+perf-stat
+perf-top
+perf*.1
+perf*.xml
+common-cmds.h
+tags
+TAGS
+cscope*
diff --git a/tools/perf/Documentation/Makefile b/tools/perf/Documentation/Makefile

new file mode 100644 (file)

index 0000000..5457192
--- /dev/null
+++ b/tools/perf/Documentation/Makefile
@@ -0,0 +1,300 @@
+MAN1_TXT= \
+       $(filter-out $(addsuffix .txt, $(ARTICLES) $(SP_ARTICLES)), \
+               $(wildcard perf-*.txt)) \
+       perf.txt
+MAN5_TXT=
+MAN7_TXT=
+
+MAN_TXT = $(MAN1_TXT) $(MAN5_TXT) $(MAN7_TXT)
+MAN_XML=$(patsubst %.txt,%.xml,$(MAN_TXT))
+MAN_HTML=$(patsubst %.txt,%.html,$(MAN_TXT))
+
+DOC_HTML=$(MAN_HTML)
+
+ARTICLES =
+# with their own formatting rules.
+SP_ARTICLES =
+API_DOCS = $(patsubst %.txt,%,$(filter-out technical/api-index-skel.txt technical/api-index.txt, $(wildcard technical/api-*.txt)))
+SP_ARTICLES += $(API_DOCS)
+SP_ARTICLES += technical/api-index
+
+DOC_HTML += $(patsubst %,%.html,$(ARTICLES) $(SP_ARTICLES))
+
+DOC_MAN1=$(patsubst %.txt,%.1,$(MAN1_TXT))
+DOC_MAN5=$(patsubst %.txt,%.5,$(MAN5_TXT))
+DOC_MAN7=$(patsubst %.txt,%.7,$(MAN7_TXT))
+
+prefix?=$(HOME)
+bindir?=$(prefix)/bin
+htmldir?=$(prefix)/share/doc/perf-doc
+pdfdir?=$(prefix)/share/doc/perf-doc
+mandir?=$(prefix)/share/man
+man1dir=$(mandir)/man1
+man5dir=$(mandir)/man5
+man7dir=$(mandir)/man7
+# DESTDIR=
+
+ASCIIDOC=asciidoc
+ASCIIDOC_EXTRA =
+MANPAGE_XSL = manpage-normal.xsl
+XMLTO_EXTRA =
+INSTALL?=install
+RM ?= rm -f
+DOC_REF = origin/man
+HTML_REF = origin/html
+
+infodir?=$(prefix)/share/info
+MAKEINFO=makeinfo
+INSTALL_INFO=install-info
+DOCBOOK2X_TEXI=docbook2x-texi
+DBLATEX=dblatex
+ifndef PERL_PATH
+       PERL_PATH = /usr/bin/perl
+endif
+
+-include ../config.mak.autogen
+-include ../config.mak
+
+#
+# For asciidoc ...
+#      -7.1.2, no extra settings are needed.
+#      8.0-,   set ASCIIDOC8.
+#
+
+#
+# For docbook-xsl ...
+#      -1.68.1,        set ASCIIDOC_NO_ROFF? (based on changelog from 1.73.0)
+#      1.69.0,         no extra settings are needed?
+#      1.69.1-1.71.0,  set DOCBOOK_SUPPRESS_SP?
+#      1.71.1,         no extra settings are needed?
+#      1.72.0,         set DOCBOOK_XSL_172.
+#      1.73.0-,        set ASCIIDOC_NO_ROFF
+#
+
+#
+# If you had been using DOCBOOK_XSL_172 in an attempt to get rid
+# of 'the ".ft C" problem' in your generated manpages, and you
+# instead ended up with weird characters around callouts, try
+# using ASCIIDOC_NO_ROFF instead (it works fine with ASCIIDOC8).
+#
+
+ifdef ASCIIDOC8
+ASCIIDOC_EXTRA += -a asciidoc7compatible
+endif
+ifdef DOCBOOK_XSL_172
+ASCIIDOC_EXTRA += -a perf-asciidoc-no-roff
+MANPAGE_XSL = manpage-1.72.xsl
+else
+       ifdef ASCIIDOC_NO_ROFF
+       # docbook-xsl after 1.72 needs the regular XSL, but will not
+       # pass-thru raw roff codes from asciidoc.conf, so turn them off.
+       ASCIIDOC_EXTRA += -a perf-asciidoc-no-roff
+       endif
+endif
+ifdef MAN_BOLD_LITERAL
+XMLTO_EXTRA += -m manpage-bold-literal.xsl
+endif
+ifdef DOCBOOK_SUPPRESS_SP
+XMLTO_EXTRA += -m manpage-suppress-sp.xsl
+endif
+
+SHELL_PATH ?= $(SHELL)
+# Shell quote;
+SHELL_PATH_SQ = $(subst ','\'',$(SHELL_PATH))
+
+#
+# Please note that there is a minor bug in asciidoc.
+# The version after 6.0.3 _will_ include the patch found here:
+#   http://marc.theaimsgroup.com/?l=perf&m=111558757202243&w=2
+#
+# Until that version is released you may have to apply the patch
+# yourself - yes, all 6 characters of it!
+#
+
+QUIET_SUBDIR0  = +$(MAKE) -C # space to separate -C and subdir
+QUIET_SUBDIR1  =
+
+ifneq ($(findstring $(MAKEFLAGS),w),w)
+PRINT_DIR = --no-print-directory
+else # "make -w"
+NO_SUBDIR = :
+endif
+
+ifneq ($(findstring $(MAKEFLAGS),s),s)
+ifndef V
+       QUIET_ASCIIDOC  = @echo '   ' ASCIIDOC $@;
+       QUIET_XMLTO     = @echo '   ' XMLTO $@;
+       QUIET_DB2TEXI   = @echo '   ' DB2TEXI $@;
+       QUIET_MAKEINFO  = @echo '   ' MAKEINFO $@;
+       QUIET_DBLATEX   = @echo '   ' DBLATEX $@;
+       QUIET_XSLTPROC  = @echo '   ' XSLTPROC $@;
+       QUIET_GEN       = @echo '   ' GEN $@;
+       QUIET_STDERR    = 2> /dev/null
+       QUIET_SUBDIR0   = +@subdir=
+       QUIET_SUBDIR1   = ;$(NO_SUBDIR) echo '   ' SUBDIR $$subdir; \
+                         $(MAKE) $(PRINT_DIR) -C $$subdir
+       export V
+endif
+endif
+
+all: html man
+
+html: $(DOC_HTML)
+
+$(DOC_HTML) $(DOC_MAN1) $(DOC_MAN5) $(DOC_MAN7): asciidoc.conf
+
+man: man1 man5 man7
+man1: $(DOC_MAN1)
+man5: $(DOC_MAN5)
+man7: $(DOC_MAN7)
+
+info: perf.info perfman.info
+
+pdf: user-manual.pdf
+
+install: install-man
+
+install-man: man
+       $(INSTALL) -d -m 755 $(DESTDIR)$(man1dir)
+#      $(INSTALL) -d -m 755 $(DESTDIR)$(man5dir)
+#      $(INSTALL) -d -m 755 $(DESTDIR)$(man7dir)
+       $(INSTALL) -m 644 $(DOC_MAN1) $(DESTDIR)$(man1dir)
+#      $(INSTALL) -m 644 $(DOC_MAN5) $(DESTDIR)$(man5dir)
+#      $(INSTALL) -m 644 $(DOC_MAN7) $(DESTDIR)$(man7dir)
+
+install-info: info
+       $(INSTALL) -d -m 755 $(DESTDIR)$(infodir)
+       $(INSTALL) -m 644 perf.info perfman.info $(DESTDIR)$(infodir)
+       if test -r $(DESTDIR)$(infodir)/dir; then \
+         $(INSTALL_INFO) --info-dir=$(DESTDIR)$(infodir) perf.info ;\
+         $(INSTALL_INFO) --info-dir=$(DESTDIR)$(infodir) perfman.info ;\
+       else \
+         echo "No directory found in $(DESTDIR)$(infodir)" >&2 ; \
+       fi
+
+install-pdf: pdf
+       $(INSTALL) -d -m 755 $(DESTDIR)$(pdfdir)
+       $(INSTALL) -m 644 user-manual.pdf $(DESTDIR)$(pdfdir)
+
+install-html: html
+       '$(SHELL_PATH_SQ)' ./install-webdoc.sh $(DESTDIR)$(htmldir)
+
+../PERF-VERSION-FILE: .FORCE-PERF-VERSION-FILE
+       $(QUIET_SUBDIR0)../ $(QUIET_SUBDIR1) PERF-VERSION-FILE
+
+-include ../PERF-VERSION-FILE
+
+#
+# Determine "include::" file references in asciidoc files.
+#
+doc.dep : $(wildcard *.txt) build-docdep.perl
+       $(QUIET_GEN)$(RM) $@+ $@ && \
+       $(PERL_PATH) ./build-docdep.perl >$@+ $(QUIET_STDERR) && \
+       mv $@+ $@
+
+-include doc.dep
+
+cmds_txt = cmds-ancillaryinterrogators.txt \
+       cmds-ancillarymanipulators.txt \
+       cmds-mainporcelain.txt \
+       cmds-plumbinginterrogators.txt \
+       cmds-plumbingmanipulators.txt \
+       cmds-synchingrepositories.txt \
+       cmds-synchelpers.txt \
+       cmds-purehelpers.txt \
+       cmds-foreignscminterface.txt
+
+$(cmds_txt): cmd-list.made
+
+cmd-list.made: cmd-list.perl ../command-list.txt $(MAN1_TXT)
+       $(QUIET_GEN)$(RM) $@ && \
+       $(PERL_PATH) ./cmd-list.perl ../command-list.txt $(QUIET_STDERR) && \
+       date >$@
+
+clean:
+       $(RM) *.xml *.xml+ *.html *.html+ *.1 *.5 *.7
+       $(RM) *.texi *.texi+ *.texi++ perf.info perfman.info
+       $(RM) howto-index.txt howto/*.html doc.dep
+       $(RM) technical/api-*.html technical/api-index.txt
+       $(RM) $(cmds_txt) *.made
+
+$(MAN_HTML): %.html : %.txt
+       $(QUIET_ASCIIDOC)$(RM) $@+ $@ && \
+       $(ASCIIDOC) -b xhtml11 -d manpage -f asciidoc.conf \
+               $(ASCIIDOC_EXTRA) -aperf_version=$(PERF_VERSION) -o $@+ $< && \
+       mv $@+ $@
+
+%.1 %.5 %.7 : %.xml
+       $(QUIET_XMLTO)$(RM) $@ && \
+       xmlto -m $(MANPAGE_XSL) $(XMLTO_EXTRA) man $<
+
+%.xml : %.txt
+       $(QUIET_ASCIIDOC)$(RM) $@+ $@ && \
+       $(ASCIIDOC) -b docbook -d manpage -f asciidoc.conf \
+               $(ASCIIDOC_EXTRA) -aperf_version=$(PERF_VERSION) -o $@+ $< && \
+       mv $@+ $@
+
+XSLT = docbook.xsl
+XSLTOPTS = --xinclude --stringparam html.stylesheet docbook-xsl.css
+
+user-manual.html: user-manual.xml
+       $(QUIET_XSLTPROC)xsltproc $(XSLTOPTS) -o $@ $(XSLT) $<
+
+perf.info: user-manual.texi
+       $(QUIET_MAKEINFO)$(MAKEINFO) --no-split -o $@ user-manual.texi
+
+user-manual.texi: user-manual.xml
+       $(QUIET_DB2TEXI)$(RM) $@+ $@ && \
+       $(DOCBOOK2X_TEXI) user-manual.xml --encoding=UTF-8 --to-stdout >$@++ && \
+       $(PERL_PATH) fix-texi.perl <$@++ >$@+ && \
+       rm $@++ && \
+       mv $@+ $@
+
+user-manual.pdf: user-manual.xml
+       $(QUIET_DBLATEX)$(RM) $@+ $@ && \
+       $(DBLATEX) -o $@+ -p /etc/asciidoc/dblatex/asciidoc-dblatex.xsl -s /etc/asciidoc/dblatex/asciidoc-dblatex.sty $< && \
+       mv $@+ $@
+
+perfman.texi: $(MAN_XML) cat-texi.perl
+       $(QUIET_DB2TEXI)$(RM) $@+ $@ && \
+       ($(foreach xml,$(MAN_XML),$(DOCBOOK2X_TEXI) --encoding=UTF-8 \
+               --to-stdout $(xml) &&) true) > $@++ && \
+       $(PERL_PATH) cat-texi.perl $@ <$@++ >$@+ && \
+       rm $@++ && \
+       mv $@+ $@
+
+perfman.info: perfman.texi
+       $(QUIET_MAKEINFO)$(MAKEINFO) --no-split --no-validate $*.texi
+
+$(patsubst %.txt,%.texi,$(MAN_TXT)): %.texi : %.xml
+       $(QUIET_DB2TEXI)$(RM) $@+ $@ && \
+       $(DOCBOOK2X_TEXI) --to-stdout $*.xml >$@+ && \
+       mv $@+ $@
+
+howto-index.txt: howto-index.sh $(wildcard howto/*.txt)
+       $(QUIET_GEN)$(RM) $@+ $@ && \
+       '$(SHELL_PATH_SQ)' ./howto-index.sh $(wildcard howto/*.txt) >$@+ && \
+       mv $@+ $@
+
+$(patsubst %,%.html,$(ARTICLES)) : %.html : %.txt
+       $(QUIET_ASCIIDOC)$(ASCIIDOC) -b xhtml11 $*.txt
+
+WEBDOC_DEST = /pub/software/tools/perf/docs
+
+$(patsubst %.txt,%.html,$(wildcard howto/*.txt)): %.html : %.txt
+       $(QUIET_ASCIIDOC)$(RM) $@+ $@ && \
+       sed -e '1,/^$$/d' $< | $(ASCIIDOC) -b xhtml11 - >$@+ && \
+       mv $@+ $@
+
+install-webdoc : html
+       '$(SHELL_PATH_SQ)' ./install-webdoc.sh $(WEBDOC_DEST)
+
+quick-install: quick-install-man
+
+quick-install-man:
+       '$(SHELL_PATH_SQ)' ./install-doc-quick.sh $(DOC_REF) $(DESTDIR)$(mandir)
+
+quick-install-html:
+       '$(SHELL_PATH_SQ)' ./install-doc-quick.sh $(HTML_REF) $(DESTDIR)$(htmldir)
+
+.PHONY: .FORCE-PERF-VERSION-FILE
diff --git a/tools/perf/Documentation/asciidoc.conf b/tools/perf/Documentation/asciidoc.conf

new file mode 100644 (file)

index 0000000..356b23a
--- /dev/null
+++ b/tools/perf/Documentation/asciidoc.conf
@@ -0,0 +1,91 @@
+## linkperf: macro
+#
+# Usage: linkperf:command[manpage-section]
+#
+# Note, {0} is the manpage section, while {target} is the command.
+#
+# Show PERF link as: <command>(<section>); if section is defined, else just show
+# the command.
+
+[macros]
+(?su)[\\]?(?P<name>linkperf):(?P<target>\S*?)\[(?P<attrlist>.*?)\]=
+
+[attributes]
+asterisk=&#42;
+plus=&#43;
+caret=&#94;
+startsb=&#91;
+endsb=&#93;
+tilde=&#126;
+
+ifdef::backend-docbook[]
+[linkperf-inlinemacro]
+{0%{target}}
+{0#<citerefentry>}
+{0#<refentrytitle>{target}</refentrytitle><manvolnum>{0}</manvolnum>}
+{0#</citerefentry>}
+endif::backend-docbook[]
+
+ifdef::backend-docbook[]
+ifndef::perf-asciidoc-no-roff[]
+# "unbreak" docbook-xsl v1.68 for manpages. v1.69 works with or without this.
+# v1.72 breaks with this because it replaces dots not in roff requests.
+[listingblock]
+<example><title>{title}</title>
+<literallayout>
+ifdef::doctype-manpage[]
+&#10;.ft C&#10;
+endif::doctype-manpage[]
+|
+ifdef::doctype-manpage[]
+&#10;.ft&#10;
+endif::doctype-manpage[]
+</literallayout>
+{title#}</example>
+endif::perf-asciidoc-no-roff[]
+
+ifdef::perf-asciidoc-no-roff[]
+ifdef::doctype-manpage[]
+# The following two small workarounds insert a simple paragraph after screen
+[listingblock]
+<example><title>{title}</title>
+<literallayout>
+|
+</literallayout><simpara></simpara>
+{title#}</example>
+
+[verseblock]
+<formalpara{id? id="{id}"}><title>{title}</title><para>
+{title%}<literallayout{id? id="{id}"}>
+{title#}<literallayout>
+|
+</literallayout>
+{title#}</para></formalpara>
+{title%}<simpara></simpara>
+endif::doctype-manpage[]
+endif::perf-asciidoc-no-roff[]
+endif::backend-docbook[]
+
+ifdef::doctype-manpage[]
+ifdef::backend-docbook[]
+[header]
+template::[header-declarations]
+<refentry>
+<refmeta>
+<refentrytitle>{mantitle}</refentrytitle>
+<manvolnum>{manvolnum}</manvolnum>
+<refmiscinfo class="source">perf</refmiscinfo>
+<refmiscinfo class="version">{perf_version}</refmiscinfo>
+<refmiscinfo class="manual">perf Manual</refmiscinfo>
+</refmeta>
+<refnamediv>
+  <refname>{manname}</refname>
+  <refpurpose>{manpurpose}</refpurpose>
+</refnamediv>
+endif::backend-docbook[]
+endif::doctype-manpage[]
+
+ifdef::backend-xhtml11[]
+[linkperf-inlinemacro]
+<a href="{target}.html">{target}{0?({0})}</a>
+endif::backend-xhtml11[]
diff --git a/tools/perf/Documentation/manpage-1.72.xsl b/tools/perf/Documentation/manpage-1.72.xsl

new file mode 100644 (file)

index 0000000..b4d315c
--- /dev/null
+++ b/tools/perf/Documentation/manpage-1.72.xsl
@@ -0,0 +1,14 @@
+<!-- manpage-1.72.xsl:
+     special settings for manpages rendered from asciidoc+docbook
+     handles peculiarities in docbook-xsl 1.72.0 -->
+<xsl:stylesheet xmlns:xsl="http://www.w3.org/1999/XSL/Transform"
+               version="1.0">
+
+<xsl:import href="manpage-base.xsl"/>
+
+<!-- these are the special values for the roff control characters
+     needed for docbook-xsl 1.72.0 -->
+<xsl:param name="git.docbook.backslash">&#x2593;</xsl:param>
+<xsl:param name="git.docbook.dot"      >&#x2302;</xsl:param>
+
+</xsl:stylesheet>
diff --git a/tools/perf/Documentation/manpage-base.xsl b/tools/perf/Documentation/manpage-base.xsl

new file mode 100644 (file)

index 0000000..a264fa6
--- /dev/null
+++ b/tools/perf/Documentation/manpage-base.xsl
@@ -0,0 +1,35 @@
+<!-- manpage-base.xsl:
+     special formatting for manpages rendered from asciidoc+docbook -->
+<xsl:stylesheet xmlns:xsl="http://www.w3.org/1999/XSL/Transform"
+               version="1.0">
+
+<!-- these params silence some output from xmlto -->
+<xsl:param name="man.output.quietly" select="1"/>
+<xsl:param name="refentry.meta.get.quietly" select="1"/>
+
+<!-- convert asciidoc callouts to man page format;
+     git.docbook.backslash and git.docbook.dot params
+     must be supplied by another XSL file or other means -->
+<xsl:template match="co">
+       <xsl:value-of select="concat(
+                             $git.docbook.backslash,'fB(',
+                             substring-after(@id,'-'),')',
+                             $git.docbook.backslash,'fR')"/>
+</xsl:template>
+<xsl:template match="calloutlist">
+       <xsl:value-of select="$git.docbook.dot"/>
+       <xsl:text>sp&#10;</xsl:text>
+       <xsl:apply-templates/>
+       <xsl:text>&#10;</xsl:text>
+</xsl:template>
+<xsl:template match="callout">
+       <xsl:value-of select="concat(
+                             $git.docbook.backslash,'fB',
+                             substring-after(@arearefs,'-'),
+                             '. ',$git.docbook.backslash,'fR')"/>
+       <xsl:apply-templates/>
+       <xsl:value-of select="$git.docbook.dot"/>
+       <xsl:text>br&#10;</xsl:text>
+</xsl:template>
+
+</xsl:stylesheet>
diff --git a/tools/perf/Documentation/manpage-bold-literal.xsl b/tools/perf/Documentation/manpage-bold-literal.xsl

new file mode 100644 (file)

index 0000000..608eb5d
--- /dev/null
+++ b/tools/perf/Documentation/manpage-bold-literal.xsl
@@ -0,0 +1,17 @@
+<!-- manpage-bold-literal.xsl:
+     special formatting for manpages rendered from asciidoc+docbook -->
+<xsl:stylesheet xmlns:xsl="http://www.w3.org/1999/XSL/Transform"
+               version="1.0">
+
+<!-- render literal text as bold (instead of plain or monospace);
+     this makes literal text easier to distinguish in manpages
+     viewed on a tty -->
+<xsl:template match="literal">
+       <xsl:value-of select="$git.docbook.backslash"/>
+       <xsl:text>fB</xsl:text>
+       <xsl:apply-templates/>
+       <xsl:value-of select="$git.docbook.backslash"/>
+       <xsl:text>fR</xsl:text>
+</xsl:template>
+
+</xsl:stylesheet>
diff --git a/tools/perf/Documentation/manpage-normal.xsl b/tools/perf/Documentation/manpage-normal.xsl

new file mode 100644 (file)

index 0000000..a48f5b1
--- /dev/null
+++ b/tools/perf/Documentation/manpage-normal.xsl
@@ -0,0 +1,13 @@
+<!-- manpage-normal.xsl:
+     special settings for manpages rendered from asciidoc+docbook
+     handles anything we want to keep away from docbook-xsl 1.72.0 -->
+<xsl:stylesheet xmlns:xsl="http://www.w3.org/1999/XSL/Transform"
+               version="1.0">
+
+<xsl:import href="manpage-base.xsl"/>
+
+<!-- these are the normal values for the roff control characters -->
+<xsl:param name="git.docbook.backslash">\</xsl:param>
+<xsl:param name="git.docbook.dot"      >.</xsl:param>
+
+</xsl:stylesheet>
diff --git a/tools/perf/Documentation/manpage-suppress-sp.xsl b/tools/perf/Documentation/manpage-suppress-sp.xsl

new file mode 100644 (file)

index 0000000..a63c763
--- /dev/null
+++ b/tools/perf/Documentation/manpage-suppress-sp.xsl
@@ -0,0 +1,21 @@
+<!-- manpage-suppress-sp.xsl:
+     special settings for manpages rendered from asciidoc+docbook
+     handles erroneous, inline .sp in manpage output of some
+     versions of docbook-xsl -->
+<xsl:stylesheet xmlns:xsl="http://www.w3.org/1999/XSL/Transform"
+               version="1.0">
+
+<!-- attempt to work around spurious .sp at the tail of the line
+     that some versions of docbook stylesheets seem to add -->
+<xsl:template match="simpara">
+  <xsl:variable name="content">
+    <xsl:apply-templates/>
+  </xsl:variable>
+  <xsl:value-of select="normalize-space($content)"/>
+  <xsl:if test="not(ancestor::authorblurb) and
+                not(ancestor::personblurb)">
+    <xsl:text>&#10;&#10;</xsl:text>
+  </xsl:if>
+</xsl:template>
+
+</xsl:stylesheet>
diff --git a/tools/perf/Documentation/perf-annotate.txt b/tools/perf/Documentation/perf-annotate.txt

new file mode 100644 (file)

index 0000000..c9dcade
--- /dev/null
+++ b/tools/perf/Documentation/perf-annotate.txt
@@ -0,0 +1,29 @@
+perf-annotate(1)
+==============
+
+NAME
+----
+perf-annotate - Read perf.data (created by perf record) and display annotated code
+
+SYNOPSIS
+--------
+[verse]
+'perf annotate' [-i <file> | --input=file] symbol_name
+
+DESCRIPTION
+-----------
+This command reads the input file and displays an annotated version of the
+code. If the object file has debug symbols then the source code will be
+displayed alongside assembly code.
+
+If there is no debug info in the object, then annotated assembly is displayed.
+
+OPTIONS
+-------
+-i::
+--input=::
+        Input file name. (default: perf.data)
+
+SEE ALSO
+--------
+linkperf:perf-record[1]
diff --git a/tools/perf/Documentation/perf-help.txt b/tools/perf/Documentation/perf-help.txt

new file mode 100644 (file)

index 0000000..5143918
--- /dev/null
+++ b/tools/perf/Documentation/perf-help.txt
@@ -0,0 +1,38 @@
+perf-help(1)
+============
+
+NAME
+----
+perf-help - display help information about perf
+
+SYNOPSIS
+--------
+'perf help' [-a|--all] [COMMAND]
+
+DESCRIPTION
+-----------
+
+With no options and no COMMAND given, the synopsis of the 'perf'
+command and a list of the most commonly used perf commands are printed
+on the standard output.
+
+If the option '--all' or '-a' is given, then all available commands are
+printed on the standard output.
+
+If a perf command is named, a manual page for that command is brought
+up. The 'man' program is used by default for this purpose, but this
+can be overridden by other options or configuration variables.
+
+Note that `perf --help ...` is identical to `perf help ...` because the
+former is internally converted into the latter.
+
+OPTIONS
+-------
+-a::
+--all::
+       Prints all the available commands on the standard output. This
+       option supersedes any other option.
+
+PERF
+----
+Part of the linkperf:perf[1] suite
diff --git a/tools/perf/Documentation/perf-list.txt b/tools/perf/Documentation/perf-list.txt

new file mode 100644 (file)

index 0000000..8290b94
--- /dev/null
+++ b/tools/perf/Documentation/perf-list.txt
@@ -0,0 +1,25 @@
+perf-list(1)
+============
+
+NAME
+----
+perf-list - List all symbolic event types
+
+SYNOPSIS
+--------
+[verse]
+'perf list'
+
+DESCRIPTION
+-----------
+This command displays the symbolic event types which can be selected in the
+various perf commands with the -e option.
+
+OPTIONS
+-------
+None
+
+SEE ALSO
+--------
+linkperf:perf-stat[1], linkperf:perf-top[1],
+linkperf:perf-record[1]
diff --git a/tools/perf/Documentation/perf-record.txt b/tools/perf/Documentation/perf-record.txt

new file mode 100644 (file)

index 0000000..1dbc1ee
--- /dev/null
+++ b/tools/perf/Documentation/perf-record.txt
@@ -0,0 +1,42 @@
+perf-record(1)
+==============
+
+NAME
+----
+perf-record - Run a command and record its profile into perf.data
+
+SYNOPSIS
+--------
+[verse]
+'perf record' [-e <EVENT> | --event=EVENT] [-l] [-a] <command>
+'perf record' [-e <EVENT> | --event=EVENT] [-l] [-a] -- <command> [<options>]
+
+DESCRIPTION
+-----------
+This command runs a command and gathers a performance counter profile
+from it, into perf.data - without displaying anything.
+
+This file can then be inspected later on, using 'perf report'.
+
+
+OPTIONS
+-------
+<command>...::
+       Any command you can specify in a shell.
+
+-e::
+--event=::
+       Select the PMU event. Selection can be a symbolic event name
+       (use 'perf list' to list all events) or a raw PMU
+       event (eventsel+umask) in the form of rNNN where NNN is a
+        hexadecimal event descriptor.
+
+-a::
+        system-wide collection
+
+-l::
+        scale counter values
+
+SEE ALSO
+--------
+linkperf:perf-stat[1], linkperf:perf-list[1]
diff --git a/tools/perf/Documentation/perf-report.txt b/tools/perf/Documentation/perf-report.txt

new file mode 100644 (file)

index 0000000..52d3fc6
--- /dev/null
+++ b/tools/perf/Documentation/perf-report.txt
@@ -0,0 +1,26 @@
+perf-report(1)
+==============
+
+NAME
+----
+perf-report - Read perf.data (created by perf record) and display the profile
+
+SYNOPSIS
+--------
+[verse]
+'perf report' [-i <file> | --input=file]
+
+DESCRIPTION
+-----------
+This command displays the performance counter profile information recorded
+via perf report.
+
+OPTIONS
+-------
+-i::
+--input=::
+        Input file name. (default: perf.data)
+
+SEE ALSO
+--------
+linkperf:perf-stat[1]
diff --git a/tools/perf/Documentation/perf-stat.txt b/tools/perf/Documentation/perf-stat.txt

new file mode 100644 (file)

index 0000000..c368a72
--- /dev/null
+++ b/tools/perf/Documentation/perf-stat.txt
@@ -0,0 +1,66 @@
+perf-stat(1)
+============
+
+NAME
+----
+perf-stat - Run a command and gather performance counter statistics
+
+SYNOPSIS
+--------
+[verse]
+'perf stat' [-e <EVENT> | --event=EVENT] [-l] [-a] <command>
+'perf stat' [-e <EVENT> | --event=EVENT] [-l] [-a] -- <command> [<options>]
+
+DESCRIPTION
+-----------
+This command runs a command and gathers performance counter statistics
+from it.
+
+
+OPTIONS
+-------
+<command>...::
+       Any command you can specify in a shell.
+
+
+-e::
+--event=::
+       Select the PMU event. Selection can be a symbolic event name
+       (use 'perf list' to list all events) or a raw PMU
+       event (eventsel+umask) in the form of rNNN where NNN is a
+        hexadecimal event descriptor.
+
+-i::
+--inherit::
+        child tasks inherit counters
+-p::
+--pid=<pid>::
+        stat events on existing pid
+
+-a::
+        system-wide collection
+
+-l::
+        scale counter values
+
+EXAMPLES
+--------
+
+$ perf stat -- make -j
+
+ Performance counter stats for 'make -j':
+
+    8117.370256  task clock ticks     #      11.281 CPU utilization factor
+            678  context switches     #       0.000 M/sec
+            133  CPU migrations       #       0.000 M/sec
+         235724  pagefaults           #       0.029 M/sec
+    24821162526  CPU cycles           #    3057.784 M/sec
+    18687303457  instructions         #    2302.138 M/sec
+      172158895  cache references     #      21.209 M/sec
+       27075259  cache misses         #       3.335 M/sec
+
+ Wall-clock time elapsed:   719.554352 msecs
+
+SEE ALSO
+--------
+linkperf:perf-top[1], linkperf:perf-list[1]
diff --git a/tools/perf/Documentation/perf-top.txt b/tools/perf/Documentation/perf-top.txt

new file mode 100644 (file)

index 0000000..539d012
--- /dev/null
+++ b/tools/perf/Documentation/perf-top.txt
@@ -0,0 +1,39 @@
+perf-top(1)
+===========
+
+NAME
+----
+perf-top - Run a command and profile it
+
+SYNOPSIS
+--------
+[verse]
+'perf top' [-e <EVENT> | --event=EVENT] [-l] [-a] <command>
+
+DESCRIPTION
+-----------
+This command runs a command and gathers a performance counter profile
+from it.
+
+
+OPTIONS
+-------
+<command>...::
+       Any command you can specify in a shell.
+
+-e::
+--event=::
+       Select the PMU event. Selection can be a symbolic event name
+       (use 'perf list' to list all events) or a raw PMU
+       event (eventsel+umask) in the form of rNNN where NNN is a
+        hexadecimal event descriptor.
+
+-a::
+        system-wide collection
+
+-l::
+        scale counter values
+
+SEE ALSO
+--------
+linkperf:perf-stat[1], linkperf:perf-list[1]
diff --git a/tools/perf/Documentation/perf.txt b/tools/perf/Documentation/perf.txt

new file mode 100644 (file)

index 0000000..69c8325
--- /dev/null
+++ b/tools/perf/Documentation/perf.txt
@@ -0,0 +1,24 @@
+perf(1)
+=======
+
+NAME
+----
+perf - Performance analysis tools for Linux
+
+SYNOPSIS
+--------
+[verse]
+'perf' [--version] [--help] COMMAND [ARGS]
+
+DESCRIPTION
+-----------
+Performance counters for Linux are are a new kernel-based subsystem
+that provide a framework for all things performance analysis. It
+covers hardware level (CPU/PMU, Performance Monitoring Unit) features
+and software features (software counters, tracepoints) as well.
+
+SEE ALSO
+--------
+linkperf:perf-stat[1], linkperf:perf-top[1],
+linkperf:perf-record[1], linkperf:perf-report[1],
+linkperf:perf-list[1]
diff --git a/tools/perf/Makefile b/tools/perf/Makefile

new file mode 100644 (file)

index 0000000..0cbd5d6
--- /dev/null
+++ b/tools/perf/Makefile
@@ -0,0 +1,929 @@
+# The default target of this Makefile is...
+all::
+
+# Define V=1 to have a more verbose compile.
+#
+# Define SNPRINTF_RETURNS_BOGUS if your are on a system which snprintf()
+# or vsnprintf() return -1 instead of number of characters which would
+# have been written to the final string if enough space had been available.
+#
+# Define FREAD_READS_DIRECTORIES if your are on a system which succeeds
+# when attempting to read from an fopen'ed directory.
+#
+# Define NO_OPENSSL environment variable if you do not have OpenSSL.
+# This also implies MOZILLA_SHA1.
+#
+# Define CURLDIR=/foo/bar if your curl header and library files are in
+# /foo/bar/include and /foo/bar/lib directories.
+#
+# Define EXPATDIR=/foo/bar if your expat header and library files are in
+# /foo/bar/include and /foo/bar/lib directories.
+#
+# Define NO_D_INO_IN_DIRENT if you don't have d_ino in your struct dirent.
+#
+# Define NO_D_TYPE_IN_DIRENT if your platform defines DT_UNKNOWN but lacks
+# d_type in struct dirent (latest Cygwin -- will be fixed soonish).
+#
+# Define NO_C99_FORMAT if your formatted IO functions (printf/scanf et.al.)
+# do not support the 'size specifiers' introduced by C99, namely ll, hh,
+# j, z, t. (representing long long int, char, intmax_t, size_t, ptrdiff_t).
+# some C compilers supported these specifiers prior to C99 as an extension.
+#
+# Define NO_STRCASESTR if you don't have strcasestr.
+#
+# Define NO_MEMMEM if you don't have memmem.
+#
+# Define NO_STRTOUMAX if you don't have strtoumax in the C library.
+# If your compiler also does not support long long or does not have
+# strtoull, define NO_STRTOULL.
+#
+# Define NO_SETENV if you don't have setenv in the C library.
+#
+# Define NO_UNSETENV if you don't have unsetenv in the C library.
+#
+# Define NO_MKDTEMP if you don't have mkdtemp in the C library.
+#
+# Define NO_SYS_SELECT_H if you don't have sys/select.h.
+#
+# Define NO_SYMLINK_HEAD if you never want .perf/HEAD to be a symbolic link.
+# Enable it on Windows.  By default, symrefs are still used.
+#
+# Define NO_SVN_TESTS if you want to skip time-consuming SVN interoperability
+# tests.  These tests take up a significant amount of the total test time
+# but are not needed unless you plan to talk to SVN repos.
+#
+# Define NO_FINK if you are building on Darwin/Mac OS X, have Fink
+# installed in /sw, but don't want PERF to link against any libraries
+# installed there.  If defined you may specify your own (or Fink's)
+# include directories and library directories by defining CFLAGS
+# and LDFLAGS appropriately.
+#
+# Define NO_DARWIN_PORTS if you are building on Darwin/Mac OS X,
+# have DarwinPorts installed in /opt/local, but don't want PERF to
+# link against any libraries installed there.  If defined you may
+# specify your own (or DarwinPort's) include directories and
+# library directories by defining CFLAGS and LDFLAGS appropriately.
+#
+# Define PPC_SHA1 environment variable when running make to make use of
+# a bundled SHA1 routine optimized for PowerPC.
+#
+# Define ARM_SHA1 environment variable when running make to make use of
+# a bundled SHA1 routine optimized for ARM.
+#
+# Define MOZILLA_SHA1 environment variable when running make to make use of
+# a bundled SHA1 routine coming from Mozilla. It is GPL'd and should be fast
+# on non-x86 architectures (e.g. PowerPC), while the OpenSSL version (default
+# choice) has very fast version optimized for i586.
+#
+# Define NEEDS_SSL_WITH_CRYPTO if you need -lcrypto with -lssl (Darwin).
+#
+# Define NEEDS_LIBICONV if linking with libc is not enough (Darwin).
+#
+# Define NEEDS_SOCKET if linking with libc is not enough (SunOS,
+# Patrick Mauritz).
+#
+# Define NO_MMAP if you want to avoid mmap.
+#
+# Define NO_PTHREADS if you do not have or do not want to use Pthreads.
+#
+# Define NO_PREAD if you have a problem with pread() system call (e.g.
+# cygwin.dll before v1.5.22).
+#
+# Define NO_FAST_WORKING_DIRECTORY if accessing objects in pack files is
+# generally faster on your platform than accessing the working directory.
+#
+# Define NO_TRUSTABLE_FILEMODE if your filesystem may claim to support
+# the executable mode bit, but doesn't really do so.
+#
+# Define NO_IPV6 if you lack IPv6 support and getaddrinfo().
+#
+# Define NO_SOCKADDR_STORAGE if your platform does not have struct
+# sockaddr_storage.
+#
+# Define NO_ICONV if your libc does not properly support iconv.
+#
+# Define OLD_ICONV if your library has an old iconv(), where the second
+# (input buffer pointer) parameter is declared with type (const char **).
+#
+# Define NO_DEFLATE_BOUND if your zlib does not have deflateBound.
+#
+# Define NO_R_TO_GCC_LINKER if your gcc does not like "-R/path/lib"
+# that tells runtime paths to dynamic libraries;
+# "-Wl,-rpath=/path/lib" is used instead.
+#
+# Define USE_NSEC below if you want perf to care about sub-second file mtimes
+# and ctimes. Note that you need recent glibc (at least 2.2.4) for this, and
+# it will BREAK YOUR LOCAL DIFFS! show-diff and anything using it will likely
+# randomly break unless your underlying filesystem supports those sub-second
+# times (my ext3 doesn't).
+#
+# Define USE_ST_TIMESPEC if your "struct stat" uses "st_ctimespec" instead of
+# "st_ctim"
+#
+# Define NO_NSEC if your "struct stat" does not have "st_ctim.tv_nsec"
+# available.  This automatically turns USE_NSEC off.
+#
+# Define USE_STDEV below if you want perf to care about the underlying device
+# change being considered an inode change from the update-index perspective.
+#
+# Define NO_ST_BLOCKS_IN_STRUCT_STAT if your platform does not have st_blocks
+# field that counts the on-disk footprint in 512-byte blocks.
+#
+# Define ASCIIDOC8 if you want to format documentation with AsciiDoc 8
+#
+# Define DOCBOOK_XSL_172 if you want to format man pages with DocBook XSL v1.72.
+#
+# Define NO_PERL_MAKEMAKER if you cannot use Makefiles generated by perl's
+# MakeMaker (e.g. using ActiveState under Cygwin).
+#
+# Define NO_PERL if you do not want Perl scripts or libraries at all.
+#
+# Define INTERNAL_QSORT to use Git's implementation of qsort(), which
+# is a simplified version of the merge sort used in glibc. This is
+# recommended if Git triggers O(n^2) behavior in your platform's qsort().
+#
+# Define NO_EXTERNAL_GREP if you don't want "perf grep" to ever call
+# your external grep (e.g., if your system lacks grep, if its grep is
+# broken, or spawning external process is slower than built-in grep perf has).
+
+PERF-VERSION-FILE: .FORCE-PERF-VERSION-FILE
+       @$(SHELL_PATH) util/PERF-VERSION-GEN
+-include PERF-VERSION-FILE
+
+uname_S := $(shell sh -c 'uname -s 2>/dev/null || echo not')
+uname_M := $(shell sh -c 'uname -m 2>/dev/null || echo not')
+uname_O := $(shell sh -c 'uname -o 2>/dev/null || echo not')
+uname_R := $(shell sh -c 'uname -r 2>/dev/null || echo not')
+uname_P := $(shell sh -c 'uname -p 2>/dev/null || echo not')
+uname_V := $(shell sh -c 'uname -v 2>/dev/null || echo not')
+
+# CFLAGS and LDFLAGS are for the users to override from the command line.
+
+CFLAGS = -ggdb3 -Wall -Werror -Wstrict-prototypes -Wmissing-declarations -Wmissing-prototypes -std=gnu99 -Wdeclaration-after-statement -O6
+LDFLAGS = -lpthread -lrt -lelf
+ALL_CFLAGS = $(CFLAGS)
+ALL_LDFLAGS = $(LDFLAGS)
+STRIP ?= strip
+
+# Among the variables below, these:
+#   perfexecdir
+#   template_dir
+#   mandir
+#   infodir
+#   htmldir
+#   ETC_PERFCONFIG (but not sysconfdir)
+# can be specified as a relative path some/where/else;
+# this is interpreted as relative to $(prefix) and "perf" at
+# runtime figures out where they are based on the path to the executable.
+# This can help installing the suite in a relocatable way.
+
+prefix = $(HOME)
+bindir_relative = bin
+bindir = $(prefix)/$(bindir_relative)
+mandir = share/man
+infodir = share/info
+perfexecdir = libexec/perf-core
+sharedir = $(prefix)/share
+template_dir = share/perf-core/templates
+htmldir = share/doc/perf-doc
+ifeq ($(prefix),/usr)
+sysconfdir = /etc
+ETC_PERFCONFIG = $(sysconfdir)/perfconfig
+else
+sysconfdir = $(prefix)/etc
+ETC_PERFCONFIG = etc/perfconfig
+endif
+lib = lib
+# DESTDIR=
+
+export prefix bindir sharedir sysconfdir
+
+CC = gcc
+AR = ar
+RM = rm -f
+TAR = tar
+FIND = find
+INSTALL = install
+RPMBUILD = rpmbuild
+PTHREAD_LIBS = -lpthread
+
+# sparse is architecture-neutral, which means that we need to tell it
+# explicitly what architecture to check for. Fix this up for yours..
+SPARSE_FLAGS = -D__BIG_ENDIAN__ -D__powerpc__
+
+
+
+### --- END CONFIGURATION SECTION ---
+
+# Those must not be GNU-specific; they are shared with perl/ which may
+# be built by a different compiler. (Note that this is an artifact now
+# but it still might be nice to keep that distinction.)
+BASIC_CFLAGS =
+BASIC_LDFLAGS =
+
+# Guard against environment variables
+BUILTIN_OBJS =
+BUILT_INS =
+COMPAT_CFLAGS =
+COMPAT_OBJS =
+LIB_H =
+LIB_OBJS =
+SCRIPT_PERL =
+SCRIPT_SH =
+TEST_PROGRAMS =
+
+#
+# No scripts right now:
+#
+
+# SCRIPT_SH += perf-am.sh
+
+#
+# No Perl scripts right now:
+#
+
+# SCRIPT_PERL += perf-add--interactive.perl
+
+SCRIPTS = $(patsubst %.sh,%,$(SCRIPT_SH)) \
+         $(patsubst %.perl,%,$(SCRIPT_PERL))
+
+# Empty...
+EXTRA_PROGRAMS =
+
+# ... and all the rest that could be moved out of bindir to perfexecdir
+PROGRAMS += $(EXTRA_PROGRAMS)
+
+#
+# Single 'perf' binary right now:
+#
+PROGRAMS += perf
+
+# List built-in command $C whose implementation cmd_$C() is not in
+# builtin-$C.o but is linked in as part of some other command.
+#
+# None right now:
+#
+# BUILT_INS += perf-init $X
+
+# what 'all' will build and 'install' will install, in perfexecdir
+ALL_PROGRAMS = $(PROGRAMS) $(SCRIPTS)
+
+# what 'all' will build but not install in perfexecdir
+OTHER_PROGRAMS = perf$X
+
+# Set paths to tools early so that they can be used for version tests.
+ifndef SHELL_PATH
+       SHELL_PATH = /bin/sh
+endif
+ifndef PERL_PATH
+       PERL_PATH = /usr/bin/perl
+endif
+
+export PERL_PATH
+
+LIB_FILE=libperf.a
+
+LIB_H += ../../include/linux/perf_counter.h
+LIB_H += perf.h
+LIB_H += util/list.h
+LIB_H += util/rbtree.h
+LIB_H += util/levenshtein.h
+LIB_H += util/parse-options.h
+LIB_H += util/parse-events.h
+LIB_H += util/quote.h
+LIB_H += util/util.h
+LIB_H += util/help.h
+LIB_H += util/strbuf.h
+LIB_H += util/string.h
+LIB_H += util/run-command.h
+LIB_H += util/sigchain.h
+LIB_H += util/symbol.h
+LIB_H += util/color.h
+
+LIB_OBJS += util/abspath.o
+LIB_OBJS += util/alias.o
+LIB_OBJS += util/config.o
+LIB_OBJS += util/ctype.o
+LIB_OBJS += util/environment.o
+LIB_OBJS += util/exec_cmd.o
+LIB_OBJS += util/help.o
+LIB_OBJS += util/levenshtein.o
+LIB_OBJS += util/parse-options.o
+LIB_OBJS += util/parse-events.o
+LIB_OBJS += util/path.o
+LIB_OBJS += util/rbtree.o
+LIB_OBJS += util/run-command.o
+LIB_OBJS += util/quote.o
+LIB_OBJS += util/strbuf.o
+LIB_OBJS += util/string.o
+LIB_OBJS += util/usage.o
+LIB_OBJS += util/wrapper.o
+LIB_OBJS += util/sigchain.o
+LIB_OBJS += util/symbol.o
+LIB_OBJS += util/color.o
+LIB_OBJS += util/pager.o
+
+BUILTIN_OBJS += builtin-annotate.o
+BUILTIN_OBJS += builtin-help.o
+BUILTIN_OBJS += builtin-list.o
+BUILTIN_OBJS += builtin-record.o
+BUILTIN_OBJS += builtin-report.o
+BUILTIN_OBJS += builtin-stat.o
+BUILTIN_OBJS += builtin-top.o
+
+PERFLIBS = $(LIB_FILE)
+EXTLIBS =
+
+#
+# Platform specific tweaks
+#
+
+# We choose to avoid "if .. else if .. else .. endif endif"
+# because maintaining the nesting to match is a pain.  If
+# we had "elif" things would have been much nicer...
+
+-include config.mak.autogen
+-include config.mak
+
+ifeq ($(uname_S),Darwin)
+       ifndef NO_FINK
+               ifeq ($(shell test -d /sw/lib && echo y),y)
+                       BASIC_CFLAGS += -I/sw/include
+                       BASIC_LDFLAGS += -L/sw/lib
+               endif
+       endif
+       ifndef NO_DARWIN_PORTS
+               ifeq ($(shell test -d /opt/local/lib && echo y),y)
+                       BASIC_CFLAGS += -I/opt/local/include
+                       BASIC_LDFLAGS += -L/opt/local/lib
+               endif
+       endif
+       PTHREAD_LIBS =
+endif
+
+ifndef CC_LD_DYNPATH
+       ifdef NO_R_TO_GCC_LINKER
+               # Some gcc does not accept and pass -R to the linker to specify
+               # the runtime dynamic library path.
+               CC_LD_DYNPATH = -Wl,-rpath,
+       else
+               CC_LD_DYNPATH = -R
+       endif
+endif
+
+ifdef ZLIB_PATH
+       BASIC_CFLAGS += -I$(ZLIB_PATH)/include
+       EXTLIBS += -L$(ZLIB_PATH)/$(lib) $(CC_LD_DYNPATH)$(ZLIB_PATH)/$(lib)
+endif
+EXTLIBS += -lz
+
+ifdef NEEDS_SOCKET
+       EXTLIBS += -lsocket
+endif
+ifdef NEEDS_NSL
+       EXTLIBS += -lnsl
+endif
+ifdef NO_D_TYPE_IN_DIRENT
+       BASIC_CFLAGS += -DNO_D_TYPE_IN_DIRENT
+endif
+ifdef NO_D_INO_IN_DIRENT
+       BASIC_CFLAGS += -DNO_D_INO_IN_DIRENT
+endif
+ifdef NO_ST_BLOCKS_IN_STRUCT_STAT
+       BASIC_CFLAGS += -DNO_ST_BLOCKS_IN_STRUCT_STAT
+endif
+ifdef USE_NSEC
+       BASIC_CFLAGS += -DUSE_NSEC
+endif
+ifdef USE_ST_TIMESPEC
+       BASIC_CFLAGS += -DUSE_ST_TIMESPEC
+endif
+ifdef NO_NSEC
+       BASIC_CFLAGS += -DNO_NSEC
+endif
+ifdef NO_C99_FORMAT
+       BASIC_CFLAGS += -DNO_C99_FORMAT
+endif
+ifdef SNPRINTF_RETURNS_BOGUS
+       COMPAT_CFLAGS += -DSNPRINTF_RETURNS_BOGUS
+       COMPAT_OBJS += compat/snprintf.o
+endif
+ifdef FREAD_READS_DIRECTORIES
+       COMPAT_CFLAGS += -DFREAD_READS_DIRECTORIES
+       COMPAT_OBJS += compat/fopen.o
+endif
+ifdef NO_SYMLINK_HEAD
+       BASIC_CFLAGS += -DNO_SYMLINK_HEAD
+endif
+ifdef NO_STRCASESTR
+       COMPAT_CFLAGS += -DNO_STRCASESTR
+       COMPAT_OBJS += compat/strcasestr.o
+endif
+ifdef NO_STRTOUMAX
+       COMPAT_CFLAGS += -DNO_STRTOUMAX
+       COMPAT_OBJS += compat/strtoumax.o
+endif
+ifdef NO_STRTOULL
+       COMPAT_CFLAGS += -DNO_STRTOULL
+endif
+ifdef NO_SETENV
+       COMPAT_CFLAGS += -DNO_SETENV
+       COMPAT_OBJS += compat/setenv.o
+endif
+ifdef NO_MKDTEMP
+       COMPAT_CFLAGS += -DNO_MKDTEMP
+       COMPAT_OBJS += compat/mkdtemp.o
+endif
+ifdef NO_UNSETENV
+       COMPAT_CFLAGS += -DNO_UNSETENV
+       COMPAT_OBJS += compat/unsetenv.o
+endif
+ifdef NO_SYS_SELECT_H
+       BASIC_CFLAGS += -DNO_SYS_SELECT_H
+endif
+ifdef NO_MMAP
+       COMPAT_CFLAGS += -DNO_MMAP
+       COMPAT_OBJS += compat/mmap.o
+else
+       ifdef USE_WIN32_MMAP
+               COMPAT_CFLAGS += -DUSE_WIN32_MMAP
+               COMPAT_OBJS += compat/win32mmap.o
+       endif
+endif
+ifdef NO_PREAD
+       COMPAT_CFLAGS += -DNO_PREAD
+       COMPAT_OBJS += compat/pread.o
+endif
+ifdef NO_FAST_WORKING_DIRECTORY
+       BASIC_CFLAGS += -DNO_FAST_WORKING_DIRECTORY
+endif
+ifdef NO_TRUSTABLE_FILEMODE
+       BASIC_CFLAGS += -DNO_TRUSTABLE_FILEMODE
+endif
+ifdef NO_IPV6
+       BASIC_CFLAGS += -DNO_IPV6
+endif
+ifdef NO_UINTMAX_T
+       BASIC_CFLAGS += -Duintmax_t=uint32_t
+endif
+ifdef NO_SOCKADDR_STORAGE
+ifdef NO_IPV6
+       BASIC_CFLAGS += -Dsockaddr_storage=sockaddr_in
+else
+       BASIC_CFLAGS += -Dsockaddr_storage=sockaddr_in6
+endif
+endif
+ifdef NO_INET_NTOP
+       LIB_OBJS += compat/inet_ntop.o
+endif
+ifdef NO_INET_PTON
+       LIB_OBJS += compat/inet_pton.o
+endif
+
+ifdef NO_ICONV
+       BASIC_CFLAGS += -DNO_ICONV
+endif
+
+ifdef OLD_ICONV
+       BASIC_CFLAGS += -DOLD_ICONV
+endif
+
+ifdef NO_DEFLATE_BOUND
+       BASIC_CFLAGS += -DNO_DEFLATE_BOUND
+endif
+
+ifdef PPC_SHA1
+       SHA1_HEADER = "ppc/sha1.h"
+       LIB_OBJS += ppc/sha1.o ppc/sha1ppc.o
+else
+ifdef ARM_SHA1
+       SHA1_HEADER = "arm/sha1.h"
+       LIB_OBJS += arm/sha1.o arm/sha1_arm.o
+else
+ifdef MOZILLA_SHA1
+       SHA1_HEADER = "mozilla-sha1/sha1.h"
+       LIB_OBJS += mozilla-sha1/sha1.o
+else
+       SHA1_HEADER = <openssl/sha.h>
+       EXTLIBS += $(LIB_4_CRYPTO)
+endif
+endif
+endif
+ifdef NO_PERL_MAKEMAKER
+       export NO_PERL_MAKEMAKER
+endif
+ifdef NO_HSTRERROR
+       COMPAT_CFLAGS += -DNO_HSTRERROR
+       COMPAT_OBJS += compat/hstrerror.o
+endif
+ifdef NO_MEMMEM
+       COMPAT_CFLAGS += -DNO_MEMMEM
+       COMPAT_OBJS += compat/memmem.o
+endif
+ifdef INTERNAL_QSORT
+       COMPAT_CFLAGS += -DINTERNAL_QSORT
+       COMPAT_OBJS += compat/qsort.o
+endif
+ifdef RUNTIME_PREFIX
+       COMPAT_CFLAGS += -DRUNTIME_PREFIX
+endif
+
+ifdef DIR_HAS_BSD_GROUP_SEMANTICS
+       COMPAT_CFLAGS += -DDIR_HAS_BSD_GROUP_SEMANTICS
+endif
+ifdef NO_EXTERNAL_GREP
+       BASIC_CFLAGS += -DNO_EXTERNAL_GREP
+endif
+
+ifeq ($(PERL_PATH),)
+NO_PERL=NoThanks
+endif
+
+QUIET_SUBDIR0  = +$(MAKE) -C # space to separate -C and subdir
+QUIET_SUBDIR1  =
+
+ifneq ($(findstring $(MAKEFLAGS),w),w)
+PRINT_DIR = --no-print-directory
+else # "make -w"
+NO_SUBDIR = :
+endif
+
+ifneq ($(findstring $(MAKEFLAGS),s),s)
+ifndef V
+       QUIET_CC       = @echo '   ' CC $@;
+       QUIET_AR       = @echo '   ' AR $@;
+       QUIET_LINK     = @echo '   ' LINK $@;
+       QUIET_BUILT_IN = @echo '   ' BUILTIN $@;
+       QUIET_GEN      = @echo '   ' GEN $@;
+       QUIET_SUBDIR0  = +@subdir=
+       QUIET_SUBDIR1  = ;$(NO_SUBDIR) echo '   ' SUBDIR $$subdir; \
+                        $(MAKE) $(PRINT_DIR) -C $$subdir
+       export V
+       export QUIET_GEN
+       export QUIET_BUILT_IN
+endif
+endif
+
+ifdef ASCIIDOC8
+       export ASCIIDOC8
+endif
+
+# Shell quote (do not use $(call) to accommodate ancient setups);
+
+SHA1_HEADER_SQ = $(subst ','\'',$(SHA1_HEADER))
+ETC_PERFCONFIG_SQ = $(subst ','\'',$(ETC_PERFCONFIG))
+
+DESTDIR_SQ = $(subst ','\'',$(DESTDIR))
+bindir_SQ = $(subst ','\'',$(bindir))
+bindir_relative_SQ = $(subst ','\'',$(bindir_relative))
+mandir_SQ = $(subst ','\'',$(mandir))
+infodir_SQ = $(subst ','\'',$(infodir))
+perfexecdir_SQ = $(subst ','\'',$(perfexecdir))
+template_dir_SQ = $(subst ','\'',$(template_dir))
+htmldir_SQ = $(subst ','\'',$(htmldir))
+prefix_SQ = $(subst ','\'',$(prefix))
+
+SHELL_PATH_SQ = $(subst ','\'',$(SHELL_PATH))
+PERL_PATH_SQ = $(subst ','\'',$(PERL_PATH))
+
+LIBS = $(PERFLIBS) $(EXTLIBS)
+
+BASIC_CFLAGS += -DSHA1_HEADER='$(SHA1_HEADER_SQ)' \
+       $(COMPAT_CFLAGS)
+LIB_OBJS += $(COMPAT_OBJS)
+
+ALL_CFLAGS += $(BASIC_CFLAGS)
+ALL_LDFLAGS += $(BASIC_LDFLAGS)
+
+export TAR INSTALL DESTDIR SHELL_PATH
+
+
+### Build rules
+
+SHELL = $(SHELL_PATH)
+
+all:: shell_compatibility_test $(ALL_PROGRAMS) $(BUILT_INS) $(OTHER_PROGRAMS) PERF-BUILD-OPTIONS
+ifneq (,$X)
+       $(foreach p,$(patsubst %$X,%,$(filter %$X,$(ALL_PROGRAMS) $(BUILT_INS) perf$X)), test '$p' -ef '$p$X' || $(RM) '$p';)
+endif
+
+all::
+
+please_set_SHELL_PATH_to_a_more_modern_shell:
+       @$$(:)
+
+shell_compatibility_test: please_set_SHELL_PATH_to_a_more_modern_shell
+
+strip: $(PROGRAMS) perf$X
+       $(STRIP) $(STRIP_OPTS) $(PROGRAMS) perf$X
+
+perf.o: perf.c common-cmds.h PERF-CFLAGS
+       $(QUIET_CC)$(CC) -DPERF_VERSION='"$(PERF_VERSION)"' \
+               '-DPERF_HTML_PATH="$(htmldir_SQ)"' \
+               $(ALL_CFLAGS) -c $(filter %.c,$^)
+
+perf$X: perf.o $(BUILTIN_OBJS) $(PERFLIBS)
+       $(QUIET_LINK)$(CC) $(ALL_CFLAGS) -o $@ perf.o \
+               $(BUILTIN_OBJS) $(ALL_LDFLAGS) $(LIBS)
+
+builtin-help.o: builtin-help.c common-cmds.h PERF-CFLAGS
+       $(QUIET_CC)$(CC) -o $*.o -c $(ALL_CFLAGS) \
+               '-DPERF_HTML_PATH="$(htmldir_SQ)"' \
+               '-DPERF_MAN_PATH="$(mandir_SQ)"' \
+               '-DPERF_INFO_PATH="$(infodir_SQ)"' $<
+
+$(BUILT_INS): perf$X
+       $(QUIET_BUILT_IN)$(RM) $@ && \
+       ln perf$X $@ 2>/dev/null || \
+       ln -s perf$X $@ 2>/dev/null || \
+       cp perf$X $@
+
+common-cmds.h: util/generate-cmdlist.sh command-list.txt
+
+common-cmds.h: $(wildcard Documentation/perf-*.txt)
+       $(QUIET_GEN)util/generate-cmdlist.sh > $@+ && mv $@+ $@
+
+$(patsubst %.sh,%,$(SCRIPT_SH)) : % : %.sh
+       $(QUIET_GEN)$(RM) $@ $@+ && \
+       sed -e '1s|#!.*/sh|#!$(SHELL_PATH_SQ)|' \
+           -e 's|@SHELL_PATH@|$(SHELL_PATH_SQ)|' \
+           -e 's|@@PERL@@|$(PERL_PATH_SQ)|g' \
+           -e 's/@@PERF_VERSION@@/$(PERF_VERSION)/g' \
+           -e 's/@@NO_CURL@@/$(NO_CURL)/g' \
+           $@.sh >$@+ && \
+       chmod +x $@+ && \
+       mv $@+ $@
+
+configure: configure.ac
+       $(QUIET_GEN)$(RM) $@ $<+ && \
+       sed -e 's/@@PERF_VERSION@@/$(PERF_VERSION)/g' \
+           $< > $<+ && \
+       autoconf -o $@ $<+ && \
+       $(RM) $<+
+
+# These can record PERF_VERSION
+perf.o perf.spec \
+       $(patsubst %.sh,%,$(SCRIPT_SH)) \
+       $(patsubst %.perl,%,$(SCRIPT_PERL)) \
+       : PERF-VERSION-FILE
+
+%.o: %.c PERF-CFLAGS
+       $(QUIET_CC)$(CC) -o $*.o -c $(ALL_CFLAGS) $<
+%.s: %.c PERF-CFLAGS
+       $(QUIET_CC)$(CC) -S $(ALL_CFLAGS) $<
+%.o: %.S
+       $(QUIET_CC)$(CC) -o $*.o -c $(ALL_CFLAGS) $<
+
+util/exec_cmd.o: util/exec_cmd.c PERF-CFLAGS
+       $(QUIET_CC)$(CC) -o $*.o -c $(ALL_CFLAGS) \
+               '-DPERF_EXEC_PATH="$(perfexecdir_SQ)"' \
+               '-DBINDIR="$(bindir_relative_SQ)"' \
+               '-DPREFIX="$(prefix_SQ)"' \
+               $<
+
+builtin-init-db.o: builtin-init-db.c PERF-CFLAGS
+       $(QUIET_CC)$(CC) -o $*.o -c $(ALL_CFLAGS) -DDEFAULT_PERF_TEMPLATE_DIR='"$(template_dir_SQ)"' $<
+
+util/config.o: util/config.c PERF-CFLAGS
+       $(QUIET_CC)$(CC) -o $*.o -c $(ALL_CFLAGS) -DETC_PERFCONFIG='"$(ETC_PERFCONFIG_SQ)"' $<
+
+perf-%$X: %.o $(PERFLIBS)
+       $(QUIET_LINK)$(CC) $(ALL_CFLAGS) -o $@ $(ALL_LDFLAGS) $(filter %.o,$^) $(LIBS)
+
+$(LIB_OBJS) $(BUILTIN_OBJS): $(LIB_H)
+$(patsubst perf-%$X,%.o,$(PROGRAMS)): $(LIB_H) $(wildcard */*.h)
+builtin-revert.o wt-status.o: wt-status.h
+
+$(LIB_FILE): $(LIB_OBJS)
+       $(QUIET_AR)$(RM) $@ && $(AR) rcs $@ $(LIB_OBJS)
+
+doc:
+       $(MAKE) -C Documentation all
+
+man:
+       $(MAKE) -C Documentation man
+
+html:
+       $(MAKE) -C Documentation html
+
+info:
+       $(MAKE) -C Documentation info
+
+pdf:
+       $(MAKE) -C Documentation pdf
+
+TAGS:
+       $(RM) TAGS
+       $(FIND) . -name '*.[hcS]' -print | xargs etags -a
+
+tags:
+       $(RM) tags
+       $(FIND) . -name '*.[hcS]' -print | xargs ctags -a
+
+cscope:
+       $(RM) cscope*
+       $(FIND) . -name '*.[hcS]' -print | xargs cscope -b
+
+### Detect prefix changes
+TRACK_CFLAGS = $(subst ','\'',$(ALL_CFLAGS)):\
+             $(bindir_SQ):$(perfexecdir_SQ):$(template_dir_SQ):$(prefix_SQ)
+
+PERF-CFLAGS: .FORCE-PERF-CFLAGS
+       @FLAGS='$(TRACK_CFLAGS)'; \
+           if test x"$$FLAGS" != x"`cat PERF-CFLAGS 2>/dev/null`" ; then \
+               echo 1>&2 "    * new build flags or prefix"; \
+               echo "$$FLAGS" >PERF-CFLAGS; \
+            fi
+
+# We need to apply sq twice, once to protect from the shell
+# that runs PERF-BUILD-OPTIONS, and then again to protect it
+# and the first level quoting from the shell that runs "echo".
+PERF-BUILD-OPTIONS: .FORCE-PERF-BUILD-OPTIONS
+       @echo SHELL_PATH=\''$(subst ','\'',$(SHELL_PATH_SQ))'\' >$@
+       @echo TAR=\''$(subst ','\'',$(subst ','\'',$(TAR)))'\' >>$@
+       @echo NO_CURL=\''$(subst ','\'',$(subst ','\'',$(NO_CURL)))'\' >>$@
+       @echo NO_PERL=\''$(subst ','\'',$(subst ','\'',$(NO_PERL)))'\' >>$@
+
+### Testing rules
+
+#
+# None right now:
+#
+# TEST_PROGRAMS += test-something$X
+
+all:: $(TEST_PROGRAMS)
+
+# GNU make supports exporting all variables by "export" without parameters.
+# However, the environment gets quite big, and some programs have problems
+# with that.
+
+export NO_SVN_TESTS
+
+check: common-cmds.h
+       if sparse; \
+       then \
+               for i in *.c */*.c; \
+               do \
+                       sparse $(ALL_CFLAGS) $(SPARSE_FLAGS) $$i || exit; \
+               done; \
+       else \
+               echo 2>&1 "Did you mean 'make test'?"; \
+               exit 1; \
+       fi
+
+remove-dashes:
+       ./fixup-builtins $(BUILT_INS) $(PROGRAMS) $(SCRIPTS)
+
+### Installation rules
+
+ifneq ($(filter /%,$(firstword $(template_dir))),)
+template_instdir = $(template_dir)
+else
+template_instdir = $(prefix)/$(template_dir)
+endif
+export template_instdir
+
+ifneq ($(filter /%,$(firstword $(perfexecdir))),)
+perfexec_instdir = $(perfexecdir)
+else
+perfexec_instdir = $(prefix)/$(perfexecdir)
+endif
+perfexec_instdir_SQ = $(subst ','\'',$(perfexec_instdir))
+export perfexec_instdir
+
+install: all
+       $(INSTALL) -d -m 755 '$(DESTDIR_SQ)$(bindir_SQ)'
+       $(INSTALL) perf$X '$(DESTDIR_SQ)$(bindir_SQ)'
+ifdef BUILT_INS
+       $(INSTALL) -d -m 755 '$(DESTDIR_SQ)$(perfexec_instdir_SQ)'
+       $(INSTALL) $(BUILT_INS) '$(DESTDIR_SQ)$(perfexec_instdir_SQ)'
+ifneq (,$X)
+       $(foreach p,$(patsubst %$X,%,$(filter %$X,$(ALL_PROGRAMS) $(BUILT_INS) perf$X)), $(RM) '$(DESTDIR_SQ)$(perfexec_instdir_SQ)/$p';)
+endif
+endif
+
+install-doc:
+       $(MAKE) -C Documentation install
+
+install-man:
+       $(MAKE) -C Documentation install-man
+
+install-html:
+       $(MAKE) -C Documentation install-html
+
+install-info:
+       $(MAKE) -C Documentation install-info
+
+install-pdf:
+       $(MAKE) -C Documentation install-pdf
+
+quick-install-doc:
+       $(MAKE) -C Documentation quick-install
+
+quick-install-man:
+       $(MAKE) -C Documentation quick-install-man
+
+quick-install-html:
+       $(MAKE) -C Documentation quick-install-html
+
+
+### Maintainer's dist rules
+#
+# None right now
+#
+#
+# perf.spec: perf.spec.in
+#      sed -e 's/@@VERSION@@/$(PERF_VERSION)/g' < $< > $@+
+#      mv $@+ $@
+#
+# PERF_TARNAME=perf-$(PERF_VERSION)
+# dist: perf.spec perf-archive$(X) configure
+#      ./perf-archive --format=tar \
+#              --prefix=$(PERF_TARNAME)/ HEAD^{tree} > $(PERF_TARNAME).tar
+#      @mkdir -p $(PERF_TARNAME)
+#      @cp perf.spec configure $(PERF_TARNAME)
+#      @echo $(PERF_VERSION) > $(PERF_TARNAME)/version
+#      $(TAR) rf $(PERF_TARNAME).tar \
+#              $(PERF_TARNAME)/perf.spec \
+#              $(PERF_TARNAME)/configure \
+#              $(PERF_TARNAME)/version
+#      @$(RM) -r $(PERF_TARNAME)
+#      gzip -f -9 $(PERF_TARNAME).tar
+#
+# htmldocs = perf-htmldocs-$(PERF_VERSION)
+# manpages = perf-manpages-$(PERF_VERSION)
+# dist-doc:
+#      $(RM) -r .doc-tmp-dir
+#      mkdir .doc-tmp-dir
+#      $(MAKE) -C Documentation WEBDOC_DEST=../.doc-tmp-dir install-webdoc
+#      cd .doc-tmp-dir && $(TAR) cf ../$(htmldocs).tar .
+#      gzip -n -9 -f $(htmldocs).tar
+#      :
+#      $(RM) -r .doc-tmp-dir
+#      mkdir -p .doc-tmp-dir/man1 .doc-tmp-dir/man5 .doc-tmp-dir/man7
+#      $(MAKE) -C Documentation DESTDIR=./ \
+#              man1dir=../.doc-tmp-dir/man1 \
+#              man5dir=../.doc-tmp-dir/man5 \
+#              man7dir=../.doc-tmp-dir/man7 \
+#              install
+#      cd .doc-tmp-dir && $(TAR) cf ../$(manpages).tar .
+#      gzip -n -9 -f $(manpages).tar
+#      $(RM) -r .doc-tmp-dir
+#
+# rpm: dist
+#      $(RPMBUILD) -ta $(PERF_TARNAME).tar.gz
+
+### Cleaning rules
+
+distclean: clean
+#      $(RM) configure
+
+clean:
+       $(RM) *.o */*.o $(LIB_FILE)
+       $(RM) $(ALL_PROGRAMS) $(BUILT_INS) perf$X
+       $(RM) $(TEST_PROGRAMS)
+       $(RM) *.spec *.pyc *.pyo */*.pyc */*.pyo common-cmds.h TAGS tags cscope*
+       $(RM) -r autom4te.cache
+       $(RM) config.log config.mak.autogen config.mak.append config.status config.cache
+       $(RM) -r $(PERF_TARNAME) .doc-tmp-dir
+       $(RM) $(PERF_TARNAME).tar.gz perf-core_$(PERF_VERSION)-*.tar.gz
+       $(RM) $(htmldocs).tar.gz $(manpages).tar.gz
+       $(MAKE) -C Documentation/ clean
+       $(RM) PERF-VERSION-FILE PERF-CFLAGS PERF-BUILD-OPTIONS
+
+.PHONY: all install clean strip
+.PHONY: shell_compatibility_test please_set_SHELL_PATH_to_a_more_modern_shell
+.PHONY: .FORCE-PERF-VERSION-FILE TAGS tags cscope .FORCE-PERF-CFLAGS
+.PHONY: .FORCE-PERF-BUILD-OPTIONS
+
+### Make sure built-ins do not have dups and listed in perf.c
+#
+check-builtins::
+       ./check-builtins.sh
+
+### Test suite coverage testing
+#
+# None right now
+#
+# .PHONY: coverage coverage-clean coverage-build coverage-report
+#
+# coverage:
+#      $(MAKE) coverage-build
+#      $(MAKE) coverage-report
+#
+# coverage-clean:
+#      rm -f *.gcda *.gcno
+#
+# COVERAGE_CFLAGS = $(CFLAGS) -O0 -ftest-coverage -fprofile-arcs
+# COVERAGE_LDFLAGS = $(CFLAGS)  -O0 -lgcov
+#
+# coverage-build: coverage-clean
+#      $(MAKE) CFLAGS="$(COVERAGE_CFLAGS)" LDFLAGS="$(COVERAGE_LDFLAGS)" all
+#      $(MAKE) CFLAGS="$(COVERAGE_CFLAGS)" LDFLAGS="$(COVERAGE_LDFLAGS)" \
+#              -j1 test
+#
+# coverage-report:
+#      gcov -b *.c */*.c
+#      grep '^function.*called 0 ' *.c.gcov */*.c.gcov \
+#              | sed -e 's/\([^:]*\)\.gcov: *function \([^ ]*\) called.*/\1: \2/' \
+#              | tee coverage-untested-functions
diff --git a/tools/perf/builtin-annotate.c b/tools/perf/builtin-annotate.c

new file mode 100644 (file)

index 0000000..b1ed5f7
--- /dev/null
+++ b/tools/perf/builtin-annotate.c
@@ -0,0 +1,1356 @@
+/*
+ * builtin-annotate.c
+ *
+ * Builtin annotate command: Analyze the perf.data input file,
+ * look up and read DSOs and symbol information and display
+ * a histogram of results, along various sorting keys.
+ */
+#include "builtin.h"
+
+#include "util/util.h"
+
+#include "util/color.h"
+#include "util/list.h"
+#include "util/cache.h"
+#include "util/rbtree.h"
+#include "util/symbol.h"
+#include "util/string.h"
+
+#include "perf.h"
+
+#include "util/parse-options.h"
+#include "util/parse-events.h"
+
+#define SHOW_KERNEL    1
+#define SHOW_USER      2
+#define SHOW_HV                4
+
+static char            const *input_name = "perf.data";
+static char            *vmlinux = "vmlinux";
+
+static char            default_sort_order[] = "comm,symbol";
+static char            *sort_order = default_sort_order;
+
+static int             input;
+static int             show_mask = SHOW_KERNEL | SHOW_USER | SHOW_HV;
+
+static int             dump_trace = 0;
+#define dprintf(x...)  do { if (dump_trace) printf(x); } while (0)
+
+static int             verbose;
+
+static unsigned long   page_size;
+static unsigned long   mmap_window = 32;
+
+struct ip_event {
+       struct perf_event_header header;
+       __u64 ip;
+       __u32 pid, tid;
+};
+
+struct mmap_event {
+       struct perf_event_header header;
+       __u32 pid, tid;
+       __u64 start;
+       __u64 len;
+       __u64 pgoff;
+       char filename[PATH_MAX];
+};
+
+struct comm_event {
+       struct perf_event_header header;
+       __u32 pid, tid;
+       char comm[16];
+};
+
+struct fork_event {
+       struct perf_event_header header;
+       __u32 pid, ppid;
+};
+
+struct period_event {
+       struct perf_event_header header;
+       __u64 time;
+       __u64 id;
+       __u64 sample_period;
+};
+
+typedef union event_union {
+       struct perf_event_header        header;
+       struct ip_event                 ip;
+       struct mmap_event               mmap;
+       struct comm_event               comm;
+       struct fork_event               fork;
+       struct period_event             period;
+} event_t;
+
+static LIST_HEAD(dsos);
+static struct dso *kernel_dso;
+static struct dso *vdso;
+
+
+static void dsos__add(struct dso *dso)
+{
+       list_add_tail(&dso->node, &dsos);
+}
+
+static struct dso *dsos__find(const char *name)
+{
+       struct dso *pos;
+
+       list_for_each_entry(pos, &dsos, node)
+               if (strcmp(pos->name, name) == 0)
+                       return pos;
+       return NULL;
+}
+
+static struct dso *dsos__findnew(const char *name)
+{
+       struct dso *dso = dsos__find(name);
+       int nr;
+
+       if (dso)
+               return dso;
+
+       dso = dso__new(name, 0);
+       if (!dso)
+               goto out_delete_dso;
+
+       nr = dso__load(dso, NULL, verbose);
+       if (nr < 0) {
+               if (verbose)
+                       fprintf(stderr, "Failed to open: %s\n", name);
+               goto out_delete_dso;
+       }
+       if (!nr && verbose) {
+               fprintf(stderr,
+               "No symbols found in: %s, maybe install a debug package?\n",
+                               name);
+       }
+
+       dsos__add(dso);
+
+       return dso;
+
+out_delete_dso:
+       dso__delete(dso);
+       return NULL;
+}
+
+static void dsos__fprintf(FILE *fp)
+{
+       struct dso *pos;
+
+       list_for_each_entry(pos, &dsos, node)
+               dso__fprintf(pos, fp);
+}
+
+static struct symbol *vdso__find_symbol(struct dso *dso, __u64 ip)
+{
+       return dso__find_symbol(kernel_dso, ip);
+}
+
+static int load_kernel(void)
+{
+       int err;
+
+       kernel_dso = dso__new("[kernel]", 0);
+       if (!kernel_dso)
+               return -1;
+
+       err = dso__load_kernel(kernel_dso, vmlinux, NULL, verbose);
+       if (err) {
+               dso__delete(kernel_dso);
+               kernel_dso = NULL;
+       } else
+               dsos__add(kernel_dso);
+
+       vdso = dso__new("[vdso]", 0);
+       if (!vdso)
+               return -1;
+
+       vdso->find_symbol = vdso__find_symbol;
+
+       dsos__add(vdso);
+
+       return err;
+}
+
+struct map {
+       struct list_head node;
+       __u64    start;
+       __u64    end;
+       __u64    pgoff;
+       __u64    (*map_ip)(struct map *, __u64);
+       struct dso       *dso;
+};
+
+static __u64 map__map_ip(struct map *map, __u64 ip)
+{
+       return ip - map->start + map->pgoff;
+}
+
+static __u64 vdso__map_ip(struct map *map, __u64 ip)
+{
+       return ip;
+}
+
+static struct map *map__new(struct mmap_event *event)
+{
+       struct map *self = malloc(sizeof(*self));
+
+       if (self != NULL) {
+               const char *filename = event->filename;
+
+               self->start = event->start;
+               self->end   = event->start + event->len;
+               self->pgoff = event->pgoff;
+
+               self->dso = dsos__findnew(filename);
+               if (self->dso == NULL)
+                       goto out_delete;
+
+               if (self->dso == vdso)
+                       self->map_ip = vdso__map_ip;
+               else
+                       self->map_ip = map__map_ip;
+       }
+       return self;
+out_delete:
+       free(self);
+       return NULL;
+}
+
+static struct map *map__clone(struct map *self)
+{
+       struct map *map = malloc(sizeof(*self));
+
+       if (!map)
+               return NULL;
+
+       memcpy(map, self, sizeof(*self));
+
+       return map;
+}
+
+static int map__overlap(struct map *l, struct map *r)
+{
+       if (l->start > r->start) {
+               struct map *t = l;
+               l = r;
+               r = t;
+       }
+
+       if (l->end > r->start)
+               return 1;
+
+       return 0;
+}
+
+static size_t map__fprintf(struct map *self, FILE *fp)
+{
+       return fprintf(fp, " %Lx-%Lx %Lx %s\n",
+                      self->start, self->end, self->pgoff, self->dso->name);
+}
+
+
+struct thread {
+       struct rb_node   rb_node;
+       struct list_head maps;
+       pid_t            pid;
+       char             *comm;
+};
+
+static struct thread *thread__new(pid_t pid)
+{
+       struct thread *self = malloc(sizeof(*self));
+
+       if (self != NULL) {
+               self->pid = pid;
+               self->comm = malloc(32);
+               if (self->comm)
+                       snprintf(self->comm, 32, ":%d", self->pid);
+               INIT_LIST_HEAD(&self->maps);
+       }
+
+       return self;
+}
+
+static int thread__set_comm(struct thread *self, const char *comm)
+{
+       if (self->comm)
+               free(self->comm);
+       self->comm = strdup(comm);
+       return self->comm ? 0 : -ENOMEM;
+}
+
+static size_t thread__fprintf(struct thread *self, FILE *fp)
+{
+       struct map *pos;
+       size_t ret = fprintf(fp, "Thread %d %s\n", self->pid, self->comm);
+
+       list_for_each_entry(pos, &self->maps, node)
+               ret += map__fprintf(pos, fp);
+
+       return ret;
+}
+
+
+static struct rb_root threads;
+static struct thread *last_match;
+
+static struct thread *threads__findnew(pid_t pid)
+{
+       struct rb_node **p = &threads.rb_node;
+       struct rb_node *parent = NULL;
+       struct thread *th;
+
+       /*
+        * Font-end cache - PID lookups come in blocks,
+        * so most of the time we dont have to look up
+        * the full rbtree:
+        */
+       if (last_match && last_match->pid == pid)
+               return last_match;
+
+       while (*p != NULL) {
+               parent = *p;
+               th = rb_entry(parent, struct thread, rb_node);
+
+               if (th->pid == pid) {
+                       last_match = th;
+                       return th;
+               }
+
+               if (pid < th->pid)
+                       p = &(*p)->rb_left;
+               else
+                       p = &(*p)->rb_right;
+       }
+
+       th = thread__new(pid);
+       if (th != NULL) {
+               rb_link_node(&th->rb_node, parent, p);
+               rb_insert_color(&th->rb_node, &threads);
+               last_match = th;
+       }
+
+       return th;
+}
+
+static void thread__insert_map(struct thread *self, struct map *map)
+{
+       struct map *pos, *tmp;
+
+       list_for_each_entry_safe(pos, tmp, &self->maps, node) {
+               if (map__overlap(pos, map)) {
+                       list_del_init(&pos->node);
+                       /* XXX leaks dsos */
+                       free(pos);
+               }
+       }
+
+       list_add_tail(&map->node, &self->maps);
+}
+
+static int thread__fork(struct thread *self, struct thread *parent)
+{
+       struct map *map;
+
+       if (self->comm)
+               free(self->comm);
+       self->comm = strdup(parent->comm);
+       if (!self->comm)
+               return -ENOMEM;
+
+       list_for_each_entry(map, &parent->maps, node) {
+               struct map *new = map__clone(map);
+               if (!new)
+                       return -ENOMEM;
+               thread__insert_map(self, new);
+       }
+
+       return 0;
+}
+
+static struct map *thread__find_map(struct thread *self, __u64 ip)
+{
+       struct map *pos;
+
+       if (self == NULL)
+               return NULL;
+
+       list_for_each_entry(pos, &self->maps, node)
+               if (ip >= pos->start && ip <= pos->end)
+                       return pos;
+
+       return NULL;
+}
+
+static size_t threads__fprintf(FILE *fp)
+{
+       size_t ret = 0;
+       struct rb_node *nd;
+
+       for (nd = rb_first(&threads); nd; nd = rb_next(nd)) {
+               struct thread *pos = rb_entry(nd, struct thread, rb_node);
+
+               ret += thread__fprintf(pos, fp);
+       }
+
+       return ret;
+}
+
+/*
+ * histogram, sorted on item, collects counts
+ */
+
+static struct rb_root hist;
+
+struct hist_entry {
+       struct rb_node   rb_node;
+
+       struct thread    *thread;
+       struct map       *map;
+       struct dso       *dso;
+       struct symbol    *sym;
+       __u64    ip;
+       char             level;
+
+       uint32_t         count;
+};
+
+/*
+ * configurable sorting bits
+ */
+
+struct sort_entry {
+       struct list_head list;
+
+       char *header;
+
+       int64_t (*cmp)(struct hist_entry *, struct hist_entry *);
+       int64_t (*collapse)(struct hist_entry *, struct hist_entry *);
+       size_t  (*print)(FILE *fp, struct hist_entry *);
+};
+
+/* --sort pid */
+
+static int64_t
+sort__thread_cmp(struct hist_entry *left, struct hist_entry *right)
+{
+       return right->thread->pid - left->thread->pid;
+}
+
+static size_t
+sort__thread_print(FILE *fp, struct hist_entry *self)
+{
+       return fprintf(fp, "%16s:%5d", self->thread->comm ?: "", self->thread->pid);
+}
+
+static struct sort_entry sort_thread = {
+       .header = "         Command:  Pid",
+       .cmp    = sort__thread_cmp,
+       .print  = sort__thread_print,
+};
+
+/* --sort comm */
+
+static int64_t
+sort__comm_cmp(struct hist_entry *left, struct hist_entry *right)
+{
+       return right->thread->pid - left->thread->pid;
+}
+
+static int64_t
+sort__comm_collapse(struct hist_entry *left, struct hist_entry *right)
+{
+       char *comm_l = left->thread->comm;
+       char *comm_r = right->thread->comm;
+
+       if (!comm_l || !comm_r) {
+               if (!comm_l && !comm_r)
+                       return 0;
+               else if (!comm_l)
+                       return -1;
+               else
+                       return 1;
+       }
+
+       return strcmp(comm_l, comm_r);
+}
+
+static size_t
+sort__comm_print(FILE *fp, struct hist_entry *self)
+{
+       return fprintf(fp, "%16s", self->thread->comm);
+}
+
+static struct sort_entry sort_comm = {
+       .header         = "         Command",
+       .cmp            = sort__comm_cmp,
+       .collapse       = sort__comm_collapse,
+       .print          = sort__comm_print,
+};
+
+/* --sort dso */
+
+static int64_t
+sort__dso_cmp(struct hist_entry *left, struct hist_entry *right)
+{
+       struct dso *dso_l = left->dso;
+       struct dso *dso_r = right->dso;
+
+       if (!dso_l || !dso_r) {
+               if (!dso_l && !dso_r)
+                       return 0;
+               else if (!dso_l)
+                       return -1;
+               else
+                       return 1;
+       }
+
+       return strcmp(dso_l->name, dso_r->name);
+}
+
+static size_t
+sort__dso_print(FILE *fp, struct hist_entry *self)
+{
+       if (self->dso)
+               return fprintf(fp, "%-25s", self->dso->name);
+
+       return fprintf(fp, "%016llx         ", (__u64)self->ip);
+}
+
+static struct sort_entry sort_dso = {
+       .header = "Shared Object            ",
+       .cmp    = sort__dso_cmp,
+       .print  = sort__dso_print,
+};
+
+/* --sort symbol */
+
+static int64_t
+sort__sym_cmp(struct hist_entry *left, struct hist_entry *right)
+{
+       __u64 ip_l, ip_r;
+
+       if (left->sym == right->sym)
+               return 0;
+
+       ip_l = left->sym ? left->sym->start : left->ip;
+       ip_r = right->sym ? right->sym->start : right->ip;
+
+       return (int64_t)(ip_r - ip_l);
+}
+
+static size_t
+sort__sym_print(FILE *fp, struct hist_entry *self)
+{
+       size_t ret = 0;
+
+       if (verbose)
+               ret += fprintf(fp, "%#018llx  ", (__u64)self->ip);
+
+       if (self->sym) {
+               ret += fprintf(fp, "[%c] %s",
+                       self->dso == kernel_dso ? 'k' : '.', self->sym->name);
+       } else {
+               ret += fprintf(fp, "%#016llx", (__u64)self->ip);
+       }
+
+       return ret;
+}
+
+static struct sort_entry sort_sym = {
+       .header = "Symbol",
+       .cmp    = sort__sym_cmp,
+       .print  = sort__sym_print,
+};
+
+static int sort__need_collapse = 0;
+
+struct sort_dimension {
+       char                    *name;
+       struct sort_entry       *entry;
+       int                     taken;
+};
+
+static struct sort_dimension sort_dimensions[] = {
+       { .name = "pid",        .entry = &sort_thread,  },
+       { .name = "comm",       .entry = &sort_comm,    },
+       { .name = "dso",        .entry = &sort_dso,     },
+       { .name = "symbol",     .entry = &sort_sym,     },
+};
+
+static LIST_HEAD(hist_entry__sort_list);
+
+static int sort_dimension__add(char *tok)
+{
+       int i;
+
+       for (i = 0; i < ARRAY_SIZE(sort_dimensions); i++) {
+               struct sort_dimension *sd = &sort_dimensions[i];
+
+               if (sd->taken)
+                       continue;
+
+               if (strncasecmp(tok, sd->name, strlen(tok)))
+                       continue;
+
+               if (sd->entry->collapse)
+                       sort__need_collapse = 1;
+
+               list_add_tail(&sd->entry->list, &hist_entry__sort_list);
+               sd->taken = 1;
+
+               return 0;
+       }
+
+       return -ESRCH;
+}
+
+static int64_t
+hist_entry__cmp(struct hist_entry *left, struct hist_entry *right)
+{
+       struct sort_entry *se;
+       int64_t cmp = 0;
+
+       list_for_each_entry(se, &hist_entry__sort_list, list) {
+               cmp = se->cmp(left, right);
+               if (cmp)
+                       break;
+       }
+
+       return cmp;
+}
+
+static int64_t
+hist_entry__collapse(struct hist_entry *left, struct hist_entry *right)
+{
+       struct sort_entry *se;
+       int64_t cmp = 0;
+
+       list_for_each_entry(se, &hist_entry__sort_list, list) {
+               int64_t (*f)(struct hist_entry *, struct hist_entry *);
+
+               f = se->collapse ?: se->cmp;
+
+               cmp = f(left, right);
+               if (cmp)
+                       break;
+       }
+
+       return cmp;
+}
+
+/*
+ * collect histogram counts
+ */
+static void hist_hit(struct hist_entry *he, __u64 ip)
+{
+       unsigned int sym_size, offset;
+       struct symbol *sym = he->sym;
+
+       he->count++;
+
+       if (!sym || !sym->hist)
+               return;
+
+       sym_size = sym->end - sym->start;
+       offset = ip - sym->start;
+
+       if (offset >= sym_size)
+               return;
+
+       sym->hist_sum++;
+       sym->hist[offset]++;
+
+       if (verbose >= 3)
+               printf("%p %s: count++ [ip: %p, %08Lx] => %Ld\n",
+                       (void *)(unsigned long)he->sym->start,
+                       he->sym->name,
+                       (void *)(unsigned long)ip, ip - he->sym->start,
+                       sym->hist[offset]);
+}
+
+static int
+hist_entry__add(struct thread *thread, struct map *map, struct dso *dso,
+               struct symbol *sym, __u64 ip, char level)
+{
+       struct rb_node **p = &hist.rb_node;
+       struct rb_node *parent = NULL;
+       struct hist_entry *he;
+       struct hist_entry entry = {
+               .thread = thread,
+               .map    = map,
+               .dso    = dso,
+               .sym    = sym,
+               .ip     = ip,
+               .level  = level,
+               .count  = 1,
+       };
+       int cmp;
+
+       while (*p != NULL) {
+               parent = *p;
+               he = rb_entry(parent, struct hist_entry, rb_node);
+
+               cmp = hist_entry__cmp(&entry, he);
+
+               if (!cmp) {
+                       hist_hit(he, ip);
+
+                       return 0;
+               }
+
+               if (cmp < 0)
+                       p = &(*p)->rb_left;
+               else
+                       p = &(*p)->rb_right;
+       }
+
+       he = malloc(sizeof(*he));
+       if (!he)
+               return -ENOMEM;
+       *he = entry;
+       rb_link_node(&he->rb_node, parent, p);
+       rb_insert_color(&he->rb_node, &hist);
+
+       return 0;
+}
+
+static void hist_entry__free(struct hist_entry *he)
+{
+       free(he);
+}
+
+/*
+ * collapse the histogram
+ */
+
+static struct rb_root collapse_hists;
+
+static void collapse__insert_entry(struct hist_entry *he)
+{
+       struct rb_node **p = &collapse_hists.rb_node;
+       struct rb_node *parent = NULL;
+       struct hist_entry *iter;
+       int64_t cmp;
+
+       while (*p != NULL) {
+               parent = *p;
+               iter = rb_entry(parent, struct hist_entry, rb_node);
+
+               cmp = hist_entry__collapse(iter, he);
+
+               if (!cmp) {
+                       iter->count += he->count;
+                       hist_entry__free(he);
+                       return;
+               }
+
+               if (cmp < 0)
+                       p = &(*p)->rb_left;
+               else
+                       p = &(*p)->rb_right;
+       }
+
+       rb_link_node(&he->rb_node, parent, p);
+       rb_insert_color(&he->rb_node, &collapse_hists);
+}
+
+static void collapse__resort(void)
+{
+       struct rb_node *next;
+       struct hist_entry *n;
+
+       if (!sort__need_collapse)
+               return;
+
+       next = rb_first(&hist);
+       while (next) {
+               n = rb_entry(next, struct hist_entry, rb_node);
+               next = rb_next(&n->rb_node);
+
+               rb_erase(&n->rb_node, &hist);
+               collapse__insert_entry(n);
+       }
+}
+
+/*
+ * reverse the map, sort on count.
+ */
+
+static struct rb_root output_hists;
+
+static void output__insert_entry(struct hist_entry *he)
+{
+       struct rb_node **p = &output_hists.rb_node;
+       struct rb_node *parent = NULL;
+       struct hist_entry *iter;
+
+       while (*p != NULL) {
+               parent = *p;
+               iter = rb_entry(parent, struct hist_entry, rb_node);
+
+               if (he->count > iter->count)
+                       p = &(*p)->rb_left;
+               else
+                       p = &(*p)->rb_right;
+       }
+
+       rb_link_node(&he->rb_node, parent, p);
+       rb_insert_color(&he->rb_node, &output_hists);
+}
+
+static void output__resort(void)
+{
+       struct rb_node *next;
+       struct hist_entry *n;
+       struct rb_root *tree = &hist;
+
+       if (sort__need_collapse)
+               tree = &collapse_hists;
+
+       next = rb_first(tree);
+
+       while (next) {
+               n = rb_entry(next, struct hist_entry, rb_node);
+               next = rb_next(&n->rb_node);
+
+               rb_erase(&n->rb_node, tree);
+               output__insert_entry(n);
+       }
+}
+
+static void register_idle_thread(void)
+{
+       struct thread *thread = threads__findnew(0);
+
+       if (thread == NULL ||
+                       thread__set_comm(thread, "[idle]")) {
+               fprintf(stderr, "problem inserting idle task.\n");
+               exit(-1);
+       }
+}
+
+static unsigned long total = 0,
+                    total_mmap = 0,
+                    total_comm = 0,
+                    total_fork = 0,
+                    total_unknown = 0;
+
+static int
+process_overflow_event(event_t *event, unsigned long offset, unsigned long head)
+{
+       char level;
+       int show = 0;
+       struct dso *dso = NULL;
+       struct thread *thread = threads__findnew(event->ip.pid);
+       __u64 ip = event->ip.ip;
+       struct map *map = NULL;
+
+       dprintf("%p [%p]: PERF_EVENT (IP, %d): %d: %p\n",
+               (void *)(offset + head),
+               (void *)(long)(event->header.size),
+               event->header.misc,
+               event->ip.pid,
+               (void *)(long)ip);
+
+       dprintf(" ... thread: %s:%d\n", thread->comm, thread->pid);
+
+       if (thread == NULL) {
+               fprintf(stderr, "problem processing %d event, skipping it.\n",
+                       event->header.type);
+               return -1;
+       }
+
+       if (event->header.misc & PERF_EVENT_MISC_KERNEL) {
+               show = SHOW_KERNEL;
+               level = 'k';
+
+               dso = kernel_dso;
+
+               dprintf(" ...... dso: %s\n", dso->name);
+
+       } else if (event->header.misc & PERF_EVENT_MISC_USER) {
+
+               show = SHOW_USER;
+               level = '.';
+
+               map = thread__find_map(thread, ip);
+               if (map != NULL) {
+                       ip = map->map_ip(map, ip);
+                       dso = map->dso;
+               } else {
+                       /*
+                        * If this is outside of all known maps,
+                        * and is a negative address, try to look it
+                        * up in the kernel dso, as it might be a
+                        * vsyscall (which executes in user-mode):
+                        */
+                       if ((long long)ip < 0)
+                               dso = kernel_dso;
+               }
+               dprintf(" ...... dso: %s\n", dso ? dso->name : "<not found>");
+
+       } else {
+               show = SHOW_HV;
+               level = 'H';
+               dprintf(" ...... dso: [hypervisor]\n");
+       }
+
+       if (show & show_mask) {
+               struct symbol *sym = NULL;
+
+               if (dso)
+                       sym = dso->find_symbol(dso, ip);
+
+               if (hist_entry__add(thread, map, dso, sym, ip, level)) {
+                       fprintf(stderr,
+               "problem incrementing symbol count, skipping event\n");
+                       return -1;
+               }
+       }
+       total++;
+
+       return 0;
+}
+
+static int
+process_mmap_event(event_t *event, unsigned long offset, unsigned long head)
+{
+       struct thread *thread = threads__findnew(event->mmap.pid);
+       struct map *map = map__new(&event->mmap);
+
+       dprintf("%p [%p]: PERF_EVENT_MMAP %d: [%p(%p) @ %p]: %s\n",
+               (void *)(offset + head),
+               (void *)(long)(event->header.size),
+               event->mmap.pid,
+               (void *)(long)event->mmap.start,
+               (void *)(long)event->mmap.len,
+               (void *)(long)event->mmap.pgoff,
+               event->mmap.filename);
+
+       if (thread == NULL || map == NULL) {
+               dprintf("problem processing PERF_EVENT_MMAP, skipping event.\n");
+               return 0;
+       }
+
+       thread__insert_map(thread, map);
+       total_mmap++;
+
+       return 0;
+}
+
+static int
+process_comm_event(event_t *event, unsigned long offset, unsigned long head)
+{
+       struct thread *thread = threads__findnew(event->comm.pid);
+
+       dprintf("%p [%p]: PERF_EVENT_COMM: %s:%d\n",
+               (void *)(offset + head),
+               (void *)(long)(event->header.size),
+               event->comm.comm, event->comm.pid);
+
+       if (thread == NULL ||
+           thread__set_comm(thread, event->comm.comm)) {
+               dprintf("problem processing PERF_EVENT_COMM, skipping event.\n");
+               return -1;
+       }
+       total_comm++;
+
+       return 0;
+}
+
+static int
+process_fork_event(event_t *event, unsigned long offset, unsigned long head)
+{
+       struct thread *thread = threads__findnew(event->fork.pid);
+       struct thread *parent = threads__findnew(event->fork.ppid);
+
+       dprintf("%p [%p]: PERF_EVENT_FORK: %d:%d\n",
+               (void *)(offset + head),
+               (void *)(long)(event->header.size),
+               event->fork.pid, event->fork.ppid);
+
+       if (!thread || !parent || thread__fork(thread, parent)) {
+               dprintf("problem processing PERF_EVENT_FORK, skipping event.\n");
+               return -1;
+       }
+       total_fork++;
+
+       return 0;
+}
+
+static int
+process_period_event(event_t *event, unsigned long offset, unsigned long head)
+{
+       dprintf("%p [%p]: PERF_EVENT_PERIOD: time:%Ld, id:%Ld: period:%Ld\n",
+               (void *)(offset + head),
+               (void *)(long)(event->header.size),
+               event->period.time,
+               event->period.id,
+               event->period.sample_period);
+
+       return 0;
+}
+
+static int
+process_event(event_t *event, unsigned long offset, unsigned long head)
+{
+       if (event->header.misc & PERF_EVENT_MISC_OVERFLOW)
+               return process_overflow_event(event, offset, head);
+
+       switch (event->header.type) {
+       case PERF_EVENT_MMAP:
+               return process_mmap_event(event, offset, head);
+
+       case PERF_EVENT_COMM:
+               return process_comm_event(event, offset, head);
+
+       case PERF_EVENT_FORK:
+               return process_fork_event(event, offset, head);
+
+       case PERF_EVENT_PERIOD:
+               return process_period_event(event, offset, head);
+       /*
+        * We dont process them right now but they are fine:
+        */
+
+       case PERF_EVENT_THROTTLE:
+       case PERF_EVENT_UNTHROTTLE:
+               return 0;
+
+       default:
+               return -1;
+       }
+
+       return 0;
+}
+
+static int
+parse_line(FILE *file, struct symbol *sym, __u64 start, __u64 len)
+{
+       char *line = NULL, *tmp, *tmp2;
+       unsigned int offset;
+       size_t line_len;
+       __u64 line_ip;
+       int ret;
+       char *c;
+
+       if (getline(&line, &line_len, file) < 0)
+               return -1;
+       if (!line)
+               return -1;
+
+       c = strchr(line, '\n');
+       if (c)
+               *c = 0;
+
+       line_ip = -1;
+       offset = 0;
+       ret = -2;
+
+       /*
+        * Strip leading spaces:
+        */
+       tmp = line;
+       while (*tmp) {
+               if (*tmp != ' ')
+                       break;
+               tmp++;
+       }
+
+       if (*tmp) {
+               /*
+                * Parse hexa addresses followed by ':'
+                */
+               line_ip = strtoull(tmp, &tmp2, 16);
+               if (*tmp2 != ':')
+                       line_ip = -1;
+       }
+
+       if (line_ip != -1) {
+               unsigned int hits = 0;
+               double percent = 0.0;
+               char *color = PERF_COLOR_NORMAL;
+
+               offset = line_ip - start;
+               if (offset < len)
+                       hits = sym->hist[offset];
+
+               if (sym->hist_sum)
+                       percent = 100.0 * hits / sym->hist_sum;
+
+               /*
+                * We color high-overhead entries in red, mid-overhead
+                * entries in green - and keep the low overhead places
+                * normal:
+                */
+               if (percent >= 5.0)
+                       color = PERF_COLOR_RED;
+               else {
+                       if (percent > 0.5)
+                               color = PERF_COLOR_GREEN;
+               }
+
+               color_fprintf(stdout, color, " %7.2f", percent);
+               printf(" :      ");
+               color_fprintf(stdout, PERF_COLOR_BLUE, "%s\n", line);
+       } else {
+               if (!*line)
+                       printf("         :\n");
+               else
+                       printf("         :      %s\n", line);
+       }
+
+       return 0;
+}
+
+static void annotate_sym(struct dso *dso, struct symbol *sym)
+{
+       char *filename = dso->name;
+       __u64 start, end, len;
+       char command[PATH_MAX*2];
+       FILE *file;
+
+       if (!filename)
+               return;
+       if (dso == kernel_dso)
+               filename = vmlinux;
+
+       printf("\n------------------------------------------------\n");
+       printf(" Percent |      Source code & Disassembly of %s\n", filename);
+       printf("------------------------------------------------\n");
+
+       if (verbose >= 2)
+               printf("annotating [%p] %30s : [%p] %30s\n", dso, dso->name, sym, sym->name);
+
+       start = sym->obj_start;
+       if (!start)
+               start = sym->start;
+
+       end = start + sym->end - sym->start + 1;
+       len = sym->end - sym->start;
+
+       sprintf(command, "objdump --start-address=0x%016Lx --stop-address=0x%016Lx -dS %s", (__u64)start, (__u64)end, filename);
+
+       if (verbose >= 3)
+               printf("doing: %s\n", command);
+
+       file = popen(command, "r");
+       if (!file)
+               return;
+
+       while (!feof(file)) {
+               if (parse_line(file, sym, start, len) < 0)
+                       break;
+       }
+
+       pclose(file);
+}
+
+static void find_annotations(void)
+{
+       struct rb_node *nd;
+       struct dso *dso;
+       int count = 0;
+
+       list_for_each_entry(dso, &dsos, node) {
+
+               for (nd = rb_first(&dso->syms); nd; nd = rb_next(nd)) {
+                       struct symbol *sym = rb_entry(nd, struct symbol, rb_node);
+
+                       if (sym->hist) {
+                               annotate_sym(dso, sym);
+                               count++;
+                       }
+               }
+       }
+
+       if (!count)
+               printf(" Error: symbol '%s' not present amongst the samples.\n", sym_hist_filter);
+}
+
+static int __cmd_annotate(void)
+{
+       int ret, rc = EXIT_FAILURE;
+       unsigned long offset = 0;
+       unsigned long head = 0;
+       struct stat stat;
+       event_t *event;
+       uint32_t size;
+       char *buf;
+
+       register_idle_thread();
+
+       input = open(input_name, O_RDONLY);
+       if (input < 0) {
+               perror("failed to open file");
+               exit(-1);
+       }
+
+       ret = fstat(input, &stat);
+       if (ret < 0) {
+               perror("failed to stat file");
+               exit(-1);
+       }
+
+       if (!stat.st_size) {
+               fprintf(stderr, "zero-sized file, nothing to do!\n");
+               exit(0);
+       }
+
+       if (load_kernel() < 0) {
+               perror("failed to load kernel symbols");
+               return EXIT_FAILURE;
+       }
+
+remap:
+       buf = (char *)mmap(NULL, page_size * mmap_window, PROT_READ,
+                          MAP_SHARED, input, offset);
+       if (buf == MAP_FAILED) {
+               perror("failed to mmap file");
+               exit(-1);
+       }
+
+more:
+       event = (event_t *)(buf + head);
+
+       size = event->header.size;
+       if (!size)
+               size = 8;
+
+       if (head + event->header.size >= page_size * mmap_window) {
+               unsigned long shift = page_size * (head / page_size);
+               int ret;
+
+               ret = munmap(buf, page_size * mmap_window);
+               assert(ret == 0);
+
+               offset += shift;
+               head -= shift;
+               goto remap;
+       }
+
+       size = event->header.size;
+
+       dprintf("%p [%p]: event: %d\n",
+                       (void *)(offset + head),
+                       (void *)(long)event->header.size,
+                       event->header.type);
+
+       if (!size || process_event(event, offset, head) < 0) {
+
+               dprintf("%p [%p]: skipping unknown header type: %d\n",
+                       (void *)(offset + head),
+                       (void *)(long)(event->header.size),
+                       event->header.type);
+
+               total_unknown++;
+
+               /*
+                * assume we lost track of the stream, check alignment, and
+                * increment a single u64 in the hope to catch on again 'soon'.
+                */
+
+               if (unlikely(head & 7))
+                       head &= ~7ULL;
+
+               size = 8;
+       }
+
+       head += size;
+
+       if (offset + head < stat.st_size)
+               goto more;
+
+       rc = EXIT_SUCCESS;
+       close(input);
+
+       dprintf("      IP events: %10ld\n", total);
+       dprintf("    mmap events: %10ld\n", total_mmap);
+       dprintf("    comm events: %10ld\n", total_comm);
+       dprintf("    fork events: %10ld\n", total_fork);
+       dprintf(" unknown events: %10ld\n", total_unknown);
+
+       if (dump_trace)
+               return 0;
+
+       if (verbose >= 3)
+               threads__fprintf(stdout);
+
+       if (verbose >= 2)
+               dsos__fprintf(stdout);
+
+       collapse__resort();
+       output__resort();
+
+       find_annotations();
+
+       return rc;
+}
+
+static const char * const annotate_usage[] = {
+       "perf annotate [<options>] <command>",
+       NULL
+};
+
+static const struct option options[] = {
+       OPT_STRING('i', "input", &input_name, "file",
+                   "input file name"),
+       OPT_STRING('s', "symbol", &sym_hist_filter, "symbol",
+                   "symbol to annotate"),
+       OPT_BOOLEAN('v', "verbose", &verbose,
+                   "be more verbose (show symbol address, etc)"),
+       OPT_BOOLEAN('D', "dump-raw-trace", &dump_trace,
+                   "dump raw trace in ASCII"),
+       OPT_STRING('k', "vmlinux", &vmlinux, "file", "vmlinux pathname"),
+       OPT_END()
+};
+
+static void setup_sorting(void)
+{
+       char *tmp, *tok, *str = strdup(sort_order);
+
+       for (tok = strtok_r(str, ", ", &tmp);
+                       tok; tok = strtok_r(NULL, ", ", &tmp)) {
+               if (sort_dimension__add(tok) < 0) {
+                       error("Unknown --sort key: `%s'", tok);
+                       usage_with_options(annotate_usage, options);
+               }
+       }
+
+       free(str);
+}
+
+int cmd_annotate(int argc, const char **argv, const char *prefix)
+{
+       symbol__init();
+
+       page_size = getpagesize();
+
+       argc = parse_options(argc, argv, options, annotate_usage, 0);
+
+       setup_sorting();
+
+       if (argc) {
+               /*
+                * Special case: if there's an argument left then assume tha
+                * it's a symbol filter:
+                */
+               if (argc > 1)
+                       usage_with_options(annotate_usage, options);
+
+               sym_hist_filter = argv[0];
+       }
+
+       if (!sym_hist_filter)
+               usage_with_options(annotate_usage, options);
+
+       setup_pager();
+
+       return __cmd_annotate();
+}
diff --git a/tools/perf/builtin-help.c b/tools/perf/builtin-help.c

new file mode 100644 (file)

index 0000000..0f32dc3
--- /dev/null
+++ b/tools/perf/builtin-help.c
@@ -0,0 +1,461 @@
+/*
+ * builtin-help.c
+ *
+ * Builtin help command
+ */
+#include "util/cache.h"
+#include "builtin.h"
+#include "util/exec_cmd.h"
+#include "common-cmds.h"
+#include "util/parse-options.h"
+#include "util/run-command.h"
+#include "util/help.h"
+
+static struct man_viewer_list {
+       struct man_viewer_list *next;
+       char name[FLEX_ARRAY];
+} *man_viewer_list;
+
+static struct man_viewer_info_list {
+       struct man_viewer_info_list *next;
+       const char *info;
+       char name[FLEX_ARRAY];
+} *man_viewer_info_list;
+
+enum help_format {
+       HELP_FORMAT_MAN,
+       HELP_FORMAT_INFO,
+       HELP_FORMAT_WEB,
+};
+
+static int show_all = 0;
+static enum help_format help_format = HELP_FORMAT_MAN;
+static struct option builtin_help_options[] = {
+       OPT_BOOLEAN('a', "all", &show_all, "print all available commands"),
+       OPT_SET_INT('m', "man", &help_format, "show man page", HELP_FORMAT_MAN),
+       OPT_SET_INT('w', "web", &help_format, "show manual in web browser",
+                       HELP_FORMAT_WEB),
+       OPT_SET_INT('i', "info", &help_format, "show info page",
+                       HELP_FORMAT_INFO),
+       OPT_END(),
+};
+
+static const char * const builtin_help_usage[] = {
+       "perf help [--all] [--man|--web|--info] [command]",
+       NULL
+};
+
+static enum help_format parse_help_format(const char *format)
+{
+       if (!strcmp(format, "man"))
+               return HELP_FORMAT_MAN;
+       if (!strcmp(format, "info"))
+               return HELP_FORMAT_INFO;
+       if (!strcmp(format, "web") || !strcmp(format, "html"))
+               return HELP_FORMAT_WEB;
+       die("unrecognized help format '%s'", format);
+}
+
+static const char *get_man_viewer_info(const char *name)
+{
+       struct man_viewer_info_list *viewer;
+
+       for (viewer = man_viewer_info_list; viewer; viewer = viewer->next)
+       {
+               if (!strcasecmp(name, viewer->name))
+                       return viewer->info;
+       }
+       return NULL;
+}
+
+static int check_emacsclient_version(void)
+{
+       struct strbuf buffer = STRBUF_INIT;
+       struct child_process ec_process;
+       const char *argv_ec[] = { "emacsclient", "--version", NULL };
+       int version;
+
+       /* emacsclient prints its version number on stderr */
+       memset(&ec_process, 0, sizeof(ec_process));
+       ec_process.argv = argv_ec;
+       ec_process.err = -1;
+       ec_process.stdout_to_stderr = 1;
+       if (start_command(&ec_process)) {
+               fprintf(stderr, "Failed to start emacsclient.\n");
+               return -1;
+       }
+       strbuf_read(&buffer, ec_process.err, 20);
+       close(ec_process.err);
+
+       /*
+        * Don't bother checking return value, because "emacsclient --version"
+        * seems to always exits with code 1.
+        */
+       finish_command(&ec_process);
+
+       if (prefixcmp(buffer.buf, "emacsclient")) {
+               fprintf(stderr, "Failed to parse emacsclient version.\n");
+               strbuf_release(&buffer);
+               return -1;
+       }
+
+       strbuf_remove(&buffer, 0, strlen("emacsclient"));
+       version = atoi(buffer.buf);
+
+       if (version < 22) {
+               fprintf(stderr,
+                       "emacsclient version '%d' too old (< 22).\n",
+                       version);
+               strbuf_release(&buffer);
+               return -1;
+       }
+
+       strbuf_release(&buffer);
+       return 0;
+}
+
+static void exec_woman_emacs(const char* path, const char *page)
+{
+       if (!check_emacsclient_version()) {
+               /* This works only with emacsclient version >= 22. */
+               struct strbuf man_page = STRBUF_INIT;
+
+               if (!path)
+                       path = "emacsclient";
+               strbuf_addf(&man_page, "(woman \"%s\")", page);
+               execlp(path, "emacsclient", "-e", man_page.buf, NULL);
+               warning("failed to exec '%s': %s", path, strerror(errno));
+       }
+}
+
+static void exec_man_konqueror(const char* path, const char *page)
+{
+       const char *display = getenv("DISPLAY");
+       if (display && *display) {
+               struct strbuf man_page = STRBUF_INIT;
+               const char *filename = "kfmclient";
+
+               /* It's simpler to launch konqueror using kfmclient. */
+               if (path) {
+                       const char *file = strrchr(path, '/');
+                       if (file && !strcmp(file + 1, "konqueror")) {
+                               char *new = strdup(path);
+                               char *dest = strrchr(new, '/');
+
+                               /* strlen("konqueror") == strlen("kfmclient") */
+                               strcpy(dest + 1, "kfmclient");
+                               path = new;
+                       }
+                       if (file)
+                               filename = file;
+               } else
+                       path = "kfmclient";
+               strbuf_addf(&man_page, "man:%s(1)", page);
+               execlp(path, filename, "newTab", man_page.buf, NULL);
+               warning("failed to exec '%s': %s", path, strerror(errno));
+       }
+}
+
+static void exec_man_man(const char* path, const char *page)
+{
+       if (!path)
+               path = "man";
+       execlp(path, "man", page, NULL);
+       warning("failed to exec '%s': %s", path, strerror(errno));
+}
+
+static void exec_man_cmd(const char *cmd, const char *page)
+{
+       struct strbuf shell_cmd = STRBUF_INIT;
+       strbuf_addf(&shell_cmd, "%s %s", cmd, page);
+       execl("/bin/sh", "sh", "-c", shell_cmd.buf, NULL);
+       warning("failed to exec '%s': %s", cmd, strerror(errno));
+}
+
+static void add_man_viewer(const char *name)
+{
+       struct man_viewer_list **p = &man_viewer_list;
+       size_t len = strlen(name);
+
+       while (*p)
+               p = &((*p)->next);
+       *p = calloc(1, (sizeof(**p) + len + 1));
+       strncpy((*p)->name, name, len);
+}
+
+static int supported_man_viewer(const char *name, size_t len)
+{
+       return (!strncasecmp("man", name, len) ||
+               !strncasecmp("woman", name, len) ||
+               !strncasecmp("konqueror", name, len));
+}
+
+static void do_add_man_viewer_info(const char *name,
+                                  size_t len,
+                                  const char *value)
+{
+       struct man_viewer_info_list *new = calloc(1, sizeof(*new) + len + 1);
+
+       strncpy(new->name, name, len);
+       new->info = strdup(value);
+       new->next = man_viewer_info_list;
+       man_viewer_info_list = new;
+}
+
+static int add_man_viewer_path(const char *name,
+                              size_t len,
+                              const char *value)
+{
+       if (supported_man_viewer(name, len))
+               do_add_man_viewer_info(name, len, value);
+       else
+               warning("'%s': path for unsupported man viewer.\n"
+                       "Please consider using 'man.<tool>.cmd' instead.",
+                       name);
+
+       return 0;
+}
+
+static int add_man_viewer_cmd(const char *name,
+                             size_t len,
+                             const char *value)
+{
+       if (supported_man_viewer(name, len))
+               warning("'%s': cmd for supported man viewer.\n"
+                       "Please consider using 'man.<tool>.path' instead.",
+                       name);
+       else
+               do_add_man_viewer_info(name, len, value);
+
+       return 0;
+}
+
+static int add_man_viewer_info(const char *var, const char *value)
+{
+       const char *name = var + 4;
+       const char *subkey = strrchr(name, '.');
+
+       if (!subkey)
+               return error("Config with no key for man viewer: %s", name);
+
+       if (!strcmp(subkey, ".path")) {
+               if (!value)
+                       return config_error_nonbool(var);
+               return add_man_viewer_path(name, subkey - name, value);
+       }
+       if (!strcmp(subkey, ".cmd")) {
+               if (!value)
+                       return config_error_nonbool(var);
+               return add_man_viewer_cmd(name, subkey - name, value);
+       }
+
+       warning("'%s': unsupported man viewer sub key.", subkey);
+       return 0;
+}
+
+static int perf_help_config(const char *var, const char *value, void *cb)
+{
+       if (!strcmp(var, "help.format")) {
+               if (!value)
+                       return config_error_nonbool(var);
+               help_format = parse_help_format(value);
+               return 0;
+       }
+       if (!strcmp(var, "man.viewer")) {
+               if (!value)
+                       return config_error_nonbool(var);
+               add_man_viewer(value);
+               return 0;
+       }
+       if (!prefixcmp(var, "man."))
+               return add_man_viewer_info(var, value);
+
+       return perf_default_config(var, value, cb);
+}
+
+static struct cmdnames main_cmds, other_cmds;
+
+void list_common_cmds_help(void)
+{
+       int i, longest = 0;
+
+       for (i = 0; i < ARRAY_SIZE(common_cmds); i++) {
+               if (longest < strlen(common_cmds[i].name))
+                       longest = strlen(common_cmds[i].name);
+       }
+
+       puts(" The most commonly used perf commands are:");
+       for (i = 0; i < ARRAY_SIZE(common_cmds); i++) {
+               printf("   %s   ", common_cmds[i].name);
+               mput_char(' ', longest - strlen(common_cmds[i].name));
+               puts(common_cmds[i].help);
+       }
+}
+
+static int is_perf_command(const char *s)
+{
+       return is_in_cmdlist(&main_cmds, s) ||
+               is_in_cmdlist(&other_cmds, s);
+}
+
+static const char *prepend(const char *prefix, const char *cmd)
+{
+       size_t pre_len = strlen(prefix);
+       size_t cmd_len = strlen(cmd);
+       char *p = malloc(pre_len + cmd_len + 1);
+       memcpy(p, prefix, pre_len);
+       strcpy(p + pre_len, cmd);
+       return p;
+}
+
+static const char *cmd_to_page(const char *perf_cmd)
+{
+       if (!perf_cmd)
+               return "perf";
+       else if (!prefixcmp(perf_cmd, "perf"))
+               return perf_cmd;
+       else if (is_perf_command(perf_cmd))
+               return prepend("perf-", perf_cmd);
+       else
+               return prepend("perf-", perf_cmd);
+}
+
+static void setup_man_path(void)
+{
+       struct strbuf new_path = STRBUF_INIT;
+       const char *old_path = getenv("MANPATH");
+
+       /* We should always put ':' after our path. If there is no
+        * old_path, the ':' at the end will let 'man' to try
+        * system-wide paths after ours to find the manual page. If
+        * there is old_path, we need ':' as delimiter. */
+       strbuf_addstr(&new_path, system_path(PERF_MAN_PATH));
+       strbuf_addch(&new_path, ':');
+       if (old_path)
+               strbuf_addstr(&new_path, old_path);
+
+       setenv("MANPATH", new_path.buf, 1);
+
+       strbuf_release(&new_path);
+}
+
+static void exec_viewer(const char *name, const char *page)
+{
+       const char *info = get_man_viewer_info(name);
+
+       if (!strcasecmp(name, "man"))
+               exec_man_man(info, page);
+       else if (!strcasecmp(name, "woman"))
+               exec_woman_emacs(info, page);
+       else if (!strcasecmp(name, "konqueror"))
+               exec_man_konqueror(info, page);
+       else if (info)
+               exec_man_cmd(info, page);
+       else
+               warning("'%s': unknown man viewer.", name);
+}
+
+static void show_man_page(const char *perf_cmd)
+{
+       struct man_viewer_list *viewer;
+       const char *page = cmd_to_page(perf_cmd);
+       const char *fallback = getenv("PERF_MAN_VIEWER");
+
+       setup_man_path();
+       for (viewer = man_viewer_list; viewer; viewer = viewer->next)
+       {
+               exec_viewer(viewer->name, page); /* will return when unable */
+       }
+       if (fallback)
+               exec_viewer(fallback, page);
+       exec_viewer("man", page);
+       die("no man viewer handled the request");
+}
+
+static void show_info_page(const char *perf_cmd)
+{
+       const char *page = cmd_to_page(perf_cmd);
+       setenv("INFOPATH", system_path(PERF_INFO_PATH), 1);
+       execlp("info", "info", "perfman", page, NULL);
+}
+
+static void get_html_page_path(struct strbuf *page_path, const char *page)
+{
+       struct stat st;
+       const char *html_path = system_path(PERF_HTML_PATH);
+
+       /* Check that we have a perf documentation directory. */
+       if (stat(mkpath("%s/perf.html", html_path), &st)
+           || !S_ISREG(st.st_mode))
+               die("'%s': not a documentation directory.", html_path);
+
+       strbuf_init(page_path, 0);
+       strbuf_addf(page_path, "%s/%s.html", html_path, page);
+}
+
+/*
+ * If open_html is not defined in a platform-specific way (see for
+ * example compat/mingw.h), we use the script web--browse to display
+ * HTML.
+ */
+#ifndef open_html
+static void open_html(const char *path)
+{
+       execl_perf_cmd("web--browse", "-c", "help.browser", path, NULL);
+}
+#endif
+
+static void show_html_page(const char *perf_cmd)
+{
+       const char *page = cmd_to_page(perf_cmd);
+       struct strbuf page_path; /* it leaks but we exec bellow */
+
+       get_html_page_path(&page_path, page);
+
+       open_html(page_path.buf);
+}
+
+int cmd_help(int argc, const char **argv, const char *prefix)
+{
+       const char *alias;
+       load_command_list("perf-", &main_cmds, &other_cmds);
+
+       perf_config(perf_help_config, NULL);
+
+       argc = parse_options(argc, argv, builtin_help_options,
+                       builtin_help_usage, 0);
+
+       if (show_all) {
+               printf("\n usage: %s\n\n", perf_usage_string);
+               list_commands("perf commands", &main_cmds, &other_cmds);
+               printf(" %s\n\n", perf_more_info_string);
+               return 0;
+       }
+
+       if (!argv[0]) {
+               printf("\n usage: %s\n\n", perf_usage_string);
+               list_common_cmds_help();
+               printf("\n %s\n\n", perf_more_info_string);
+               return 0;
+       }
+
+       alias = alias_lookup(argv[0]);
+       if (alias && !is_perf_command(argv[0])) {
+               printf("`perf %s' is aliased to `%s'\n", argv[0], alias);
+               return 0;
+       }
+
+       switch (help_format) {
+       case HELP_FORMAT_MAN:
+               show_man_page(argv[0]);
+               break;
+       case HELP_FORMAT_INFO:
+               show_info_page(argv[0]);
+               break;
+       case HELP_FORMAT_WEB:
+               show_html_page(argv[0]);
+               break;
+       }
+
+       return 0;
+}
diff --git a/tools/perf/builtin-list.c b/tools/perf/builtin-list.c

new file mode 100644 (file)

index 0000000..fe60e37
--- /dev/null
+++ b/tools/perf/builtin-list.c
@@ -0,0 +1,20 @@
+/*
+ * builtin-list.c
+ *
+ * Builtin list command: list all event types
+ *
+ * Copyright (C) 2009, Thomas Gleixner <tglx@linutronix.de>
+ * Copyright (C) 2008-2009, Red Hat Inc, Ingo Molnar <mingo@redhat.com>
+ */
+#include "builtin.h"
+
+#include "perf.h"
+
+#include "util/parse-options.h"
+#include "util/parse-events.h"
+
+int cmd_list(int argc, const char **argv, const char *prefix)
+{
+       print_events();
+       return 0;
+}
diff --git a/tools/perf/builtin-record.c b/tools/perf/builtin-record.c

new file mode 100644 (file)

index 0000000..29259e7
--- /dev/null
+++ b/tools/perf/builtin-record.c
@@ -0,0 +1,582 @@
+/*
+ * builtin-record.c
+ *
+ * Builtin record command: Record the profile of a workload
+ * (or a CPU, or a PID) into the perf.data output file - for
+ * later analysis via perf report.
+ */
+#include "builtin.h"
+
+#include "perf.h"
+
+#include "util/util.h"
+#include "util/parse-options.h"
+#include "util/parse-events.h"
+#include "util/string.h"
+
+#include <unistd.h>
+#include <sched.h>
+
+#define ALIGN(x, a)            __ALIGN_MASK(x, (typeof(x))(a)-1)
+#define __ALIGN_MASK(x, mask)  (((x)+(mask))&~(mask))
+
+static int                     fd[MAX_NR_CPUS][MAX_COUNTERS];
+
+static long                    default_interval                = 100000;
+
+static int                     nr_cpus                         = 0;
+static unsigned int            page_size;
+static unsigned int            mmap_pages                      = 128;
+static int                     freq                            = 0;
+static int                     output;
+static const char              *output_name                    = "perf.data";
+static int                     group                           = 0;
+static unsigned int            realtime_prio                   = 0;
+static int                     system_wide                     = 0;
+static pid_t                   target_pid                      = -1;
+static int                     inherit                         = 1;
+static int                     force                           = 0;
+static int                     append_file                     = 0;
+static int                     verbose                         = 0;
+
+static long                    samples;
+static struct timeval          last_read;
+static struct timeval          this_read;
+
+static __u64                   bytes_written;
+
+static struct pollfd           event_array[MAX_NR_CPUS * MAX_COUNTERS];
+
+static int                     nr_poll;
+static int                     nr_cpu;
+
+struct mmap_event {
+       struct perf_event_header        header;
+       __u32                           pid;
+       __u32                           tid;
+       __u64                           start;
+       __u64                           len;
+       __u64                           pgoff;
+       char                            filename[PATH_MAX];
+};
+
+struct comm_event {
+       struct perf_event_header        header;
+       __u32                           pid;
+       __u32                           tid;
+       char                            comm[16];
+};
+
+
+struct mmap_data {
+       int                     counter;
+       void                    *base;
+       unsigned int            mask;
+       unsigned int            prev;
+};
+
+static struct mmap_data                mmap_array[MAX_NR_CPUS][MAX_COUNTERS];
+
+static unsigned int mmap_read_head(struct mmap_data *md)
+{
+       struct perf_counter_mmap_page *pc = md->base;
+       int head;
+
+       head = pc->data_head;
+       rmb();
+
+       return head;
+}
+
+static void mmap_read(struct mmap_data *md)
+{
+       unsigned int head = mmap_read_head(md);
+       unsigned int old = md->prev;
+       unsigned char *data = md->base + page_size;
+       unsigned long size;
+       void *buf;
+       int diff;
+
+       gettimeofday(&this_read, NULL);
+
+       /*
+        * If we're further behind than half the buffer, there's a chance
+        * the writer will bite our tail and mess up the samples under us.
+        *
+        * If we somehow ended up ahead of the head, we got messed up.
+        *
+        * In either case, truncate and restart at head.
+        */
+       diff = head - old;
+       if (diff > md->mask / 2 || diff < 0) {
+               struct timeval iv;
+               unsigned long msecs;
+
+               timersub(&this_read, &last_read, &iv);
+               msecs = iv.tv_sec*1000 + iv.tv_usec/1000;
+
+               fprintf(stderr, "WARNING: failed to keep up with mmap data."
+                               "  Last read %lu msecs ago.\n", msecs);
+
+               /*
+                * head points to a known good entry, start there.
+                */
+               old = head;
+       }
+
+       last_read = this_read;
+
+       if (old != head)
+               samples++;
+
+       size = head - old;
+
+       if ((old & md->mask) + size != (head & md->mask)) {
+               buf = &data[old & md->mask];
+               size = md->mask + 1 - (old & md->mask);
+               old += size;
+
+               while (size) {
+                       int ret = write(output, buf, size);
+
+                       if (ret < 0)
+                               die("failed to write");
+
+                       size -= ret;
+                       buf += ret;
+
+                       bytes_written += ret;
+               }
+       }
+
+       buf = &data[old & md->mask];
+       size = head - old;
+       old += size;
+
+       while (size) {
+               int ret = write(output, buf, size);
+
+               if (ret < 0)
+                       die("failed to write");
+
+               size -= ret;
+               buf += ret;
+
+               bytes_written += ret;
+       }
+
+       md->prev = old;
+}
+
+static volatile int done = 0;
+static volatile int signr = -1;
+
+static void sig_handler(int sig)
+{
+       done = 1;
+       signr = sig;
+}
+
+static void sig_atexit(void)
+{
+       if (signr == -1)
+               return;
+
+       signal(signr, SIG_DFL);
+       kill(getpid(), signr);
+}
+
+static void pid_synthesize_comm_event(pid_t pid, int full)
+{
+       struct comm_event comm_ev;
+       char filename[PATH_MAX];
+       char bf[BUFSIZ];
+       int fd, ret;
+       size_t size;
+       char *field, *sep;
+       DIR *tasks;
+       struct dirent dirent, *next;
+
+       snprintf(filename, sizeof(filename), "/proc/%d/stat", pid);
+
+       fd = open(filename, O_RDONLY);
+       if (fd < 0) {
+               fprintf(stderr, "couldn't open %s\n", filename);
+               exit(EXIT_FAILURE);
+       }
+       if (read(fd, bf, sizeof(bf)) < 0) {
+               fprintf(stderr, "couldn't read %s\n", filename);
+               exit(EXIT_FAILURE);
+       }
+       close(fd);
+
+       /* 9027 (cat) R 6747 9027 6747 34816 9027 ... */
+       memset(&comm_ev, 0, sizeof(comm_ev));
+       field = strchr(bf, '(');
+       if (field == NULL)
+               goto out_failure;
+       sep = strchr(++field, ')');
+       if (sep == NULL)
+               goto out_failure;
+       size = sep - field;
+       memcpy(comm_ev.comm, field, size++);
+
+       comm_ev.pid = pid;
+       comm_ev.header.type = PERF_EVENT_COMM;
+       size = ALIGN(size, sizeof(__u64));
+       comm_ev.header.size = sizeof(comm_ev) - (sizeof(comm_ev.comm) - size);
+
+       if (!full) {
+               comm_ev.tid = pid;
+
+               ret = write(output, &comm_ev, comm_ev.header.size);
+               if (ret < 0) {
+                       perror("failed to write");
+                       exit(-1);
+               }
+               return;
+       }
+
+       snprintf(filename, sizeof(filename), "/proc/%d/task", pid);
+
+       tasks = opendir(filename);
+       while (!readdir_r(tasks, &dirent, &next) && next) {
+               char *end;
+               pid = strtol(dirent.d_name, &end, 10);
+               if (*end)
+                       continue;
+
+               comm_ev.tid = pid;
+
+               ret = write(output, &comm_ev, comm_ev.header.size);
+               if (ret < 0) {
+                       perror("failed to write");
+                       exit(-1);
+               }
+       }
+       closedir(tasks);
+       return;
+
+out_failure:
+       fprintf(stderr, "couldn't get COMM and pgid, malformed %s\n",
+               filename);
+       exit(EXIT_FAILURE);
+}
+
+static void pid_synthesize_mmap_samples(pid_t pid)
+{
+       char filename[PATH_MAX];
+       FILE *fp;
+
+       snprintf(filename, sizeof(filename), "/proc/%d/maps", pid);
+
+       fp = fopen(filename, "r");
+       if (fp == NULL) {
+               fprintf(stderr, "couldn't open %s\n", filename);
+               exit(EXIT_FAILURE);
+       }
+       while (1) {
+               char bf[BUFSIZ], *pbf = bf;
+               struct mmap_event mmap_ev = {
+                       .header.type = PERF_EVENT_MMAP,
+               };
+               int n;
+               size_t size;
+               if (fgets(bf, sizeof(bf), fp) == NULL)
+                       break;
+
+               /* 00400000-0040c000 r-xp 00000000 fd:01 41038  /bin/cat */
+               n = hex2u64(pbf, &mmap_ev.start);
+               if (n < 0)
+                       continue;
+               pbf += n + 1;
+               n = hex2u64(pbf, &mmap_ev.len);
+               if (n < 0)
+                       continue;
+               pbf += n + 3;
+               if (*pbf == 'x') { /* vm_exec */
+                       char *execname = strrchr(bf, ' ');
+
+                       if (execname == NULL || execname[1] != '/')
+                               continue;
+
+                       execname += 1;
+                       size = strlen(execname);
+                       execname[size - 1] = '\0'; /* Remove \n */
+                       memcpy(mmap_ev.filename, execname, size);
+                       size = ALIGN(size, sizeof(__u64));
+                       mmap_ev.len -= mmap_ev.start;
+                       mmap_ev.header.size = (sizeof(mmap_ev) -
+                                              (sizeof(mmap_ev.filename) - size));
+                       mmap_ev.pid = pid;
+                       mmap_ev.tid = pid;
+
+                       if (write(output, &mmap_ev, mmap_ev.header.size) < 0) {
+                               perror("failed to write");
+                               exit(-1);
+                       }
+               }
+       }
+
+       fclose(fp);
+}
+
+static void synthesize_samples(void)
+{
+       DIR *proc;
+       struct dirent dirent, *next;
+
+       proc = opendir("/proc");
+
+       while (!readdir_r(proc, &dirent, &next) && next) {
+               char *end;
+               pid_t pid;
+
+               pid = strtol(dirent.d_name, &end, 10);
+               if (*end) /* only interested in proper numerical dirents */
+                       continue;
+
+               pid_synthesize_comm_event(pid, 1);
+               pid_synthesize_mmap_samples(pid);
+       }
+
+       closedir(proc);
+}
+
+static int group_fd;
+
+static void create_counter(int counter, int cpu, pid_t pid)
+{
+       struct perf_counter_attr *attr = attrs + counter;
+       int track = 1;
+
+       attr->sample_type       = PERF_SAMPLE_IP | PERF_SAMPLE_TID;
+       if (freq) {
+               attr->sample_type       |= PERF_SAMPLE_PERIOD;
+               attr->freq              = 1;
+               attr->sample_freq       = freq;
+       }
+       attr->mmap              = track;
+       attr->comm              = track;
+       attr->inherit           = (cpu < 0) && inherit;
+       attr->disabled          = 1;
+
+       track = 0; /* only the first counter needs these */
+
+try_again:
+       fd[nr_cpu][counter] = sys_perf_counter_open(attr, pid, cpu, group_fd, 0);
+
+       if (fd[nr_cpu][counter] < 0) {
+               int err = errno;
+
+               if (err == EPERM)
+                       die("Permission error - are you root?\n");
+
+               /*
+                * If it's cycles then fall back to hrtimer
+                * based cpu-clock-tick sw counter, which
+                * is always available even if no PMU support:
+                */
+               if (attr->type == PERF_TYPE_HARDWARE
+                       && attr->config == PERF_COUNT_HW_CPU_CYCLES) {
+
+                       if (verbose)
+                               warning(" ... trying to fall back to cpu-clock-ticks\n");
+                       attr->type = PERF_TYPE_SOFTWARE;
+                       attr->config = PERF_COUNT_SW_CPU_CLOCK;
+                       goto try_again;
+               }
+               printf("\n");
+               error("perfcounter syscall returned with %d (%s)\n",
+                       fd[nr_cpu][counter], strerror(err));
+               die("No CONFIG_PERF_COUNTERS=y kernel support configured?\n");
+               exit(-1);
+       }
+
+       assert(fd[nr_cpu][counter] >= 0);
+       fcntl(fd[nr_cpu][counter], F_SETFL, O_NONBLOCK);
+
+       /*
+        * First counter acts as the group leader:
+        */
+       if (group && group_fd == -1)
+               group_fd = fd[nr_cpu][counter];
+
+       event_array[nr_poll].fd = fd[nr_cpu][counter];
+       event_array[nr_poll].events = POLLIN;
+       nr_poll++;
+
+       mmap_array[nr_cpu][counter].counter = counter;
+       mmap_array[nr_cpu][counter].prev = 0;
+       mmap_array[nr_cpu][counter].mask = mmap_pages*page_size - 1;
+       mmap_array[nr_cpu][counter].base = mmap(NULL, (mmap_pages+1)*page_size,
+                       PROT_READ, MAP_SHARED, fd[nr_cpu][counter], 0);
+       if (mmap_array[nr_cpu][counter].base == MAP_FAILED) {
+               error("failed to mmap with %d (%s)\n", errno, strerror(errno));
+               exit(-1);
+       }
+
+       ioctl(fd[nr_cpu][counter], PERF_COUNTER_IOC_ENABLE);
+}
+
+static void open_counters(int cpu, pid_t pid)
+{
+       int counter;
+
+       if (pid > 0) {
+               pid_synthesize_comm_event(pid, 0);
+               pid_synthesize_mmap_samples(pid);
+       }
+
+       group_fd = -1;
+       for (counter = 0; counter < nr_counters; counter++)
+               create_counter(counter, cpu, pid);
+
+       nr_cpu++;
+}
+
+static int __cmd_record(int argc, const char **argv)
+{
+       int i, counter;
+       struct stat st;
+       pid_t pid;
+       int flags;
+       int ret;
+
+       page_size = sysconf(_SC_PAGE_SIZE);
+       nr_cpus = sysconf(_SC_NPROCESSORS_ONLN);
+       assert(nr_cpus <= MAX_NR_CPUS);
+       assert(nr_cpus >= 0);
+
+       if (!stat(output_name, &st) && !force && !append_file) {
+               fprintf(stderr, "Error, output file %s exists, use -A to append or -f to overwrite.\n",
+                               output_name);
+               exit(-1);
+       }
+
+       flags = O_CREAT|O_RDWR;
+       if (append_file)
+               flags |= O_APPEND;
+       else
+               flags |= O_TRUNC;
+
+       output = open(output_name, flags, S_IRUSR|S_IWUSR);
+       if (output < 0) {
+               perror("failed to create output file");
+               exit(-1);
+       }
+
+       if (!system_wide) {
+               open_counters(-1, target_pid != -1 ? target_pid : getpid());
+       } else for (i = 0; i < nr_cpus; i++)
+               open_counters(i, target_pid);
+
+       atexit(sig_atexit);
+       signal(SIGCHLD, sig_handler);
+       signal(SIGINT, sig_handler);
+
+       if (target_pid == -1 && argc) {
+               pid = fork();
+               if (pid < 0)
+                       perror("failed to fork");
+
+               if (!pid) {
+                       if (execvp(argv[0], (char **)argv)) {
+                               perror(argv[0]);
+                               exit(-1);
+                       }
+               }
+       }
+
+       if (realtime_prio) {
+               struct sched_param param;
+
+               param.sched_priority = realtime_prio;
+               if (sched_setscheduler(0, SCHED_FIFO, &param)) {
+                       printf("Could not set realtime priority.\n");
+                       exit(-1);
+               }
+       }
+
+       if (system_wide)
+               synthesize_samples();
+
+       while (!done) {
+               int hits = samples;
+
+               for (i = 0; i < nr_cpu; i++) {
+                       for (counter = 0; counter < nr_counters; counter++)
+                               mmap_read(&mmap_array[i][counter]);
+               }
+
+               if (hits == samples)
+                       ret = poll(event_array, nr_poll, 100);
+       }
+
+       /*
+        * Approximate RIP event size: 24 bytes.
+        */
+       fprintf(stderr,
+               "[ perf record: Captured and wrote %.3f MB %s (~%lld samples) ]\n",
+               (double)bytes_written / 1024.0 / 1024.0,
+               output_name,
+               bytes_written / 24);
+
+       return 0;
+}
+
+static const char * const record_usage[] = {
+       "perf record [<options>] [<command>]",
+       "perf record [<options>] -- <command> [<options>]",
+       NULL
+};
+
+static const struct option options[] = {
+       OPT_CALLBACK('e', "event", NULL, "event",
+                    "event selector. use 'perf list' to list available events",
+                    parse_events),
+       OPT_INTEGER('p', "pid", &target_pid,
+                   "record events on existing pid"),
+       OPT_INTEGER('r', "realtime", &realtime_prio,
+                   "collect data with this RT SCHED_FIFO priority"),
+       OPT_BOOLEAN('a', "all-cpus", &system_wide,
+                           "system-wide collection from all CPUs"),
+       OPT_BOOLEAN('A', "append", &append_file,
+                           "append to the output file to do incremental profiling"),
+       OPT_BOOLEAN('f', "force", &force,
+                       "overwrite existing data file"),
+       OPT_LONG('c', "count", &default_interval,
+                   "event period to sample"),
+       OPT_STRING('o', "output", &output_name, "file",
+                   "output file name"),
+       OPT_BOOLEAN('i', "inherit", &inherit,
+                   "child tasks inherit counters"),
+       OPT_INTEGER('F', "freq", &freq,
+                   "profile at this frequency"),
+       OPT_INTEGER('m', "mmap-pages", &mmap_pages,
+                   "number of mmap data pages"),
+       OPT_BOOLEAN('v', "verbose", &verbose,
+                   "be more verbose (show counter open errors, etc)"),
+       OPT_END()
+};
+
+int cmd_record(int argc, const char **argv, const char *prefix)
+{
+       int counter;
+
+       argc = parse_options(argc, argv, options, record_usage, 0);
+       if (!argc && target_pid == -1 && !system_wide)
+               usage_with_options(record_usage, options);
+
+       if (!nr_counters)
+               nr_counters = 1;
+
+       for (counter = 0; counter < nr_counters; counter++) {
+               if (attrs[counter].sample_period)
+                       continue;
+
+               attrs[counter].sample_period = default_interval;
+       }
+
+       return __cmd_record(argc, argv);
+}
diff --git a/tools/perf/builtin-report.c b/tools/perf/builtin-report.c

new file mode 100644 (file)

index 0000000..82fa93b
--- /dev/null
+++ b/tools/perf/builtin-report.c
@@ -0,0 +1,1316 @@
+/*
+ * builtin-report.c
+ *
+ * Builtin report command: Analyze the perf.data input file,
+ * look up and read DSOs and symbol information and display
+ * a histogram of results, along various sorting keys.
+ */
+#include "builtin.h"
+
+#include "util/util.h"
+
+#include "util/color.h"
+#include "util/list.h"
+#include "util/cache.h"
+#include "util/rbtree.h"
+#include "util/symbol.h"
+#include "util/string.h"
+
+#include "perf.h"
+
+#include "util/parse-options.h"
+#include "util/parse-events.h"
+
+#define SHOW_KERNEL    1
+#define SHOW_USER      2
+#define SHOW_HV                4
+
+static char            const *input_name = "perf.data";
+static char            *vmlinux = NULL;
+
+static char            default_sort_order[] = "comm,dso";
+static char            *sort_order = default_sort_order;
+
+static int             input;
+static int             show_mask = SHOW_KERNEL | SHOW_USER | SHOW_HV;
+
+static int             dump_trace = 0;
+#define dprintf(x...)  do { if (dump_trace) printf(x); } while (0)
+
+static int             verbose;
+static int             full_paths;
+
+static unsigned long   page_size;
+static unsigned long   mmap_window = 32;
+
+struct ip_event {
+       struct perf_event_header header;
+       __u64 ip;
+       __u32 pid, tid;
+       __u64 period;
+};
+
+struct mmap_event {
+       struct perf_event_header header;
+       __u32 pid, tid;
+       __u64 start;
+       __u64 len;
+       __u64 pgoff;
+       char filename[PATH_MAX];
+};
+
+struct comm_event {
+       struct perf_event_header header;
+       __u32 pid, tid;
+       char comm[16];
+};
+
+struct fork_event {
+       struct perf_event_header header;
+       __u32 pid, ppid;
+};
+
+struct period_event {
+       struct perf_event_header header;
+       __u64 time;
+       __u64 id;
+       __u64 sample_period;
+};
+
+typedef union event_union {
+       struct perf_event_header        header;
+       struct ip_event                 ip;
+       struct mmap_event               mmap;
+       struct comm_event               comm;
+       struct fork_event               fork;
+       struct period_event             period;
+} event_t;
+
+static LIST_HEAD(dsos);
+static struct dso *kernel_dso;
+static struct dso *vdso;
+
+static void dsos__add(struct dso *dso)
+{
+       list_add_tail(&dso->node, &dsos);
+}
+
+static struct dso *dsos__find(const char *name)
+{
+       struct dso *pos;
+
+       list_for_each_entry(pos, &dsos, node)
+               if (strcmp(pos->name, name) == 0)
+                       return pos;
+       return NULL;
+}
+
+static struct dso *dsos__findnew(const char *name)
+{
+       struct dso *dso = dsos__find(name);
+       int nr;
+
+       if (dso)
+               return dso;
+
+       dso = dso__new(name, 0);
+       if (!dso)
+               goto out_delete_dso;
+
+       nr = dso__load(dso, NULL, verbose);
+       if (nr < 0) {
+               if (verbose)
+                       fprintf(stderr, "Failed to open: %s\n", name);
+               goto out_delete_dso;
+       }
+       if (!nr && verbose) {
+               fprintf(stderr,
+               "No symbols found in: %s, maybe install a debug package?\n",
+                               name);
+       }
+
+       dsos__add(dso);
+
+       return dso;
+
+out_delete_dso:
+       dso__delete(dso);
+       return NULL;
+}
+
+static void dsos__fprintf(FILE *fp)
+{
+       struct dso *pos;
+
+       list_for_each_entry(pos, &dsos, node)
+               dso__fprintf(pos, fp);
+}
+
+static struct symbol *vdso__find_symbol(struct dso *dso, __u64 ip)
+{
+       return dso__find_symbol(kernel_dso, ip);
+}
+
+static int load_kernel(void)
+{
+       int err;
+
+       kernel_dso = dso__new("[kernel]", 0);
+       if (!kernel_dso)
+               return -1;
+
+       err = dso__load_kernel(kernel_dso, vmlinux, NULL, verbose);
+       if (err) {
+               dso__delete(kernel_dso);
+               kernel_dso = NULL;
+       } else
+               dsos__add(kernel_dso);
+
+       vdso = dso__new("[vdso]", 0);
+       if (!vdso)
+               return -1;
+
+       vdso->find_symbol = vdso__find_symbol;
+
+       dsos__add(vdso);
+
+       return err;
+}
+
+static char __cwd[PATH_MAX];
+static char *cwd = __cwd;
+static int cwdlen;
+
+static int strcommon(const char *pathname)
+{
+       int n = 0;
+
+       while (pathname[n] == cwd[n] && n < cwdlen)
+               ++n;
+
+       return n;
+}
+
+struct map {
+       struct list_head node;
+       __u64    start;
+       __u64    end;
+       __u64    pgoff;
+       __u64    (*map_ip)(struct map *, __u64);
+       struct dso       *dso;
+};
+
+static __u64 map__map_ip(struct map *map, __u64 ip)
+{
+       return ip - map->start + map->pgoff;
+}
+
+static __u64 vdso__map_ip(struct map *map, __u64 ip)
+{
+       return ip;
+}
+
+static inline int is_anon_memory(const char *filename)
+{
+     return strcmp(filename, "//anon") == 0;
+}
+
+static struct map *map__new(struct mmap_event *event)
+{
+       struct map *self = malloc(sizeof(*self));
+
+       if (self != NULL) {
+               const char *filename = event->filename;
+               char newfilename[PATH_MAX];
+               int anon;
+
+               if (cwd) {
+                       int n = strcommon(filename);
+
+                       if (n == cwdlen) {
+                               snprintf(newfilename, sizeof(newfilename),
+                                        ".%s", filename + n);
+                               filename = newfilename;
+                       }
+               }
+
+               anon = is_anon_memory(filename);
+
+               if (anon) {
+                       snprintf(newfilename, sizeof(newfilename), "/tmp/perf-%d.map", event->pid);
+                       filename = newfilename;
+               }
+
+               self->start = event->start;
+               self->end   = event->start + event->len;
+               self->pgoff = event->pgoff;
+
+               self->dso = dsos__findnew(filename);
+               if (self->dso == NULL)
+                       goto out_delete;
+
+               if (self->dso == vdso || anon)
+                       self->map_ip = vdso__map_ip;
+               else
+                       self->map_ip = map__map_ip;
+       }
+       return self;
+out_delete:
+       free(self);
+       return NULL;
+}
+
+static struct map *map__clone(struct map *self)
+{
+       struct map *map = malloc(sizeof(*self));
+
+       if (!map)
+               return NULL;
+
+       memcpy(map, self, sizeof(*self));
+
+       return map;
+}
+
+static int map__overlap(struct map *l, struct map *r)
+{
+       if (l->start > r->start) {
+               struct map *t = l;
+               l = r;
+               r = t;
+       }
+
+       if (l->end > r->start)
+               return 1;
+
+       return 0;
+}
+
+static size_t map__fprintf(struct map *self, FILE *fp)
+{
+       return fprintf(fp, " %Lx-%Lx %Lx %s\n",
+                      self->start, self->end, self->pgoff, self->dso->name);
+}
+
+
+struct thread {
+       struct rb_node   rb_node;
+       struct list_head maps;
+       pid_t            pid;
+       char             *comm;
+};
+
+static struct thread *thread__new(pid_t pid)
+{
+       struct thread *self = malloc(sizeof(*self));
+
+       if (self != NULL) {
+               self->pid = pid;
+               self->comm = malloc(32);
+               if (self->comm)
+                       snprintf(self->comm, 32, ":%d", self->pid);
+               INIT_LIST_HEAD(&self->maps);
+       }
+
+       return self;
+}
+
+static int thread__set_comm(struct thread *self, const char *comm)
+{
+       if (self->comm)
+               free(self->comm);
+       self->comm = strdup(comm);
+       return self->comm ? 0 : -ENOMEM;
+}
+
+static size_t thread__fprintf(struct thread *self, FILE *fp)
+{
+       struct map *pos;
+       size_t ret = fprintf(fp, "Thread %d %s\n", self->pid, self->comm);
+
+       list_for_each_entry(pos, &self->maps, node)
+               ret += map__fprintf(pos, fp);
+
+       return ret;
+}
+
+
+static struct rb_root threads;
+static struct thread *last_match;
+
+static struct thread *threads__findnew(pid_t pid)
+{
+       struct rb_node **p = &threads.rb_node;
+       struct rb_node *parent = NULL;
+       struct thread *th;
+
+       /*
+        * Font-end cache - PID lookups come in blocks,
+        * so most of the time we dont have to look up
+        * the full rbtree:
+        */
+       if (last_match && last_match->pid == pid)
+               return last_match;
+
+       while (*p != NULL) {
+               parent = *p;
+               th = rb_entry(parent, struct thread, rb_node);
+
+               if (th->pid == pid) {
+                       last_match = th;
+                       return th;
+               }
+
+               if (pid < th->pid)
+                       p = &(*p)->rb_left;
+               else
+                       p = &(*p)->rb_right;
+       }
+
+       th = thread__new(pid);
+       if (th != NULL) {
+               rb_link_node(&th->rb_node, parent, p);
+               rb_insert_color(&th->rb_node, &threads);
+               last_match = th;
+       }
+
+       return th;
+}
+
+static void thread__insert_map(struct thread *self, struct map *map)
+{
+       struct map *pos, *tmp;
+
+       list_for_each_entry_safe(pos, tmp, &self->maps, node) {
+               if (map__overlap(pos, map)) {
+                       list_del_init(&pos->node);
+                       /* XXX leaks dsos */
+                       free(pos);
+               }
+       }
+
+       list_add_tail(&map->node, &self->maps);
+}
+
+static int thread__fork(struct thread *self, struct thread *parent)
+{
+       struct map *map;
+
+       if (self->comm)
+               free(self->comm);
+       self->comm = strdup(parent->comm);
+       if (!self->comm)
+               return -ENOMEM;
+
+       list_for_each_entry(map, &parent->maps, node) {
+               struct map *new = map__clone(map);
+               if (!new)
+                       return -ENOMEM;
+               thread__insert_map(self, new);
+       }
+
+       return 0;
+}
+
+static struct map *thread__find_map(struct thread *self, __u64 ip)
+{
+       struct map *pos;
+
+       if (self == NULL)
+               return NULL;
+
+       list_for_each_entry(pos, &self->maps, node)
+               if (ip >= pos->start && ip <= pos->end)
+                       return pos;
+
+       return NULL;
+}
+
+static size_t threads__fprintf(FILE *fp)
+{
+       size_t ret = 0;
+       struct rb_node *nd;
+
+       for (nd = rb_first(&threads); nd; nd = rb_next(nd)) {
+               struct thread *pos = rb_entry(nd, struct thread, rb_node);
+
+               ret += thread__fprintf(pos, fp);
+       }
+
+       return ret;
+}
+
+/*
+ * histogram, sorted on item, collects counts
+ */
+
+static struct rb_root hist;
+
+struct hist_entry {
+       struct rb_node   rb_node;
+
+       struct thread    *thread;
+       struct map       *map;
+       struct dso       *dso;
+       struct symbol    *sym;
+       __u64            ip;
+       char             level;
+
+       __u64            count;
+};
+
+/*
+ * configurable sorting bits
+ */
+
+struct sort_entry {
+       struct list_head list;
+
+       char *header;
+
+       int64_t (*cmp)(struct hist_entry *, struct hist_entry *);
+       int64_t (*collapse)(struct hist_entry *, struct hist_entry *);
+       size_t  (*print)(FILE *fp, struct hist_entry *);
+};
+
+/* --sort pid */
+
+static int64_t
+sort__thread_cmp(struct hist_entry *left, struct hist_entry *right)
+{
+       return right->thread->pid - left->thread->pid;
+}
+
+static size_t
+sort__thread_print(FILE *fp, struct hist_entry *self)
+{
+       return fprintf(fp, "%16s:%5d", self->thread->comm ?: "", self->thread->pid);
+}
+
+static struct sort_entry sort_thread = {
+       .header = "         Command:  Pid",
+       .cmp    = sort__thread_cmp,
+       .print  = sort__thread_print,
+};
+
+/* --sort comm */
+
+static int64_t
+sort__comm_cmp(struct hist_entry *left, struct hist_entry *right)
+{
+       return right->thread->pid - left->thread->pid;
+}
+
+static int64_t
+sort__comm_collapse(struct hist_entry *left, struct hist_entry *right)
+{
+       char *comm_l = left->thread->comm;
+       char *comm_r = right->thread->comm;
+
+       if (!comm_l || !comm_r) {
+               if (!comm_l && !comm_r)
+                       return 0;
+               else if (!comm_l)
+                       return -1;
+               else
+                       return 1;
+       }
+
+       return strcmp(comm_l, comm_r);
+}
+
+static size_t
+sort__comm_print(FILE *fp, struct hist_entry *self)
+{
+       return fprintf(fp, "%16s", self->thread->comm);
+}
+
+static struct sort_entry sort_comm = {
+       .header         = "         Command",
+       .cmp            = sort__comm_cmp,
+       .collapse       = sort__comm_collapse,
+       .print          = sort__comm_print,
+};
+
+/* --sort dso */
+
+static int64_t
+sort__dso_cmp(struct hist_entry *left, struct hist_entry *right)
+{
+       struct dso *dso_l = left->dso;
+       struct dso *dso_r = right->dso;
+
+       if (!dso_l || !dso_r) {
+               if (!dso_l && !dso_r)
+                       return 0;
+               else if (!dso_l)
+                       return -1;
+               else
+                       return 1;
+       }
+
+       return strcmp(dso_l->name, dso_r->name);
+}
+
+static size_t
+sort__dso_print(FILE *fp, struct hist_entry *self)
+{
+       if (self->dso)
+               return fprintf(fp, "%-25s", self->dso->name);
+
+       return fprintf(fp, "%016llx         ", (__u64)self->ip);
+}
+
+static struct sort_entry sort_dso = {
+       .header = "Shared Object            ",
+       .cmp    = sort__dso_cmp,
+       .print  = sort__dso_print,
+};
+
+/* --sort symbol */
+
+static int64_t
+sort__sym_cmp(struct hist_entry *left, struct hist_entry *right)
+{
+       __u64 ip_l, ip_r;
+
+       if (left->sym == right->sym)
+               return 0;
+
+       ip_l = left->sym ? left->sym->start : left->ip;
+       ip_r = right->sym ? right->sym->start : right->ip;
+
+       return (int64_t)(ip_r - ip_l);
+}
+
+static size_t
+sort__sym_print(FILE *fp, struct hist_entry *self)
+{
+       size_t ret = 0;
+
+       if (verbose)
+               ret += fprintf(fp, "%#018llx  ", (__u64)self->ip);
+
+       if (self->sym) {
+               ret += fprintf(fp, "[%c] %s",
+                       self->dso == kernel_dso ? 'k' : '.', self->sym->name);
+       } else {
+               ret += fprintf(fp, "%#016llx", (__u64)self->ip);
+       }
+
+       return ret;
+}
+
+static struct sort_entry sort_sym = {
+       .header = "Symbol",
+       .cmp    = sort__sym_cmp,
+       .print  = sort__sym_print,
+};
+
+static int sort__need_collapse = 0;
+
+struct sort_dimension {
+       char                    *name;
+       struct sort_entry       *entry;
+       int                     taken;
+};
+
+static struct sort_dimension sort_dimensions[] = {
+       { .name = "pid",        .entry = &sort_thread,  },
+       { .name = "comm",       .entry = &sort_comm,    },
+       { .name = "dso",        .entry = &sort_dso,     },
+       { .name = "symbol",     .entry = &sort_sym,     },
+};
+
+static LIST_HEAD(hist_entry__sort_list);
+
+static int sort_dimension__add(char *tok)
+{
+       int i;
+
+       for (i = 0; i < ARRAY_SIZE(sort_dimensions); i++) {
+               struct sort_dimension *sd = &sort_dimensions[i];
+
+               if (sd->taken)
+                       continue;
+
+               if (strncasecmp(tok, sd->name, strlen(tok)))
+                       continue;
+
+               if (sd->entry->collapse)
+                       sort__need_collapse = 1;
+
+               list_add_tail(&sd->entry->list, &hist_entry__sort_list);
+               sd->taken = 1;
+
+               return 0;
+       }
+
+       return -ESRCH;
+}
+
+static int64_t
+hist_entry__cmp(struct hist_entry *left, struct hist_entry *right)
+{
+       struct sort_entry *se;
+       int64_t cmp = 0;
+
+       list_for_each_entry(se, &hist_entry__sort_list, list) {
+               cmp = se->cmp(left, right);
+               if (cmp)
+                       break;
+       }
+
+       return cmp;
+}
+
+static int64_t
+hist_entry__collapse(struct hist_entry *left, struct hist_entry *right)
+{
+       struct sort_entry *se;
+       int64_t cmp = 0;
+
+       list_for_each_entry(se, &hist_entry__sort_list, list) {
+               int64_t (*f)(struct hist_entry *, struct hist_entry *);
+
+               f = se->collapse ?: se->cmp;
+
+               cmp = f(left, right);
+               if (cmp)
+                       break;
+       }
+
+       return cmp;
+}
+
+static size_t
+hist_entry__fprintf(FILE *fp, struct hist_entry *self, __u64 total_samples)
+{
+       struct sort_entry *se;
+       size_t ret;
+
+       if (total_samples) {
+               double percent = self->count * 100.0 / total_samples;
+               char *color = PERF_COLOR_NORMAL;
+
+               /*
+                * We color high-overhead entries in red, mid-overhead
+                * entries in green - and keep the low overhead places
+                * normal:
+                */
+               if (percent >= 5.0) {
+                       color = PERF_COLOR_RED;
+               } else {
+                       if (percent >= 0.5)
+                               color = PERF_COLOR_GREEN;
+               }
+
+               ret = color_fprintf(fp, color, "   %6.2f%%",
+                               (self->count * 100.0) / total_samples);
+       } else
+               ret = fprintf(fp, "%12Ld ", self->count);
+
+       list_for_each_entry(se, &hist_entry__sort_list, list) {
+               fprintf(fp, "  ");
+               ret += se->print(fp, self);
+       }
+
+       ret += fprintf(fp, "\n");
+
+       return ret;
+}
+
+/*
+ * collect histogram counts
+ */
+
+static int
+hist_entry__add(struct thread *thread, struct map *map, struct dso *dso,
+               struct symbol *sym, __u64 ip, char level, __u64 count)
+{
+       struct rb_node **p = &hist.rb_node;
+       struct rb_node *parent = NULL;
+       struct hist_entry *he;
+       struct hist_entry entry = {
+               .thread = thread,
+               .map    = map,
+               .dso    = dso,
+               .sym    = sym,
+               .ip     = ip,
+               .level  = level,
+               .count  = count,
+       };
+       int cmp;
+
+       while (*p != NULL) {
+               parent = *p;
+               he = rb_entry(parent, struct hist_entry, rb_node);
+
+               cmp = hist_entry__cmp(&entry, he);
+
+               if (!cmp) {
+                       he->count += count;
+                       return 0;
+               }
+
+               if (cmp < 0)
+                       p = &(*p)->rb_left;
+               else
+                       p = &(*p)->rb_right;
+       }
+
+       he = malloc(sizeof(*he));
+       if (!he)
+               return -ENOMEM;
+       *he = entry;
+       rb_link_node(&he->rb_node, parent, p);
+       rb_insert_color(&he->rb_node, &hist);
+
+       return 0;
+}
+
+static void hist_entry__free(struct hist_entry *he)
+{
+       free(he);
+}
+
+/*
+ * collapse the histogram
+ */
+
+static struct rb_root collapse_hists;
+
+static void collapse__insert_entry(struct hist_entry *he)
+{
+       struct rb_node **p = &collapse_hists.rb_node;
+       struct rb_node *parent = NULL;
+       struct hist_entry *iter;
+       int64_t cmp;
+
+       while (*p != NULL) {
+               parent = *p;
+               iter = rb_entry(parent, struct hist_entry, rb_node);
+
+               cmp = hist_entry__collapse(iter, he);
+
+               if (!cmp) {
+                       iter->count += he->count;
+                       hist_entry__free(he);
+                       return;
+               }
+
+               if (cmp < 0)
+                       p = &(*p)->rb_left;
+               else
+                       p = &(*p)->rb_right;
+       }
+
+       rb_link_node(&he->rb_node, parent, p);
+       rb_insert_color(&he->rb_node, &collapse_hists);
+}
+
+static void collapse__resort(void)
+{
+       struct rb_node *next;
+       struct hist_entry *n;
+
+       if (!sort__need_collapse)
+               return;
+
+       next = rb_first(&hist);
+       while (next) {
+               n = rb_entry(next, struct hist_entry, rb_node);
+               next = rb_next(&n->rb_node);
+
+               rb_erase(&n->rb_node, &hist);
+               collapse__insert_entry(n);
+       }
+}
+
+/*
+ * reverse the map, sort on count.
+ */
+
+static struct rb_root output_hists;
+
+static void output__insert_entry(struct hist_entry *he)
+{
+       struct rb_node **p = &output_hists.rb_node;
+       struct rb_node *parent = NULL;
+       struct hist_entry *iter;
+
+       while (*p != NULL) {
+               parent = *p;
+               iter = rb_entry(parent, struct hist_entry, rb_node);
+
+               if (he->count > iter->count)
+                       p = &(*p)->rb_left;
+               else
+                       p = &(*p)->rb_right;
+       }
+
+       rb_link_node(&he->rb_node, parent, p);
+       rb_insert_color(&he->rb_node, &output_hists);
+}
+
+static void output__resort(void)
+{
+       struct rb_node *next;
+       struct hist_entry *n;
+       struct rb_root *tree = &hist;
+
+       if (sort__need_collapse)
+               tree = &collapse_hists;
+
+       next = rb_first(tree);
+
+       while (next) {
+               n = rb_entry(next, struct hist_entry, rb_node);
+               next = rb_next(&n->rb_node);
+
+               rb_erase(&n->rb_node, tree);
+               output__insert_entry(n);
+       }
+}
+
+static size_t output__fprintf(FILE *fp, __u64 total_samples)
+{
+       struct hist_entry *pos;
+       struct sort_entry *se;
+       struct rb_node *nd;
+       size_t ret = 0;
+
+       fprintf(fp, "\n");
+       fprintf(fp, "#\n");
+       fprintf(fp, "# (%Ld samples)\n", (__u64)total_samples);
+       fprintf(fp, "#\n");
+
+       fprintf(fp, "# Overhead");
+       list_for_each_entry(se, &hist_entry__sort_list, list)
+               fprintf(fp, "  %s", se->header);
+       fprintf(fp, "\n");
+
+       fprintf(fp, "# ........");
+       list_for_each_entry(se, &hist_entry__sort_list, list) {
+               int i;
+
+               fprintf(fp, "  ");
+               for (i = 0; i < strlen(se->header); i++)
+                       fprintf(fp, ".");
+       }
+       fprintf(fp, "\n");
+
+       fprintf(fp, "#\n");
+
+       for (nd = rb_first(&output_hists); nd; nd = rb_next(nd)) {
+               pos = rb_entry(nd, struct hist_entry, rb_node);
+               ret += hist_entry__fprintf(fp, pos, total_samples);
+       }
+
+       if (!strcmp(sort_order, default_sort_order)) {
+               fprintf(fp, "#\n");
+               fprintf(fp, "# (For more details, try: perf report --sort comm,dso,symbol)\n");
+               fprintf(fp, "#\n");
+       }
+       fprintf(fp, "\n");
+
+       return ret;
+}
+
+static void register_idle_thread(void)
+{
+       struct thread *thread = threads__findnew(0);
+
+       if (thread == NULL ||
+                       thread__set_comm(thread, "[idle]")) {
+               fprintf(stderr, "problem inserting idle task.\n");
+               exit(-1);
+       }
+}
+
+static unsigned long total = 0,
+                    total_mmap = 0,
+                    total_comm = 0,
+                    total_fork = 0,
+                    total_unknown = 0;
+
+static int
+process_overflow_event(event_t *event, unsigned long offset, unsigned long head)
+{
+       char level;
+       int show = 0;
+       struct dso *dso = NULL;
+       struct thread *thread = threads__findnew(event->ip.pid);
+       __u64 ip = event->ip.ip;
+       __u64 period = 1;
+       struct map *map = NULL;
+
+       if (event->header.type & PERF_SAMPLE_PERIOD)
+               period = event->ip.period;
+
+       dprintf("%p [%p]: PERF_EVENT (IP, %d): %d: %p period: %Ld\n",
+               (void *)(offset + head),
+               (void *)(long)(event->header.size),
+               event->header.misc,
+               event->ip.pid,
+               (void *)(long)ip,
+               (long long)period);
+
+       dprintf(" ... thread: %s:%d\n", thread->comm, thread->pid);
+
+       if (thread == NULL) {
+               fprintf(stderr, "problem processing %d event, skipping it.\n",
+                       event->header.type);
+               return -1;
+       }
+
+       if (event->header.misc & PERF_EVENT_MISC_KERNEL) {
+               show = SHOW_KERNEL;
+               level = 'k';
+
+               dso = kernel_dso;
+
+               dprintf(" ...... dso: %s\n", dso->name);
+
+       } else if (event->header.misc & PERF_EVENT_MISC_USER) {
+
+               show = SHOW_USER;
+               level = '.';
+
+               map = thread__find_map(thread, ip);
+               if (map != NULL) {
+                       ip = map->map_ip(map, ip);
+                       dso = map->dso;
+               } else {
+                       /*
+                        * If this is outside of all known maps,
+                        * and is a negative address, try to look it
+                        * up in the kernel dso, as it might be a
+                        * vsyscall (which executes in user-mode):
+                        */
+                       if ((long long)ip < 0)
+                               dso = kernel_dso;
+               }
+               dprintf(" ...... dso: %s\n", dso ? dso->name : "<not found>");
+
+       } else {
+               show = SHOW_HV;
+               level = 'H';
+               dprintf(" ...... dso: [hypervisor]\n");
+       }
+
+       if (show & show_mask) {
+               struct symbol *sym = NULL;
+
+               if (dso)
+                       sym = dso->find_symbol(dso, ip);
+
+               if (hist_entry__add(thread, map, dso, sym, ip, level, period)) {
+                       fprintf(stderr,
+               "problem incrementing symbol count, skipping event\n");
+                       return -1;
+               }
+       }
+       total += period;
+
+       return 0;
+}
+
+static int
+process_mmap_event(event_t *event, unsigned long offset, unsigned long head)
+{
+       struct thread *thread = threads__findnew(event->mmap.pid);
+       struct map *map = map__new(&event->mmap);
+
+       dprintf("%p [%p]: PERF_EVENT_MMAP %d: [%p(%p) @ %p]: %s\n",
+               (void *)(offset + head),
+               (void *)(long)(event->header.size),
+               event->mmap.pid,
+               (void *)(long)event->mmap.start,
+               (void *)(long)event->mmap.len,
+               (void *)(long)event->mmap.pgoff,
+               event->mmap.filename);
+
+       if (thread == NULL || map == NULL) {
+               dprintf("problem processing PERF_EVENT_MMAP, skipping event.\n");
+               return 0;
+       }
+
+       thread__insert_map(thread, map);
+       total_mmap++;
+
+       return 0;
+}
+
+static int
+process_comm_event(event_t *event, unsigned long offset, unsigned long head)
+{
+       struct thread *thread = threads__findnew(event->comm.pid);
+
+       dprintf("%p [%p]: PERF_EVENT_COMM: %s:%d\n",
+               (void *)(offset + head),
+               (void *)(long)(event->header.size),
+               event->comm.comm, event->comm.pid);
+
+       if (thread == NULL ||
+           thread__set_comm(thread, event->comm.comm)) {
+               dprintf("problem processing PERF_EVENT_COMM, skipping event.\n");
+               return -1;
+       }
+       total_comm++;
+
+       return 0;
+}
+
+static int
+process_fork_event(event_t *event, unsigned long offset, unsigned long head)
+{
+       struct thread *thread = threads__findnew(event->fork.pid);
+       struct thread *parent = threads__findnew(event->fork.ppid);
+
+       dprintf("%p [%p]: PERF_EVENT_FORK: %d:%d\n",
+               (void *)(offset + head),
+               (void *)(long)(event->header.size),
+               event->fork.pid, event->fork.ppid);
+
+       if (!thread || !parent || thread__fork(thread, parent)) {
+               dprintf("problem processing PERF_EVENT_FORK, skipping event.\n");
+               return -1;
+       }
+       total_fork++;
+
+       return 0;
+}
+
+static int
+process_period_event(event_t *event, unsigned long offset, unsigned long head)
+{
+       dprintf("%p [%p]: PERF_EVENT_PERIOD: time:%Ld, id:%Ld: period:%Ld\n",
+               (void *)(offset + head),
+               (void *)(long)(event->header.size),
+               event->period.time,
+               event->period.id,
+               event->period.sample_period);
+
+       return 0;
+}
+
+static int
+process_event(event_t *event, unsigned long offset, unsigned long head)
+{
+       if (event->header.misc & PERF_EVENT_MISC_OVERFLOW)
+               return process_overflow_event(event, offset, head);
+
+       switch (event->header.type) {
+       case PERF_EVENT_MMAP:
+               return process_mmap_event(event, offset, head);
+
+       case PERF_EVENT_COMM:
+               return process_comm_event(event, offset, head);
+
+       case PERF_EVENT_FORK:
+               return process_fork_event(event, offset, head);
+
+       case PERF_EVENT_PERIOD:
+               return process_period_event(event, offset, head);
+       /*
+        * We dont process them right now but they are fine:
+        */
+
+       case PERF_EVENT_THROTTLE:
+       case PERF_EVENT_UNTHROTTLE:
+               return 0;
+
+       default:
+               return -1;
+       }
+
+       return 0;
+}
+
+static int __cmd_report(void)
+{
+       int ret, rc = EXIT_FAILURE;
+       unsigned long offset = 0;
+       unsigned long head = 0;
+       struct stat stat;
+       event_t *event;
+       uint32_t size;
+       char *buf;
+
+       register_idle_thread();
+
+       input = open(input_name, O_RDONLY);
+       if (input < 0) {
+               fprintf(stderr, " failed to open file: %s", input_name);
+               if (!strcmp(input_name, "perf.data"))
+                       fprintf(stderr, "  (try 'perf record' first)");
+               fprintf(stderr, "\n");
+               exit(-1);
+       }
+
+       ret = fstat(input, &stat);
+       if (ret < 0) {
+               perror("failed to stat file");
+               exit(-1);
+       }
+
+       if (!stat.st_size) {
+               fprintf(stderr, "zero-sized file, nothing to do!\n");
+               exit(0);
+       }
+
+       if (load_kernel() < 0) {
+               perror("failed to load kernel symbols");
+               return EXIT_FAILURE;
+       }
+
+       if (!full_paths) {
+               if (getcwd(__cwd, sizeof(__cwd)) == NULL) {
+                       perror("failed to get the current directory");
+                       return EXIT_FAILURE;
+               }
+               cwdlen = strlen(cwd);
+       } else {
+               cwd = NULL;
+               cwdlen = 0;
+       }
+remap:
+       buf = (char *)mmap(NULL, page_size * mmap_window, PROT_READ,
+                          MAP_SHARED, input, offset);
+       if (buf == MAP_FAILED) {
+               perror("failed to mmap file");
+               exit(-1);
+       }
+
+more:
+       event = (event_t *)(buf + head);
+
+       size = event->header.size;
+       if (!size)
+               size = 8;
+
+       if (head + event->header.size >= page_size * mmap_window) {
+               unsigned long shift = page_size * (head / page_size);
+               int ret;
+
+               ret = munmap(buf, page_size * mmap_window);
+               assert(ret == 0);
+
+               offset += shift;
+               head -= shift;
+               goto remap;
+       }
+
+       size = event->header.size;
+
+       dprintf("%p [%p]: event: %d\n",
+                       (void *)(offset + head),
+                       (void *)(long)event->header.size,
+                       event->header.type);
+
+       if (!size || process_event(event, offset, head) < 0) {
+
+               dprintf("%p [%p]: skipping unknown header type: %d\n",
+                       (void *)(offset + head),
+                       (void *)(long)(event->header.size),
+                       event->header.type);
+
+               total_unknown++;
+
+               /*
+                * assume we lost track of the stream, check alignment, and
+                * increment a single u64 in the hope to catch on again 'soon'.
+                */
+
+               if (unlikely(head & 7))
+                       head &= ~7ULL;
+
+               size = 8;
+       }
+
+       head += size;
+
+       if (offset + head < stat.st_size)
+               goto more;
+
+       rc = EXIT_SUCCESS;
+       close(input);
+
+       dprintf("      IP events: %10ld\n", total);
+       dprintf("    mmap events: %10ld\n", total_mmap);
+       dprintf("    comm events: %10ld\n", total_comm);
+       dprintf("    fork events: %10ld\n", total_fork);
+       dprintf(" unknown events: %10ld\n", total_unknown);
+
+       if (dump_trace)
+               return 0;
+
+       if (verbose >= 3)
+               threads__fprintf(stdout);
+
+       if (verbose >= 2)
+               dsos__fprintf(stdout);
+
+       collapse__resort();
+       output__resort();
+       output__fprintf(stdout, total);
+
+       return rc;
+}
+
+static const char * const report_usage[] = {
+       "perf report [<options>] <command>",
+       NULL
+};
+
+static const struct option options[] = {
+       OPT_STRING('i', "input", &input_name, "file",
+                   "input file name"),
+       OPT_BOOLEAN('v', "verbose", &verbose,
+                   "be more verbose (show symbol address, etc)"),
+       OPT_BOOLEAN('D', "dump-raw-trace", &dump_trace,
+                   "dump raw trace in ASCII"),
+       OPT_STRING('k', "vmlinux", &vmlinux, "file", "vmlinux pathname"),
+       OPT_STRING('s', "sort", &sort_order, "key[,key2...]",
+                  "sort by key(s): pid, comm, dso, symbol. Default: pid,symbol"),
+       OPT_BOOLEAN('P', "full-paths", &full_paths,
+                   "Don't shorten the pathnames taking into account the cwd"),
+       OPT_END()
+};
+
+static void setup_sorting(void)
+{
+       char *tmp, *tok, *str = strdup(sort_order);
+
+       for (tok = strtok_r(str, ", ", &tmp);
+                       tok; tok = strtok_r(NULL, ", ", &tmp)) {
+               if (sort_dimension__add(tok) < 0) {
+                       error("Unknown --sort key: `%s'", tok);
+                       usage_with_options(report_usage, options);
+               }
+       }
+
+       free(str);
+}
+
+int cmd_report(int argc, const char **argv, const char *prefix)
+{
+       symbol__init();
+
+       page_size = getpagesize();
+
+       argc = parse_options(argc, argv, options, report_usage, 0);
+
+       setup_sorting();
+
+       /*
+        * Any (unrecognized) arguments left?
+        */
+       if (argc)
+               usage_with_options(report_usage, options);
+
+       setup_pager();
+
+       return __cmd_report();
+}
diff --git a/tools/perf/builtin-stat.c b/tools/perf/builtin-stat.c

new file mode 100644 (file)

index 0000000..c43e4a9
--- /dev/null
+++ b/tools/perf/builtin-stat.c
@@ -0,0 +1,367 @@
+/*
+ * builtin-stat.c
+ *
+ * Builtin stat command: Give a precise performance counters summary
+ * overview about any workload, CPU or specific PID.
+ *
+ * Sample output:
+
+   $ perf stat ~/hackbench 10
+   Time: 0.104
+
+    Performance counter stats for '/home/mingo/hackbench':
+
+       1255.538611  task clock ticks     #      10.143 CPU utilization factor
+             54011  context switches     #       0.043 M/sec
+               385  CPU migrations       #       0.000 M/sec
+             17755  pagefaults           #       0.014 M/sec
+        3808323185  CPU cycles           #    3033.219 M/sec
+        1575111190  instructions         #    1254.530 M/sec
+          17367895  cache references     #      13.833 M/sec
+           7674421  cache misses         #       6.112 M/sec
+
+    Wall-clock time elapsed:   123.786620 msecs
+
+ *
+ * Copyright (C) 2008, Red Hat Inc, Ingo Molnar <mingo@redhat.com>
+ *
+ * Improvements and fixes by:
+ *
+ *   Arjan van de Ven <arjan@linux.intel.com>
+ *   Yanmin Zhang <yanmin.zhang@intel.com>
+ *   Wu Fengguang <fengguang.wu@intel.com>
+ *   Mike Galbraith <efault@gmx.de>
+ *   Paul Mackerras <paulus@samba.org>
+ *
+ * Released under the GPL v2. (and only v2, not any later version)
+ */
+
+#include "perf.h"
+#include "builtin.h"
+#include "util/util.h"
+#include "util/parse-options.h"
+#include "util/parse-events.h"
+
+#include <sys/prctl.h>
+
+static struct perf_counter_attr default_attrs[MAX_COUNTERS] = {
+
+  { .type = PERF_TYPE_SOFTWARE, .config = PERF_COUNT_SW_TASK_CLOCK     },
+  { .type = PERF_TYPE_SOFTWARE, .config = PERF_COUNT_SW_CONTEXT_SWITCHES},
+  { .type = PERF_TYPE_SOFTWARE, .config = PERF_COUNT_SW_CPU_MIGRATIONS },
+  { .type = PERF_TYPE_SOFTWARE, .config = PERF_COUNT_SW_PAGE_FAULTS    },
+
+  { .type = PERF_TYPE_HARDWARE, .config = PERF_COUNT_HW_CPU_CYCLES     },
+  { .type = PERF_TYPE_HARDWARE, .config = PERF_COUNT_HW_INSTRUCTIONS   },
+  { .type = PERF_TYPE_HARDWARE, .config = PERF_COUNT_HW_CACHE_REFERENCES},
+  { .type = PERF_TYPE_HARDWARE, .config = PERF_COUNT_HW_CACHE_MISSES   },
+
+};
+
+static int                     system_wide                     =  0;
+static int                     inherit                         =  1;
+static int                     verbose                         =  0;
+
+static int                     fd[MAX_NR_CPUS][MAX_COUNTERS];
+
+static int                     target_pid                      = -1;
+static int                     nr_cpus                         =  0;
+static unsigned int            page_size;
+
+static int                     scale                           =  1;
+
+static const unsigned int default_count[] = {
+       1000000,
+       1000000,
+         10000,
+         10000,
+       1000000,
+         10000,
+};
+
+static __u64                   event_res[MAX_COUNTERS][3];
+static __u64                   event_scaled[MAX_COUNTERS];
+
+static __u64                   runtime_nsecs;
+static __u64                   walltime_nsecs;
+static __u64                   runtime_cycles;
+
+static void create_perf_stat_counter(int counter)
+{
+       struct perf_counter_attr *attr = attrs + counter;
+
+       if (scale)
+               attr->read_format = PERF_FORMAT_TOTAL_TIME_ENABLED |
+                                   PERF_FORMAT_TOTAL_TIME_RUNNING;
+
+       if (system_wide) {
+               int cpu;
+               for (cpu = 0; cpu < nr_cpus; cpu ++) {
+                       fd[cpu][counter] = sys_perf_counter_open(attr, -1, cpu, -1, 0);
+                       if (fd[cpu][counter] < 0 && verbose) {
+                               printf("Error: counter %d, sys_perf_counter_open() syscall returned with %d (%s)\n", counter, fd[cpu][counter], strerror(errno));
+                       }
+               }
+       } else {
+               attr->inherit   = inherit;
+               attr->disabled  = 1;
+
+               fd[0][counter] = sys_perf_counter_open(attr, 0, -1, -1, 0);
+               if (fd[0][counter] < 0 && verbose) {
+                       printf("Error: counter %d, sys_perf_counter_open() syscall returned with %d (%s)\n", counter, fd[0][counter], strerror(errno));
+               }
+       }
+}
+
+/*
+ * Does the counter have nsecs as a unit?
+ */
+static inline int nsec_counter(int counter)
+{
+       if (attrs[counter].type != PERF_TYPE_SOFTWARE)
+               return 0;
+
+       if (attrs[counter].config == PERF_COUNT_SW_CPU_CLOCK)
+               return 1;
+
+       if (attrs[counter].config == PERF_COUNT_SW_TASK_CLOCK)
+               return 1;
+
+       return 0;
+}
+
+/*
+ * Read out the results of a single counter:
+ */
+static void read_counter(int counter)
+{
+       __u64 *count, single_count[3];
+       ssize_t res;
+       int cpu, nv;
+       int scaled;
+
+       count = event_res[counter];
+
+       count[0] = count[1] = count[2] = 0;
+
+       nv = scale ? 3 : 1;
+       for (cpu = 0; cpu < nr_cpus; cpu ++) {
+               if (fd[cpu][counter] < 0)
+                       continue;
+
+               res = read(fd[cpu][counter], single_count, nv * sizeof(__u64));
+               assert(res == nv * sizeof(__u64));
+
+               count[0] += single_count[0];
+               if (scale) {
+                       count[1] += single_count[1];
+                       count[2] += single_count[2];
+               }
+       }
+
+       scaled = 0;
+       if (scale) {
+               if (count[2] == 0) {
+                       event_scaled[counter] = -1;
+                       count[0] = 0;
+                       return;
+               }
+
+               if (count[2] < count[1]) {
+                       event_scaled[counter] = 1;
+                       count[0] = (unsigned long long)
+                               ((double)count[0] * count[1] / count[2] + 0.5);
+               }
+       }
+       /*
+        * Save the full runtime - to allow normalization during printout:
+        */
+       if (attrs[counter].type == PERF_TYPE_SOFTWARE &&
+               attrs[counter].config == PERF_COUNT_SW_TASK_CLOCK)
+               runtime_nsecs = count[0];
+       if (attrs[counter].type == PERF_TYPE_HARDWARE &&
+               attrs[counter].config == PERF_COUNT_HW_CPU_CYCLES)
+               runtime_cycles = count[0];
+}
+
+/*
+ * Print out the results of a single counter:
+ */
+static void print_counter(int counter)
+{
+       __u64 *count;
+       int scaled;
+
+       count = event_res[counter];
+       scaled = event_scaled[counter];
+
+       if (scaled == -1) {
+               fprintf(stderr, " %14s  %-20s\n",
+                       "<not counted>", event_name(counter));
+               return;
+       }
+
+       if (nsec_counter(counter)) {
+               double msecs = (double)count[0] / 1000000;
+
+               fprintf(stderr, " %14.6f  %-20s",
+                       msecs, event_name(counter));
+               if (attrs[counter].type == PERF_TYPE_SOFTWARE &&
+                       attrs[counter].config == PERF_COUNT_SW_TASK_CLOCK) {
+
+                       if (walltime_nsecs)
+                               fprintf(stderr, " # %11.3f CPU utilization factor",
+                                       (double)count[0] / (double)walltime_nsecs);
+               }
+       } else {
+               fprintf(stderr, " %14Ld  %-20s",
+                       count[0], event_name(counter));
+               if (runtime_nsecs)
+                       fprintf(stderr, " # %11.3f M/sec",
+                               (double)count[0]/runtime_nsecs*1000.0);
+               if (runtime_cycles &&
+                       attrs[counter].type == PERF_TYPE_HARDWARE &&
+                               attrs[counter].config == PERF_COUNT_HW_INSTRUCTIONS) {
+
+                       fprintf(stderr, " # %1.3f per cycle",
+                               (double)count[0] / (double)runtime_cycles);
+               }
+       }
+       if (scaled)
+               fprintf(stderr, "  (scaled from %.2f%%)",
+                       (double) count[2] / count[1] * 100);
+       fprintf(stderr, "\n");
+}
+
+static int do_perf_stat(int argc, const char **argv)
+{
+       unsigned long long t0, t1;
+       int counter;
+       int status;
+       int pid;
+       int i;
+
+       if (!system_wide)
+               nr_cpus = 1;
+
+       for (counter = 0; counter < nr_counters; counter++)
+               create_perf_stat_counter(counter);
+
+       /*
+        * Enable counters and exec the command:
+        */
+       t0 = rdclock();
+       prctl(PR_TASK_PERF_COUNTERS_ENABLE);
+
+       if ((pid = fork()) < 0)
+               perror("failed to fork");
+
+       if (!pid) {
+               if (execvp(argv[0], (char **)argv)) {
+                       perror(argv[0]);
+                       exit(-1);
+               }
+       }
+
+       while (wait(&status) >= 0)
+               ;
+
+       prctl(PR_TASK_PERF_COUNTERS_DISABLE);
+       t1 = rdclock();
+
+       walltime_nsecs = t1 - t0;
+
+       fflush(stdout);
+
+       fprintf(stderr, "\n");
+       fprintf(stderr, " Performance counter stats for \'%s", argv[0]);
+
+       for (i = 1; i < argc; i++)
+               fprintf(stderr, " %s", argv[i]);
+
+       fprintf(stderr, "\':\n");
+       fprintf(stderr, "\n");
+
+       for (counter = 0; counter < nr_counters; counter++)
+               read_counter(counter);
+
+       for (counter = 0; counter < nr_counters; counter++)
+               print_counter(counter);
+
+
+       fprintf(stderr, "\n");
+       fprintf(stderr, " Wall-clock time elapsed: %12.6f msecs\n",
+                       (double)(t1-t0)/1e6);
+       fprintf(stderr, "\n");
+
+       return 0;
+}
+
+static volatile int signr = -1;
+
+static void skip_signal(int signo)
+{
+       signr = signo;
+}
+
+static void sig_atexit(void)
+{
+       if (signr == -1)
+               return;
+
+       signal(signr, SIG_DFL);
+       kill(getpid(), signr);
+}
+
+static const char * const stat_usage[] = {
+       "perf stat [<options>] <command>",
+       NULL
+};
+
+static const struct option options[] = {
+       OPT_CALLBACK('e', "event", NULL, "event",
+                    "event selector. use 'perf list' to list available events",
+                    parse_events),
+       OPT_BOOLEAN('i', "inherit", &inherit,
+                   "child tasks inherit counters"),
+       OPT_INTEGER('p', "pid", &target_pid,
+                   "stat events on existing pid"),
+       OPT_BOOLEAN('a', "all-cpus", &system_wide,
+                           "system-wide collection from all CPUs"),
+       OPT_BOOLEAN('S', "scale", &scale,
+                           "scale/normalize counters"),
+       OPT_BOOLEAN('v', "verbose", &verbose,
+                   "be more verbose (show counter open errors, etc)"),
+       OPT_END()
+};
+
+int cmd_stat(int argc, const char **argv, const char *prefix)
+{
+       page_size = sysconf(_SC_PAGE_SIZE);
+
+       memcpy(attrs, default_attrs, sizeof(attrs));
+
+       argc = parse_options(argc, argv, options, stat_usage, 0);
+       if (!argc)
+               usage_with_options(stat_usage, options);
+
+       if (!nr_counters)
+               nr_counters = 8;
+
+       nr_cpus = sysconf(_SC_NPROCESSORS_ONLN);
+       assert(nr_cpus <= MAX_NR_CPUS);
+       assert(nr_cpus >= 0);
+
+       /*
+        * We dont want to block the signals - that would cause
+        * child tasks to inherit that and Ctrl-C would not work.
+        * What we want is for Ctrl-C to work in the exec()-ed
+        * task, but being ignored by perf stat itself:
+        */
+       atexit(sig_atexit);
+       signal(SIGINT,  skip_signal);
+       signal(SIGALRM, skip_signal);
+       signal(SIGABRT, skip_signal);
+
+       return do_perf_stat(argc, argv);
+}
diff --git a/tools/perf/builtin-top.c b/tools/perf/builtin-top.c

new file mode 100644 (file)

index 0000000..fe338d3
--- /dev/null
+++ b/tools/perf/builtin-top.c
@@ -0,0 +1,736 @@
+/*
+ * builtin-top.c
+ *
+ * Builtin top command: Display a continuously updated profile of
+ * any workload, CPU or specific PID.
+ *
+ * Copyright (C) 2008, Red Hat Inc, Ingo Molnar <mingo@redhat.com>
+ *
+ * Improvements and fixes by:
+ *
+ *   Arjan van de Ven <arjan@linux.intel.com>
+ *   Yanmin Zhang <yanmin.zhang@intel.com>
+ *   Wu Fengguang <fengguang.wu@intel.com>
+ *   Mike Galbraith <efault@gmx.de>
+ *   Paul Mackerras <paulus@samba.org>
+ *
+ * Released under the GPL v2. (and only v2, not any later version)
+ */
+#include "builtin.h"
+
+#include "perf.h"
+
+#include "util/symbol.h"
+#include "util/color.h"
+#include "util/util.h"
+#include "util/rbtree.h"
+#include "util/parse-options.h"
+#include "util/parse-events.h"
+
+#include <assert.h>
+#include <fcntl.h>
+
+#include <stdio.h>
+
+#include <errno.h>
+#include <time.h>
+#include <sched.h>
+#include <pthread.h>
+
+#include <sys/syscall.h>
+#include <sys/ioctl.h>
+#include <sys/poll.h>
+#include <sys/prctl.h>
+#include <sys/wait.h>
+#include <sys/uio.h>
+#include <sys/mman.h>
+
+#include <linux/unistd.h>
+#include <linux/types.h>
+
+static int                     fd[MAX_NR_CPUS][MAX_COUNTERS];
+
+static int                     system_wide                     =  0;
+
+static int                     default_interval                = 100000;
+
+static __u64                   count_filter                    =  5;
+static int                     print_entries                   = 15;
+
+static int                     target_pid                      = -1;
+static int                     profile_cpu                     = -1;
+static int                     nr_cpus                         =  0;
+static unsigned int            realtime_prio                   =  0;
+static int                     group                           =  0;
+static unsigned int            page_size;
+static unsigned int            mmap_pages                      = 16;
+static int                     freq                            =  0;
+static int                     verbose                         =  0;
+
+static char                    *sym_filter;
+static unsigned long           filter_start;
+static unsigned long           filter_end;
+
+static int                     delay_secs                      =  2;
+static int                     zero;
+static int                     dump_symtab;
+
+/*
+ * Symbols
+ */
+
+static __u64                   min_ip;
+static __u64                   max_ip = -1ll;
+
+struct sym_entry {
+       struct rb_node          rb_node;
+       struct list_head        node;
+       unsigned long           count[MAX_COUNTERS];
+       unsigned long           snap_count;
+       double                  weight;
+       int                     skip;
+};
+
+struct sym_entry               *sym_filter_entry;
+
+struct dso                     *kernel_dso;
+
+/*
+ * Symbols will be added here in record_ip and will get out
+ * after decayed.
+ */
+static LIST_HEAD(active_symbols);
+static pthread_mutex_t active_symbols_lock = PTHREAD_MUTEX_INITIALIZER;
+
+/*
+ * Ordering weight: count-1 * count-2 * ... / count-n
+ */
+static double sym_weight(const struct sym_entry *sym)
+{
+       double weight = sym->snap_count;
+       int counter;
+
+       for (counter = 1; counter < nr_counters-1; counter++)
+               weight *= sym->count[counter];
+
+       weight /= (sym->count[counter] + 1);
+
+       return weight;
+}
+
+static long                    samples;
+static long                    userspace_samples;
+static const char              CONSOLE_CLEAR[] = "\e[H\e[2J";
+
+static void __list_insert_active_sym(struct sym_entry *syme)
+{
+       list_add(&syme->node, &active_symbols);
+}
+
+static void list_remove_active_sym(struct sym_entry *syme)
+{
+       pthread_mutex_lock(&active_symbols_lock);
+       list_del_init(&syme->node);
+       pthread_mutex_unlock(&active_symbols_lock);
+}
+
+static void rb_insert_active_sym(struct rb_root *tree, struct sym_entry *se)
+{
+       struct rb_node **p = &tree->rb_node;
+       struct rb_node *parent = NULL;
+       struct sym_entry *iter;
+
+       while (*p != NULL) {
+               parent = *p;
+               iter = rb_entry(parent, struct sym_entry, rb_node);
+
+               if (se->weight > iter->weight)
+                       p = &(*p)->rb_left;
+               else
+                       p = &(*p)->rb_right;
+       }
+
+       rb_link_node(&se->rb_node, parent, p);
+       rb_insert_color(&se->rb_node, tree);
+}
+
+static void print_sym_table(void)
+{
+       int printed = 0, j;
+       int counter;
+       float samples_per_sec = samples/delay_secs;
+       float ksamples_per_sec = (samples-userspace_samples)/delay_secs;
+       float sum_ksamples = 0.0;
+       struct sym_entry *syme, *n;
+       struct rb_root tmp = RB_ROOT;
+       struct rb_node *nd;
+
+       samples = userspace_samples = 0;
+
+       /* Sort the active symbols */
+       pthread_mutex_lock(&active_symbols_lock);
+       syme = list_entry(active_symbols.next, struct sym_entry, node);
+       pthread_mutex_unlock(&active_symbols_lock);
+
+       list_for_each_entry_safe_from(syme, n, &active_symbols, node) {
+               syme->snap_count = syme->count[0];
+               if (syme->snap_count != 0) {
+                       syme->weight = sym_weight(syme);
+                       rb_insert_active_sym(&tmp, syme);
+                       sum_ksamples += syme->snap_count;
+
+                       for (j = 0; j < nr_counters; j++)
+                               syme->count[j] = zero ? 0 : syme->count[j] * 7 / 8;
+               } else
+                       list_remove_active_sym(syme);
+       }
+
+       puts(CONSOLE_CLEAR);
+
+       printf(
+"------------------------------------------------------------------------------\n");
+       printf( "   PerfTop:%8.0f irqs/sec  kernel:%4.1f%% [",
+               samples_per_sec,
+               100.0 - (100.0*((samples_per_sec-ksamples_per_sec)/samples_per_sec)));
+
+       if (nr_counters == 1) {
+               printf("%Ld", attrs[0].sample_period);
+               if (freq)
+                       printf("Hz ");
+               else
+                       printf(" ");
+       }
+
+       for (counter = 0; counter < nr_counters; counter++) {
+               if (counter)
+                       printf("/");
+
+               printf("%s", event_name(counter));
+       }
+
+       printf( "], ");
+
+       if (target_pid != -1)
+               printf(" (target_pid: %d", target_pid);
+       else
+               printf(" (all");
+
+       if (profile_cpu != -1)
+               printf(", cpu: %d)\n", profile_cpu);
+       else {
+               if (target_pid != -1)
+                       printf(")\n");
+               else
+                       printf(", %d CPUs)\n", nr_cpus);
+       }
+
+       printf("------------------------------------------------------------------------------\n\n");
+
+       if (nr_counters == 1)
+               printf("             samples    pcnt");
+       else
+               printf("  weight     samples    pcnt");
+
+       printf("         RIP          kernel function\n"
+                      "  ______     _______   _____   ________________   _______________\n\n"
+       );
+
+       for (nd = rb_first(&tmp); nd; nd = rb_next(nd)) {
+               struct sym_entry *syme = rb_entry(nd, struct sym_entry, rb_node);
+               struct symbol *sym = (struct symbol *)(syme + 1);
+               char *color = PERF_COLOR_NORMAL;
+               double pcnt;
+
+               if (++printed > print_entries || syme->snap_count < count_filter)
+                       continue;
+
+               pcnt = 100.0 - (100.0 * ((sum_ksamples - syme->snap_count) /
+                                        sum_ksamples));
+
+               /*
+                * We color high-overhead entries in red, mid-overhead
+                * entries in green - and keep the low overhead places
+                * normal:
+                */
+               if (pcnt >= 5.0) {
+                       color = PERF_COLOR_RED;
+               } else {
+                       if (pcnt >= 0.5)
+                               color = PERF_COLOR_GREEN;
+               }
+
+               if (nr_counters == 1)
+                       printf("%20.2f - ", syme->weight);
+               else
+                       printf("%9.1f %10ld - ", syme->weight, syme->snap_count);
+
+               color_fprintf(stdout, color, "%4.1f%%", pcnt);
+               printf(" - %016llx : %s\n", sym->start, sym->name);
+       }
+}
+
+static void *display_thread(void *arg)
+{
+       struct pollfd stdin_poll = { .fd = 0, .events = POLLIN };
+       int delay_msecs = delay_secs * 1000;
+
+       printf("PerfTop refresh period: %d seconds\n", delay_secs);
+
+       do {
+               print_sym_table();
+       } while (!poll(&stdin_poll, 1, delay_msecs) == 1);
+
+       printf("key pressed - exiting.\n");
+       exit(0);
+
+       return NULL;
+}
+
+static int symbol_filter(struct dso *self, struct symbol *sym)
+{
+       static int filter_match;
+       struct sym_entry *syme;
+       const char *name = sym->name;
+
+       if (!strcmp(name, "_text") ||
+           !strcmp(name, "_etext") ||
+           !strcmp(name, "_sinittext") ||
+           !strncmp("init_module", name, 11) ||
+           !strncmp("cleanup_module", name, 14) ||
+           strstr(name, "_text_start") ||
+           strstr(name, "_text_end"))
+               return 1;
+
+       syme = dso__sym_priv(self, sym);
+       /* Tag samples to be skipped. */
+       if (!strcmp("default_idle", name) ||
+           !strcmp("cpu_idle", name) ||
+           !strcmp("enter_idle", name) ||
+           !strcmp("exit_idle", name) ||
+           !strcmp("mwait_idle", name))
+               syme->skip = 1;
+
+       if (filter_match == 1) {
+               filter_end = sym->start;
+               filter_match = -1;
+               if (filter_end - filter_start > 10000) {
+                       fprintf(stderr,
+                               "hm, too large filter symbol <%s> - skipping.\n",
+                               sym_filter);
+                       fprintf(stderr, "symbol filter start: %016lx\n",
+                               filter_start);
+                       fprintf(stderr, "                end: %016lx\n",
+                               filter_end);
+                       filter_end = filter_start = 0;
+                       sym_filter = NULL;
+                       sleep(1);
+               }
+       }
+
+       if (filter_match == 0 && sym_filter && !strcmp(name, sym_filter)) {
+               filter_match = 1;
+               filter_start = sym->start;
+       }
+
+
+       return 0;
+}
+
+static int parse_symbols(void)
+{
+       struct rb_node *node;
+       struct symbol  *sym;
+
+       kernel_dso = dso__new("[kernel]", sizeof(struct sym_entry));
+       if (kernel_dso == NULL)
+               return -1;
+
+       if (dso__load_kernel(kernel_dso, NULL, symbol_filter, 1) != 0)
+               goto out_delete_dso;
+
+       node = rb_first(&kernel_dso->syms);
+       sym = rb_entry(node, struct symbol, rb_node);
+       min_ip = sym->start;
+
+       node = rb_last(&kernel_dso->syms);
+       sym = rb_entry(node, struct symbol, rb_node);
+       max_ip = sym->end;
+
+       if (dump_symtab)
+               dso__fprintf(kernel_dso, stderr);
+
+       return 0;
+
+out_delete_dso:
+       dso__delete(kernel_dso);
+       kernel_dso = NULL;
+       return -1;
+}
+
+#define TRACE_COUNT     3
+
+/*
+ * Binary search in the histogram table and record the hit:
+ */
+static void record_ip(__u64 ip, int counter)
+{
+       struct symbol *sym = dso__find_symbol(kernel_dso, ip);
+
+       if (sym != NULL) {
+               struct sym_entry *syme = dso__sym_priv(kernel_dso, sym);
+
+               if (!syme->skip) {
+                       syme->count[counter]++;
+                       pthread_mutex_lock(&active_symbols_lock);
+                       if (list_empty(&syme->node) || !syme->node.next)
+                               __list_insert_active_sym(syme);
+                       pthread_mutex_unlock(&active_symbols_lock);
+                       return;
+               }
+       }
+
+       samples--;
+}
+
+static void process_event(__u64 ip, int counter)
+{
+       samples++;
+
+       if (ip < min_ip || ip > max_ip) {
+               userspace_samples++;
+               return;
+       }
+
+       record_ip(ip, counter);
+}
+
+struct mmap_data {
+       int                     counter;
+       void                    *base;
+       unsigned int            mask;
+       unsigned int            prev;
+};
+
+static unsigned int mmap_read_head(struct mmap_data *md)
+{
+       struct perf_counter_mmap_page *pc = md->base;
+       int head;
+
+       head = pc->data_head;
+       rmb();
+
+       return head;
+}
+
+struct timeval last_read, this_read;
+
+static void mmap_read_counter(struct mmap_data *md)
+{
+       unsigned int head = mmap_read_head(md);
+       unsigned int old = md->prev;
+       unsigned char *data = md->base + page_size;
+       int diff;
+
+       gettimeofday(&this_read, NULL);
+
+       /*
+        * If we're further behind than half the buffer, there's a chance
+        * the writer will bite our tail and mess up the samples under us.
+        *
+        * If we somehow ended up ahead of the head, we got messed up.
+        *
+        * In either case, truncate and restart at head.
+        */
+       diff = head - old;
+       if (diff > md->mask / 2 || diff < 0) {
+               struct timeval iv;
+               unsigned long msecs;
+
+               timersub(&this_read, &last_read, &iv);
+               msecs = iv.tv_sec*1000 + iv.tv_usec/1000;
+
+               fprintf(stderr, "WARNING: failed to keep up with mmap data."
+                               "  Last read %lu msecs ago.\n", msecs);
+
+               /*
+                * head points to a known good entry, start there.
+                */
+               old = head;
+       }
+
+       last_read = this_read;
+
+       for (; old != head;) {
+               struct ip_event {
+                       struct perf_event_header header;
+                       __u64 ip;
+                       __u32 pid, target_pid;
+               };
+               struct mmap_event {
+                       struct perf_event_header header;
+                       __u32 pid, target_pid;
+                       __u64 start;
+                       __u64 len;
+                       __u64 pgoff;
+                       char filename[PATH_MAX];
+               };
+
+               typedef union event_union {
+                       struct perf_event_header header;
+                       struct ip_event ip;
+                       struct mmap_event mmap;
+               } event_t;
+
+               event_t *event = (event_t *)&data[old & md->mask];
+
+               event_t event_copy;
+
+               size_t size = event->header.size;
+
+               /*
+                * Event straddles the mmap boundary -- header should always
+                * be inside due to u64 alignment of output.
+                */
+               if ((old & md->mask) + size != ((old + size) & md->mask)) {
+                       unsigned int offset = old;
+                       unsigned int len = min(sizeof(*event), size), cpy;
+                       void *dst = &event_copy;
+
+                       do {
+                               cpy = min(md->mask + 1 - (offset & md->mask), len);
+                               memcpy(dst, &data[offset & md->mask], cpy);
+                               offset += cpy;
+                               dst += cpy;
+                               len -= cpy;
+                       } while (len);
+
+                       event = &event_copy;
+               }
+
+               old += size;
+
+               if (event->header.misc & PERF_EVENT_MISC_OVERFLOW) {
+                       if (event->header.type & PERF_SAMPLE_IP)
+                               process_event(event->ip.ip, md->counter);
+               }
+       }
+
+       md->prev = old;
+}
+
+static struct pollfd event_array[MAX_NR_CPUS * MAX_COUNTERS];
+static struct mmap_data mmap_array[MAX_NR_CPUS][MAX_COUNTERS];
+
+static void mmap_read(void)
+{
+       int i, counter;
+
+       for (i = 0; i < nr_cpus; i++) {
+               for (counter = 0; counter < nr_counters; counter++)
+                       mmap_read_counter(&mmap_array[i][counter]);
+       }
+}
+
+int nr_poll;
+int group_fd;
+
+static void start_counter(int i, int counter)
+{
+       struct perf_counter_attr *attr;
+       unsigned int cpu;
+
+       cpu = profile_cpu;
+       if (target_pid == -1 && profile_cpu == -1)
+               cpu = i;
+
+       attr = attrs + counter;
+
+       attr->sample_type       = PERF_SAMPLE_IP | PERF_SAMPLE_TID;
+       attr->freq              = freq;
+
+try_again:
+       fd[i][counter] = sys_perf_counter_open(attr, target_pid, cpu, group_fd, 0);
+
+       if (fd[i][counter] < 0) {
+               int err = errno;
+
+               if (err == EPERM)
+                       die("No permission - are you root?\n");
+               /*
+                * If it's cycles then fall back to hrtimer
+                * based cpu-clock-tick sw counter, which
+                * is always available even if no PMU support:
+                */
+               if (attr->type == PERF_TYPE_HARDWARE
+                       && attr->config == PERF_COUNT_HW_CPU_CYCLES) {
+
+                       if (verbose)
+                               warning(" ... trying to fall back to cpu-clock-ticks\n");
+
+                       attr->type = PERF_TYPE_SOFTWARE;
+                       attr->config = PERF_COUNT_SW_CPU_CLOCK;
+                       goto try_again;
+               }
+               printf("\n");
+               error("perfcounter syscall returned with %d (%s)\n",
+                       fd[i][counter], strerror(err));
+               die("No CONFIG_PERF_COUNTERS=y kernel support configured?\n");
+               exit(-1);
+       }
+       assert(fd[i][counter] >= 0);
+       fcntl(fd[i][counter], F_SETFL, O_NONBLOCK);
+
+       /*
+        * First counter acts as the group leader:
+        */
+       if (group && group_fd == -1)
+               group_fd = fd[i][counter];
+
+       event_array[nr_poll].fd = fd[i][counter];
+       event_array[nr_poll].events = POLLIN;
+       nr_poll++;
+
+       mmap_array[i][counter].counter = counter;
+       mmap_array[i][counter].prev = 0;
+       mmap_array[i][counter].mask = mmap_pages*page_size - 1;
+       mmap_array[i][counter].base = mmap(NULL, (mmap_pages+1)*page_size,
+                       PROT_READ, MAP_SHARED, fd[i][counter], 0);
+       if (mmap_array[i][counter].base == MAP_FAILED)
+               die("failed to mmap with %d (%s)\n", errno, strerror(errno));
+}
+
+static int __cmd_top(void)
+{
+       pthread_t thread;
+       int i, counter;
+       int ret;
+
+       for (i = 0; i < nr_cpus; i++) {
+               group_fd = -1;
+               for (counter = 0; counter < nr_counters; counter++)
+                       start_counter(i, counter);
+       }
+
+       /* Wait for a minimal set of events before starting the snapshot */
+       poll(event_array, nr_poll, 100);
+
+       mmap_read();
+
+       if (pthread_create(&thread, NULL, display_thread, NULL)) {
+               printf("Could not create display thread.\n");
+               exit(-1);
+       }
+
+       if (realtime_prio) {
+               struct sched_param param;
+
+               param.sched_priority = realtime_prio;
+               if (sched_setscheduler(0, SCHED_FIFO, &param)) {
+                       printf("Could not set realtime priority.\n");
+                       exit(-1);
+               }
+       }
+
+       while (1) {
+               int hits = samples;
+
+               mmap_read();
+
+               if (hits == samples)
+                       ret = poll(event_array, nr_poll, 100);
+       }
+
+       return 0;
+}
+
+static const char * const top_usage[] = {
+       "perf top [<options>]",
+       NULL
+};
+
+static const struct option options[] = {
+       OPT_CALLBACK('e', "event", NULL, "event",
+                    "event selector. use 'perf list' to list available events",
+                    parse_events),
+       OPT_INTEGER('c', "count", &default_interval,
+                   "event period to sample"),
+       OPT_INTEGER('p', "pid", &target_pid,
+                   "profile events on existing pid"),
+       OPT_BOOLEAN('a', "all-cpus", &system_wide,
+                           "system-wide collection from all CPUs"),
+       OPT_INTEGER('C', "CPU", &profile_cpu,
+                   "CPU to profile on"),
+       OPT_INTEGER('m', "mmap-pages", &mmap_pages,
+                   "number of mmap data pages"),
+       OPT_INTEGER('r', "realtime", &realtime_prio,
+                   "collect data with this RT SCHED_FIFO priority"),
+       OPT_INTEGER('d', "delay", &delay_secs,
+                   "number of seconds to delay between refreshes"),
+       OPT_BOOLEAN('D', "dump-symtab", &dump_symtab,
+                           "dump the symbol table used for profiling"),
+       OPT_INTEGER('f', "count-filter", &count_filter,
+                   "only display functions with more events than this"),
+       OPT_BOOLEAN('g', "group", &group,
+                           "put the counters into a counter group"),
+       OPT_STRING('s', "sym-filter", &sym_filter, "pattern",
+                   "only display symbols matchig this pattern"),
+       OPT_BOOLEAN('z', "zero", &group,
+                   "zero history across updates"),
+       OPT_INTEGER('F', "freq", &freq,
+                   "profile at this frequency"),
+       OPT_INTEGER('E', "entries", &print_entries,
+                   "display this many functions"),
+       OPT_BOOLEAN('v', "verbose", &verbose,
+                   "be more verbose (show counter open errors, etc)"),
+       OPT_END()
+};
+
+int cmd_top(int argc, const char **argv, const char *prefix)
+{
+       int counter;
+
+       page_size = sysconf(_SC_PAGE_SIZE);
+
+       argc = parse_options(argc, argv, options, top_usage, 0);
+       if (argc)
+               usage_with_options(top_usage, options);
+
+       if (freq) {
+               default_interval = freq;
+               freq = 1;
+       }
+
+       /* CPU and PID are mutually exclusive */
+       if (target_pid != -1 && profile_cpu != -1) {
+               printf("WARNING: PID switch overriding CPU\n");
+               sleep(1);
+               profile_cpu = -1;
+       }
+
+       if (!nr_counters)
+               nr_counters = 1;
+
+       if (delay_secs < 1)
+               delay_secs = 1;
+
+       parse_symbols();
+
+       /*
+        * Fill in the ones not specifically initialized via -c:
+        */
+       for (counter = 0; counter < nr_counters; counter++) {
+               if (attrs[counter].sample_period)
+                       continue;
+
+               attrs[counter].sample_period = default_interval;
+       }
+
+       nr_cpus = sysconf(_SC_NPROCESSORS_ONLN);
+       assert(nr_cpus <= MAX_NR_CPUS);
+       assert(nr_cpus >= 0);
+
+       if (target_pid != -1 || profile_cpu != -1)
+               nr_cpus = 1;
+
+       return __cmd_top();
+}
diff --git a/tools/perf/builtin.h b/tools/perf/builtin.h

new file mode 100644 (file)

index 0000000..51d1682
--- /dev/null
+++ b/tools/perf/builtin.h
@@ -0,0 +1,26 @@
+#ifndef BUILTIN_H
+#define BUILTIN_H
+
+#include "util/util.h"
+#include "util/strbuf.h"
+
+extern const char perf_version_string[];
+extern const char perf_usage_string[];
+extern const char perf_more_info_string[];
+
+extern void list_common_cmds_help(void);
+extern const char *help_unknown_cmd(const char *cmd);
+extern void prune_packed_objects(int);
+extern int read_line_with_nul(char *buf, int size, FILE *file);
+extern int check_pager_config(const char *cmd);
+
+extern int cmd_annotate(int argc, const char **argv, const char *prefix);
+extern int cmd_help(int argc, const char **argv, const char *prefix);
+extern int cmd_record(int argc, const char **argv, const char *prefix);
+extern int cmd_report(int argc, const char **argv, const char *prefix);
+extern int cmd_stat(int argc, const char **argv, const char *prefix);
+extern int cmd_top(int argc, const char **argv, const char *prefix);
+extern int cmd_version(int argc, const char **argv, const char *prefix);
+extern int cmd_list(int argc, const char **argv, const char *prefix);
+
+#endif
diff --git a/tools/perf/command-list.txt b/tools/perf/command-list.txt

new file mode 100644 (file)

index 0000000..eebce30
--- /dev/null
+++ b/tools/perf/command-list.txt
@@ -0,0 +1,10 @@
+#
+# List of known perf commands.
+# command name                 category [deprecated] [common]
+#
+perf-annotate                  mainporcelain common
+perf-list                      mainporcelain common
+perf-record                    mainporcelain common
+perf-report                    mainporcelain common
+perf-stat                      mainporcelain common
+perf-top                       mainporcelain common
diff --git a/tools/perf/design.txt b/tools/perf/design.txt

new file mode 100644 (file)

index 0000000..860e116
--- /dev/null
+++ b/tools/perf/design.txt
@@ -0,0 +1,442 @@
+
+Performance Counters for Linux
+------------------------------
+
+Performance counters are special hardware registers available on most modern
+CPUs. These registers count the number of certain types of hw events: such
+as instructions executed, cachemisses suffered, or branches mis-predicted -
+without slowing down the kernel or applications. These registers can also
+trigger interrupts when a threshold number of events have passed - and can
+thus be used to profile the code that runs on that CPU.
+
+The Linux Performance Counter subsystem provides an abstraction of these
+hardware capabilities. It provides per task and per CPU counters, counter
+groups, and it provides event capabilities on top of those.  It
+provides "virtual" 64-bit counters, regardless of the width of the
+underlying hardware counters.
+
+Performance counters are accessed via special file descriptors.
+There's one file descriptor per virtual counter used.
+
+The special file descriptor is opened via the perf_counter_open()
+system call:
+
+   int sys_perf_counter_open(struct perf_counter_hw_event *hw_event_uptr,
+                            pid_t pid, int cpu, int group_fd,
+                            unsigned long flags);
+
+The syscall returns the new fd. The fd can be used via the normal
+VFS system calls: read() can be used to read the counter, fcntl()
+can be used to set the blocking mode, etc.
+
+Multiple counters can be kept open at a time, and the counters
+can be poll()ed.
+
+When creating a new counter fd, 'perf_counter_hw_event' is:
+
+struct perf_counter_hw_event {
+        /*
+         * The MSB of the config word signifies if the rest contains cpu
+         * specific (raw) counter configuration data, if unset, the next
+         * 7 bits are an event type and the rest of the bits are the event
+         * identifier.
+         */
+        __u64                   config;
+
+        __u64                   irq_period;
+        __u32                   record_type;
+        __u32                   read_format;
+
+        __u64                   disabled       :  1, /* off by default        */
+                                inherit        :  1, /* children inherit it   */
+                                pinned         :  1, /* must always be on PMU */
+                                exclusive      :  1, /* only group on PMU     */
+                                exclude_user   :  1, /* don't count user      */
+                                exclude_kernel :  1, /* ditto kernel          */
+                                exclude_hv     :  1, /* ditto hypervisor      */
+                                exclude_idle   :  1, /* don't count when idle */
+                                mmap           :  1, /* include mmap data     */
+                                munmap         :  1, /* include munmap data   */
+                                comm           :  1, /* include comm data     */
+
+                                __reserved_1   : 52;
+
+        __u32                   extra_config_len;
+        __u32                   wakeup_events;  /* wakeup every n events */
+
+        __u64                   __reserved_2;
+        __u64                   __reserved_3;
+};
+
+The 'config' field specifies what the counter should count.  It
+is divided into 3 bit-fields:
+
+raw_type: 1 bit   (most significant bit)       0x8000_0000_0000_0000
+type:    7 bits  (next most significant)       0x7f00_0000_0000_0000
+event_id: 56 bits (least significant)          0x00ff_ffff_ffff_ffff
+
+If 'raw_type' is 1, then the counter will count a hardware event
+specified by the remaining 63 bits of event_config.  The encoding is
+machine-specific.
+
+If 'raw_type' is 0, then the 'type' field says what kind of counter
+this is, with the following encoding:
+
+enum perf_event_types {
+       PERF_TYPE_HARDWARE              = 0,
+       PERF_TYPE_SOFTWARE              = 1,
+       PERF_TYPE_TRACEPOINT            = 2,
+};
+
+A counter of PERF_TYPE_HARDWARE will count the hardware event
+specified by 'event_id':
+
+/*
+ * Generalized performance counter event types, used by the hw_event.event_id
+ * parameter of the sys_perf_counter_open() syscall:
+ */
+enum hw_event_ids {
+       /*
+        * Common hardware events, generalized by the kernel:
+        */
+       PERF_COUNT_HW_CPU_CYCLES                = 0,
+       PERF_COUNT_HW_INSTRUCTIONS              = 1,
+       PERF_COUNT_HW_CACHE_REFERENCES  = 2,
+       PERF_COUNT_HW_CACHE_MISSES              = 3,
+       PERF_COUNT_HW_BRANCH_INSTRUCTIONS       = 4,
+       PERF_COUNT_HW_BRANCH_MISSES     = 5,
+       PERF_COUNT_HW_BUS_CYCLES                = 6,
+};
+
+These are standardized types of events that work relatively uniformly
+on all CPUs that implement Performance Counters support under Linux,
+although there may be variations (e.g., different CPUs might count
+cache references and misses at different levels of the cache hierarchy).
+If a CPU is not able to count the selected event, then the system call
+will return -EINVAL.
+
+More hw_event_types are supported as well, but they are CPU-specific
+and accessed as raw events.  For example, to count "External bus
+cycles while bus lock signal asserted" events on Intel Core CPUs, pass
+in a 0x4064 event_id value and set hw_event.raw_type to 1.
+
+A counter of type PERF_TYPE_SOFTWARE will count one of the available
+software events, selected by 'event_id':
+
+/*
+ * Special "software" counters provided by the kernel, even if the hardware
+ * does not support performance counters. These counters measure various
+ * physical and sw events of the kernel (and allow the profiling of them as
+ * well):
+ */
+enum sw_event_ids {
+       PERF_COUNT_SW_CPU_CLOCK         = 0,
+       PERF_COUNT_SW_TASK_CLOCK                = 1,
+       PERF_COUNT_SW_PAGE_FAULTS               = 2,
+       PERF_COUNT_SW_CONTEXT_SWITCHES  = 3,
+       PERF_COUNT_SW_CPU_MIGRATIONS    = 4,
+       PERF_COUNT_SW_PAGE_FAULTS_MIN   = 5,
+       PERF_COUNT_SW_PAGE_FAULTS_MAJ   = 6,
+};
+
+Counters of the type PERF_TYPE_TRACEPOINT are available when the ftrace event
+tracer is available, and event_id values can be obtained from
+/debug/tracing/events/*/*/id
+
+
+Counters come in two flavours: counting counters and sampling
+counters.  A "counting" counter is one that is used for counting the
+number of events that occur, and is characterised by having
+irq_period = 0.
+
+
+A read() on a counter returns the current value of the counter and possible
+additional values as specified by 'read_format', each value is a u64 (8 bytes)
+in size.
+
+/*
+ * Bits that can be set in hw_event.read_format to request that
+ * reads on the counter should return the indicated quantities,
+ * in increasing order of bit value, after the counter value.
+ */
+enum perf_counter_read_format {
+        PERF_FORMAT_TOTAL_TIME_ENABLED  =  1,
+        PERF_FORMAT_TOTAL_TIME_RUNNING  =  2,
+};
+
+Using these additional values one can establish the overcommit ratio for a
+particular counter allowing one to take the round-robin scheduling effect
+into account.
+
+
+A "sampling" counter is one that is set up to generate an interrupt
+every N events, where N is given by 'irq_period'.  A sampling counter
+has irq_period > 0. The record_type controls what data is recorded on each
+interrupt:
+
+/*
+ * Bits that can be set in hw_event.record_type to request information
+ * in the overflow packets.
+ */
+enum perf_counter_record_format {
+        PERF_RECORD_IP          = 1U << 0,
+        PERF_RECORD_TID         = 1U << 1,
+        PERF_RECORD_TIME        = 1U << 2,
+        PERF_RECORD_ADDR        = 1U << 3,
+        PERF_RECORD_GROUP       = 1U << 4,
+        PERF_RECORD_CALLCHAIN   = 1U << 5,
+};
+
+Such (and other) events will be recorded in a ring-buffer, which is
+available to user-space using mmap() (see below).
+
+The 'disabled' bit specifies whether the counter starts out disabled
+or enabled.  If it is initially disabled, it can be enabled by ioctl
+or prctl (see below).
+
+The 'inherit' bit, if set, specifies that this counter should count
+events on descendant tasks as well as the task specified.  This only
+applies to new descendents, not to any existing descendents at the
+time the counter is created (nor to any new descendents of existing
+descendents).
+
+The 'pinned' bit, if set, specifies that the counter should always be
+on the CPU if at all possible.  It only applies to hardware counters
+and only to group leaders.  If a pinned counter cannot be put onto the
+CPU (e.g. because there are not enough hardware counters or because of
+a conflict with some other event), then the counter goes into an
+'error' state, where reads return end-of-file (i.e. read() returns 0)
+until the counter is subsequently enabled or disabled.
+
+The 'exclusive' bit, if set, specifies that when this counter's group
+is on the CPU, it should be the only group using the CPU's counters.
+In future, this will allow sophisticated monitoring programs to supply
+extra configuration information via 'extra_config_len' to exploit
+advanced features of the CPU's Performance Monitor Unit (PMU) that are
+not otherwise accessible and that might disrupt other hardware
+counters.
+
+The 'exclude_user', 'exclude_kernel' and 'exclude_hv' bits provide a
+way to request that counting of events be restricted to times when the
+CPU is in user, kernel and/or hypervisor mode.
+
+The 'mmap' and 'munmap' bits allow recording of PROT_EXEC mmap/munmap
+operations, these can be used to relate userspace IP addresses to actual
+code, even after the mapping (or even the whole process) is gone,
+these events are recorded in the ring-buffer (see below).
+
+The 'comm' bit allows tracking of process comm data on process creation.
+This too is recorded in the ring-buffer (see below).
+
+The 'pid' parameter to the perf_counter_open() system call allows the
+counter to be specific to a task:
+
+ pid == 0: if the pid parameter is zero, the counter is attached to the
+ current task.
+
+ pid > 0: the counter is attached to a specific task (if the current task
+ has sufficient privilege to do so)
+
+ pid < 0: all tasks are counted (per cpu counters)
+
+The 'cpu' parameter allows a counter to be made specific to a CPU:
+
+ cpu >= 0: the counter is restricted to a specific CPU
+ cpu == -1: the counter counts on all CPUs
+
+(Note: the combination of 'pid == -1' and 'cpu == -1' is not valid.)
+
+A 'pid > 0' and 'cpu == -1' counter is a per task counter that counts
+events of that task and 'follows' that task to whatever CPU the task
+gets schedule to. Per task counters can be created by any user, for
+their own tasks.
+
+A 'pid == -1' and 'cpu == x' counter is a per CPU counter that counts
+all events on CPU-x. Per CPU counters need CAP_SYS_ADMIN privilege.
+
+The 'flags' parameter is currently unused and must be zero.
+
+The 'group_fd' parameter allows counter "groups" to be set up.  A
+counter group has one counter which is the group "leader".  The leader
+is created first, with group_fd = -1 in the perf_counter_open call
+that creates it.  The rest of the group members are created
+subsequently, with group_fd giving the fd of the group leader.
+(A single counter on its own is created with group_fd = -1 and is
+considered to be a group with only 1 member.)
+
+A counter group is scheduled onto the CPU as a unit, that is, it will
+only be put onto the CPU if all of the counters in the group can be
+put onto the CPU.  This means that the values of the member counters
+can be meaningfully compared, added, divided (to get ratios), etc.,
+with each other, since they have counted events for the same set of
+executed instructions.
+
+
+Like stated, asynchronous events, like counter overflow or PROT_EXEC mmap
+tracking are logged into a ring-buffer. This ring-buffer is created and
+accessed through mmap().
+
+The mmap size should be 1+2^n pages, where the first page is a meta-data page
+(struct perf_counter_mmap_page) that contains various bits of information such
+as where the ring-buffer head is.
+
+/*
+ * Structure of the page that can be mapped via mmap
+ */
+struct perf_counter_mmap_page {
+        __u32   version;                /* version number of this structure */
+        __u32   compat_version;         /* lowest version this is compat with */
+
+        /*
+         * Bits needed to read the hw counters in user-space.
+         *
+         *   u32 seq;
+         *   s64 count;
+         *
+         *   do {
+         *     seq = pc->lock;
+         *
+         *     barrier()
+         *     if (pc->index) {
+         *       count = pmc_read(pc->index - 1);
+         *       count += pc->offset;
+         *     } else
+         *       goto regular_read;
+         *
+         *     barrier();
+         *   } while (pc->lock != seq);
+         *
+         * NOTE: for obvious reason this only works on self-monitoring
+         *       processes.
+         */
+        __u32   lock;                   /* seqlock for synchronization */
+        __u32   index;                  /* hardware counter identifier */
+        __s64   offset;                 /* add to hardware counter value */
+
+        /*
+         * Control data for the mmap() data buffer.
+         *
+         * User-space reading this value should issue an rmb(), on SMP capable
+         * platforms, after reading this value -- see perf_counter_wakeup().
+         */
+        __u32   data_head;              /* head in the data section */
+};
+
+NOTE: the hw-counter userspace bits are arch specific and are currently only
+      implemented on powerpc.
+
+The following 2^n pages are the ring-buffer which contains events of the form:
+
+#define PERF_EVENT_MISC_KERNEL          (1 << 0)
+#define PERF_EVENT_MISC_USER            (1 << 1)
+#define PERF_EVENT_MISC_OVERFLOW        (1 << 2)
+
+struct perf_event_header {
+        __u32   type;
+        __u16   misc;
+        __u16   size;
+};
+
+enum perf_event_type {
+
+        /*
+         * The MMAP events record the PROT_EXEC mappings so that we can
+         * correlate userspace IPs to code. They have the following structure:
+         *
+         * struct {
+         *      struct perf_event_header        header;
+         *
+         *      u32                             pid, tid;
+         *      u64                             addr;
+         *      u64                             len;
+         *      u64                             pgoff;
+         *      char                            filename[];
+         * };
+         */
+        PERF_EVENT_MMAP                 = 1,
+        PERF_EVENT_MUNMAP               = 2,
+
+        /*
+         * struct {
+         *      struct perf_event_header        header;
+         *
+         *      u32                             pid, tid;
+         *      char                            comm[];
+         * };
+         */
+        PERF_EVENT_COMM                 = 3,
+
+        /*
+         * When header.misc & PERF_EVENT_MISC_OVERFLOW the event_type field
+         * will be PERF_RECORD_*
+         *
+         * struct {
+         *      struct perf_event_header        header;
+         *
+         *      { u64                   ip;       } && PERF_RECORD_IP
+         *      { u32                   pid, tid; } && PERF_RECORD_TID
+         *      { u64                   time;     } && PERF_RECORD_TIME
+         *      { u64                   addr;     } && PERF_RECORD_ADDR
+         *
+         *      { u64                   nr;
+         *        { u64 event, val; }   cnt[nr];  } && PERF_RECORD_GROUP
+         *
+         *      { u16                   nr,
+         *                              hv,
+         *                              kernel,
+         *                              user;
+         *        u64                   ips[nr];  } && PERF_RECORD_CALLCHAIN
+         * };
+         */
+};
+
+NOTE: PERF_RECORD_CALLCHAIN is arch specific and currently only implemented
+      on x86.
+
+Notification of new events is possible through poll()/select()/epoll() and
+fcntl() managing signals.
+
+Normally a notification is generated for every page filled, however one can
+additionally set perf_counter_hw_event.wakeup_events to generate one every
+so many counter overflow events.
+
+Future work will include a splice() interface to the ring-buffer.
+
+
+Counters can be enabled and disabled in two ways: via ioctl and via
+prctl.  When a counter is disabled, it doesn't count or generate
+events but does continue to exist and maintain its count value.
+
+An individual counter or counter group can be enabled with
+
+       ioctl(fd, PERF_COUNTER_IOC_ENABLE);
+
+or disabled with
+
+       ioctl(fd, PERF_COUNTER_IOC_DISABLE);
+
+Enabling or disabling the leader of a group enables or disables the
+whole group; that is, while the group leader is disabled, none of the
+counters in the group will count.  Enabling or disabling a member of a
+group other than the leader only affects that counter - disabling an
+non-leader stops that counter from counting but doesn't affect any
+other counter.
+
+Additionally, non-inherited overflow counters can use
+
+       ioctl(fd, PERF_COUNTER_IOC_REFRESH, nr);
+
+to enable a counter for 'nr' events, after which it gets disabled again.
+
+A process can enable or disable all the counter groups that are
+attached to it, using prctl:
+
+       prctl(PR_TASK_PERF_COUNTERS_ENABLE);
+
+       prctl(PR_TASK_PERF_COUNTERS_DISABLE);
+
+This applies to all counters on the current process, whether created
+by this process or by another, and doesn't affect any counters that
+this process has created on other processes.  It only enables or
+disables the group leaders, not any other members in the groups.
+
diff --git a/tools/perf/perf.c b/tools/perf/perf.c

new file mode 100644 (file)

index 0000000..4eb7259
--- /dev/null
+++ b/tools/perf/perf.c
@@ -0,0 +1,428 @@
+/*
+ * perf.c
+ *
+ * Performance analysis utility.
+ *
+ * This is the main hub from which the sub-commands (perf stat,
+ * perf top, perf record, perf report, etc.) are started.
+ */
+#include "builtin.h"
+
+#include "util/exec_cmd.h"
+#include "util/cache.h"
+#include "util/quote.h"
+#include "util/run-command.h"
+
+const char perf_usage_string[] =
+       "perf [--version] [--help] COMMAND [ARGS]";
+
+const char perf_more_info_string[] =
+       "See 'perf help COMMAND' for more information on a specific command.";
+
+static int use_pager = -1;
+struct pager_config {
+       const char *cmd;
+       int val;
+};
+
+static int pager_command_config(const char *var, const char *value, void *data)
+{
+       struct pager_config *c = data;
+       if (!prefixcmp(var, "pager.") && !strcmp(var + 6, c->cmd))
+               c->val = perf_config_bool(var, value);
+       return 0;
+}
+
+/* returns 0 for "no pager", 1 for "use pager", and -1 for "not specified" */
+int check_pager_config(const char *cmd)
+{
+       struct pager_config c;
+       c.cmd = cmd;
+       c.val = -1;
+       perf_config(pager_command_config, &c);
+       return c.val;
+}
+
+static void commit_pager_choice(void) {
+       switch (use_pager) {
+       case 0:
+               setenv("PERF_PAGER", "cat", 1);
+               break;
+       case 1:
+               /* setup_pager(); */
+               break;
+       default:
+               break;
+       }
+}
+
+static int handle_options(const char*** argv, int* argc, int* envchanged)
+{
+       int handled = 0;
+
+       while (*argc > 0) {
+               const char *cmd = (*argv)[0];
+               if (cmd[0] != '-')
+                       break;
+
+               /*
+                * For legacy reasons, the "version" and "help"
+                * commands can be written with "--" prepended
+                * to make them look like flags.
+                */
+               if (!strcmp(cmd, "--help") || !strcmp(cmd, "--version"))
+                       break;
+
+               /*
+                * Check remaining flags.
+                */
+               if (!prefixcmp(cmd, "--exec-path")) {
+                       cmd += 11;
+                       if (*cmd == '=')
+                               perf_set_argv_exec_path(cmd + 1);
+                       else {
+                               puts(perf_exec_path());
+                               exit(0);
+                       }
+               } else if (!strcmp(cmd, "--html-path")) {
+                       puts(system_path(PERF_HTML_PATH));
+                       exit(0);
+               } else if (!strcmp(cmd, "-p") || !strcmp(cmd, "--paginate")) {
+                       use_pager = 1;
+               } else if (!strcmp(cmd, "--no-pager")) {
+                       use_pager = 0;
+                       if (envchanged)
+                               *envchanged = 1;
+               } else if (!strcmp(cmd, "--perf-dir")) {
+                       if (*argc < 2) {
+                               fprintf(stderr, "No directory given for --perf-dir.\n" );
+                               usage(perf_usage_string);
+                       }
+                       setenv(PERF_DIR_ENVIRONMENT, (*argv)[1], 1);
+                       if (envchanged)
+                               *envchanged = 1;
+                       (*argv)++;
+                       (*argc)--;
+                       handled++;
+               } else if (!prefixcmp(cmd, "--perf-dir=")) {
+                       setenv(PERF_DIR_ENVIRONMENT, cmd + 10, 1);
+                       if (envchanged)
+                               *envchanged = 1;
+               } else if (!strcmp(cmd, "--work-tree")) {
+                       if (*argc < 2) {
+                               fprintf(stderr, "No directory given for --work-tree.\n" );
+                               usage(perf_usage_string);
+                       }
+                       setenv(PERF_WORK_TREE_ENVIRONMENT, (*argv)[1], 1);
+                       if (envchanged)
+                               *envchanged = 1;
+                       (*argv)++;
+                       (*argc)--;
+               } else if (!prefixcmp(cmd, "--work-tree=")) {
+                       setenv(PERF_WORK_TREE_ENVIRONMENT, cmd + 12, 1);
+                       if (envchanged)
+                               *envchanged = 1;
+               } else {
+                       fprintf(stderr, "Unknown option: %s\n", cmd);
+                       usage(perf_usage_string);
+               }
+
+               (*argv)++;
+               (*argc)--;
+               handled++;
+       }
+       return handled;
+}
+
+static int handle_alias(int *argcp, const char ***argv)
+{
+       int envchanged = 0, ret = 0, saved_errno = errno;
+       int count, option_count;
+       const char** new_argv;
+       const char *alias_command;
+       char *alias_string;
+
+       alias_command = (*argv)[0];
+       alias_string = alias_lookup(alias_command);
+       if (alias_string) {
+               if (alias_string[0] == '!') {
+                       if (*argcp > 1) {
+                               struct strbuf buf;
+
+                               strbuf_init(&buf, PATH_MAX);
+                               strbuf_addstr(&buf, alias_string);
+                               sq_quote_argv(&buf, (*argv) + 1, PATH_MAX);
+                               free(alias_string);
+                               alias_string = buf.buf;
+                       }
+                       ret = system(alias_string + 1);
+                       if (ret >= 0 && WIFEXITED(ret) &&
+                           WEXITSTATUS(ret) != 127)
+                               exit(WEXITSTATUS(ret));
+                       die("Failed to run '%s' when expanding alias '%s'",
+                           alias_string + 1, alias_command);
+               }
+               count = split_cmdline(alias_string, &new_argv);
+               if (count < 0)
+                       die("Bad alias.%s string", alias_command);
+               option_count = handle_options(&new_argv, &count, &envchanged);
+               if (envchanged)
+                       die("alias '%s' changes environment variables\n"
+                                "You can use '!perf' in the alias to do this.",
+                                alias_command);
+               memmove(new_argv - option_count, new_argv,
+                               count * sizeof(char *));
+               new_argv -= option_count;
+
+               if (count < 1)
+                       die("empty alias for %s", alias_command);
+
+               if (!strcmp(alias_command, new_argv[0]))
+                       die("recursive alias: %s", alias_command);
+
+               new_argv = realloc(new_argv, sizeof(char*) *
+                                   (count + *argcp + 1));
+               /* insert after command name */
+               memcpy(new_argv + count, *argv + 1, sizeof(char*) * *argcp);
+               new_argv[count+*argcp] = NULL;
+
+               *argv = new_argv;
+               *argcp += count - 1;
+
+               ret = 1;
+       }
+
+       errno = saved_errno;
+
+       return ret;
+}
+
+const char perf_version_string[] = PERF_VERSION;
+
+#define RUN_SETUP      (1<<0)
+#define USE_PAGER      (1<<1)
+/*
+ * require working tree to be present -- anything uses this needs
+ * RUN_SETUP for reading from the configuration file.
+ */
+#define NEED_WORK_TREE (1<<2)
+
+struct cmd_struct {
+       const char *cmd;
+       int (*fn)(int, const char **, const char *);
+       int option;
+};
+
+static int run_builtin(struct cmd_struct *p, int argc, const char **argv)
+{
+       int status;
+       struct stat st;
+       const char *prefix;
+
+       prefix = NULL;
+       if (p->option & RUN_SETUP)
+               prefix = NULL; /* setup_perf_directory(); */
+
+       if (use_pager == -1 && p->option & RUN_SETUP)
+               use_pager = check_pager_config(p->cmd);
+       if (use_pager == -1 && p->option & USE_PAGER)
+               use_pager = 1;
+       commit_pager_choice();
+
+       if (p->option & NEED_WORK_TREE)
+               /* setup_work_tree() */;
+
+       status = p->fn(argc, argv, prefix);
+       if (status)
+               return status & 0xff;
+
+       /* Somebody closed stdout? */
+       if (fstat(fileno(stdout), &st))
+               return 0;
+       /* Ignore write errors for pipes and sockets.. */
+       if (S_ISFIFO(st.st_mode) || S_ISSOCK(st.st_mode))
+               return 0;
+
+       /* Check for ENOSPC and EIO errors.. */
+       if (fflush(stdout))
+               die("write failure on standard output: %s", strerror(errno));
+       if (ferror(stdout))
+               die("unknown write failure on standard output");
+       if (fclose(stdout))
+               die("close failed on standard output: %s", strerror(errno));
+       return 0;
+}
+
+static void handle_internal_command(int argc, const char **argv)
+{
+       const char *cmd = argv[0];
+       static struct cmd_struct commands[] = {
+               { "help", cmd_help, 0 },
+               { "list", cmd_list, 0 },
+               { "record", cmd_record, 0 },
+               { "report", cmd_report, 0 },
+               { "stat", cmd_stat, 0 },
+               { "top", cmd_top, 0 },
+               { "annotate", cmd_annotate, 0 },
+               { "version", cmd_version, 0 },
+       };
+       int i;
+       static const char ext[] = STRIP_EXTENSION;
+
+       if (sizeof(ext) > 1) {
+               i = strlen(argv[0]) - strlen(ext);
+               if (i > 0 && !strcmp(argv[0] + i, ext)) {
+                       char *argv0 = strdup(argv[0]);
+                       argv[0] = cmd = argv0;
+                       argv0[i] = '\0';
+               }
+       }
+
+       /* Turn "perf cmd --help" into "perf help cmd" */
+       if (argc > 1 && !strcmp(argv[1], "--help")) {
+               argv[1] = argv[0];
+               argv[0] = cmd = "help";
+       }
+
+       for (i = 0; i < ARRAY_SIZE(commands); i++) {
+               struct cmd_struct *p = commands+i;
+               if (strcmp(p->cmd, cmd))
+                       continue;
+               exit(run_builtin(p, argc, argv));
+       }
+}
+
+static void execv_dashed_external(const char **argv)
+{
+       struct strbuf cmd = STRBUF_INIT;
+       const char *tmp;
+       int status;
+
+       strbuf_addf(&cmd, "perf-%s", argv[0]);
+
+       /*
+        * argv[0] must be the perf command, but the argv array
+        * belongs to the caller, and may be reused in
+        * subsequent loop iterations. Save argv[0] and
+        * restore it on error.
+        */
+       tmp = argv[0];
+       argv[0] = cmd.buf;
+
+       /*
+        * if we fail because the command is not found, it is
+        * OK to return. Otherwise, we just pass along the status code.
+        */
+       status = run_command_v_opt(argv, 0);
+       if (status != -ERR_RUN_COMMAND_EXEC) {
+               if (IS_RUN_COMMAND_ERR(status))
+                       die("unable to run '%s'", argv[0]);
+               exit(-status);
+       }
+       errno = ENOENT; /* as if we called execvp */
+
+       argv[0] = tmp;
+
+       strbuf_release(&cmd);
+}
+
+static int run_argv(int *argcp, const char ***argv)
+{
+       int done_alias = 0;
+
+       while (1) {
+               /* See if it's an internal command */
+               handle_internal_command(*argcp, *argv);
+
+               /* .. then try the external ones */
+               execv_dashed_external(*argv);
+
+               /* It could be an alias -- this works around the insanity
+                * of overriding "perf log" with "perf show" by having
+                * alias.log = show
+                */
+               if (done_alias || !handle_alias(argcp, argv))
+                       break;
+               done_alias = 1;
+       }
+
+       return done_alias;
+}
+
+
+int main(int argc, const char **argv)
+{
+       const char *cmd;
+
+       cmd = perf_extract_argv0_path(argv[0]);
+       if (!cmd)
+               cmd = "perf-help";
+
+       /*
+        * "perf-xxxx" is the same as "perf xxxx", but we obviously:
+        *
+        *  - cannot take flags in between the "perf" and the "xxxx".
+        *  - cannot execute it externally (since it would just do
+        *    the same thing over again)
+        *
+        * So we just directly call the internal command handler, and
+        * die if that one cannot handle it.
+        */
+       if (!prefixcmp(cmd, "perf-")) {
+               cmd += 5;
+               argv[0] = cmd;
+               handle_internal_command(argc, argv);
+               die("cannot handle %s internally", cmd);
+       }
+
+       /* Look for flags.. */
+       argv++;
+       argc--;
+       handle_options(&argv, &argc, NULL);
+       commit_pager_choice();
+       if (argc > 0) {
+               if (!prefixcmp(argv[0], "--"))
+                       argv[0] += 2;
+       } else {
+               /* The user didn't specify a command; give them help */
+               printf("\n usage: %s\n\n", perf_usage_string);
+               list_common_cmds_help();
+               printf("\n %s\n\n", perf_more_info_string);
+               exit(1);
+       }
+       cmd = argv[0];
+
+       /*
+        * We use PATH to find perf commands, but we prepend some higher
+        * precidence paths: the "--exec-path" option, the PERF_EXEC_PATH
+        * environment, and the $(perfexecdir) from the Makefile at build
+        * time.
+        */
+       setup_path();
+
+       while (1) {
+               static int done_help = 0;
+               static int was_alias = 0;
+
+               was_alias = run_argv(&argc, &argv);
+               if (errno != ENOENT)
+                       break;
+
+               if (was_alias) {
+                       fprintf(stderr, "Expansion of alias '%s' failed; "
+                               "'%s' is not a perf-command\n",
+                               cmd, argv[0]);
+                       exit(1);
+               }
+               if (!done_help) {
+                       cmd = argv[0] = help_unknown_cmd(cmd);
+                       done_help = 1;
+               } else
+                       break;
+       }
+
+       fprintf(stderr, "Failed to run command '%s': %s\n",
+               cmd, strerror(errno));
+
+       return 1;
+}
diff --git a/tools/perf/perf.h b/tools/perf/perf.h

new file mode 100644 (file)

index 0000000..af0a504
--- /dev/null
+++ b/tools/perf/perf.h
@@ -0,0 +1,67 @@
+#ifndef _PERF_PERF_H
+#define _PERF_PERF_H
+
+#if defined(__x86_64__) || defined(__i386__)
+#include "../../arch/x86/include/asm/unistd.h"
+#define rmb()          asm volatile("lfence" ::: "memory")
+#define cpu_relax()    asm volatile("rep; nop" ::: "memory");
+#endif
+
+#ifdef __powerpc__
+#include "../../arch/powerpc/include/asm/unistd.h"
+#define rmb()          asm volatile ("sync" ::: "memory")
+#define cpu_relax()    asm volatile ("" ::: "memory");
+#endif
+
+#include <time.h>
+#include <unistd.h>
+#include <sys/types.h>
+#include <sys/syscall.h>
+
+#include "../../include/linux/perf_counter.h"
+
+/*
+ * prctl(PR_TASK_PERF_COUNTERS_DISABLE) will (cheaply) disable all
+ * counters in the current task.
+ */
+#define PR_TASK_PERF_COUNTERS_DISABLE   31
+#define PR_TASK_PERF_COUNTERS_ENABLE    32
+
+#ifndef NSEC_PER_SEC
+# define NSEC_PER_SEC                  1000000000ULL
+#endif
+
+static inline unsigned long long rdclock(void)
+{
+       struct timespec ts;
+
+       clock_gettime(CLOCK_MONOTONIC, &ts);
+       return ts.tv_sec * 1000000000ULL + ts.tv_nsec;
+}
+
+/*
+ * Pick up some kernel type conventions:
+ */
+#define __user
+#define asmlinkage
+
+#define unlikely(x)    __builtin_expect(!!(x), 0)
+#define min(x, y) ({                           \
+       typeof(x) _min1 = (x);                  \
+       typeof(y) _min2 = (y);                  \
+       (void) (&_min1 == &_min2);              \
+       _min1 < _min2 ? _min1 : _min2; })
+
+static inline int
+sys_perf_counter_open(struct perf_counter_attr *attr_uptr,
+                     pid_t pid, int cpu, int group_fd,
+                     unsigned long flags)
+{
+       return syscall(__NR_perf_counter_open, attr_uptr, pid, cpu,
+                      group_fd, flags);
+}
+
+#define MAX_COUNTERS                   256
+#define MAX_NR_CPUS                    256
+
+#endif
diff --git a/tools/perf/util/PERF-VERSION-GEN b/tools/perf/util/PERF-VERSION-GEN

new file mode 100755 (executable)

index 0000000..c561d15
--- /dev/null
+++ b/tools/perf/util/PERF-VERSION-GEN
@@ -0,0 +1,42 @@
+#!/bin/sh
+
+GVF=PERF-VERSION-FILE
+DEF_VER=v0.0.1.PERF
+
+LF='
+'
+
+# First see if there is a version file (included in release tarballs),
+# then try git-describe, then default.
+if test -f version
+then
+       VN=$(cat version) || VN="$DEF_VER"
+elif test -d .git -o -f .git &&
+       VN=$(git describe --abbrev=4 HEAD 2>/dev/null) &&
+       case "$VN" in
+       *$LF*) (exit 1) ;;
+       v[0-9]*)
+               git update-index -q --refresh
+               test -z "$(git diff-index --name-only HEAD --)" ||
+               VN="$VN-dirty" ;;
+       esac
+then
+       VN=$(echo "$VN" | sed -e 's/-/./g');
+else
+       VN="$DEF_VER"
+fi
+
+VN=$(expr "$VN" : v*'\(.*\)')
+
+if test -r $GVF
+then
+       VC=$(sed -e 's/^PERF_VERSION = //' <$GVF)
+else
+       VC=unset
+fi
+test "$VN" = "$VC" || {
+       echo >&2 "PERF_VERSION = $VN"
+       echo "PERF_VERSION = $VN" >$GVF
+}
+
+
diff --git a/tools/perf/util/abspath.c b/tools/perf/util/abspath.c

new file mode 100644 (file)

index 0000000..61d33b8
--- /dev/null
+++ b/tools/perf/util/abspath.c
@@ -0,0 +1,117 @@
+#include "cache.h"
+
+/*
+ * Do not use this for inspecting *tracked* content.  When path is a
+ * symlink to a directory, we do not want to say it is a directory when
+ * dealing with tracked content in the working tree.
+ */
+static int is_directory(const char *path)
+{
+       struct stat st;
+       return (!stat(path, &st) && S_ISDIR(st.st_mode));
+}
+
+/* We allow "recursive" symbolic links. Only within reason, though. */
+#define MAXDEPTH 5
+
+const char *make_absolute_path(const char *path)
+{
+       static char bufs[2][PATH_MAX + 1], *buf = bufs[0], *next_buf = bufs[1];
+       char cwd[1024] = "";
+       int buf_index = 1, len;
+
+       int depth = MAXDEPTH;
+       char *last_elem = NULL;
+       struct stat st;
+
+       if (strlcpy(buf, path, PATH_MAX) >= PATH_MAX)
+               die ("Too long path: %.*s", 60, path);
+
+       while (depth--) {
+               if (!is_directory(buf)) {
+                       char *last_slash = strrchr(buf, '/');
+                       if (last_slash) {
+                               *last_slash = '\0';
+                               last_elem = xstrdup(last_slash + 1);
+                       } else {
+                               last_elem = xstrdup(buf);
+                               *buf = '\0';
+                       }
+               }
+
+               if (*buf) {
+                       if (!*cwd && !getcwd(cwd, sizeof(cwd)))
+                               die ("Could not get current working directory");
+
+                       if (chdir(buf))
+                               die ("Could not switch to '%s'", buf);
+               }
+               if (!getcwd(buf, PATH_MAX))
+                       die ("Could not get current working directory");
+
+               if (last_elem) {
+                       int len = strlen(buf);
+                       if (len + strlen(last_elem) + 2 > PATH_MAX)
+                               die ("Too long path name: '%s/%s'",
+                                               buf, last_elem);
+                       buf[len] = '/';
+                       strcpy(buf + len + 1, last_elem);
+                       free(last_elem);
+                       last_elem = NULL;
+               }
+
+               if (!lstat(buf, &st) && S_ISLNK(st.st_mode)) {
+                       len = readlink(buf, next_buf, PATH_MAX);
+                       if (len < 0)
+                               die ("Invalid symlink: %s", buf);
+                       if (PATH_MAX <= len)
+                               die("symbolic link too long: %s", buf);
+                       next_buf[len] = '\0';
+                       buf = next_buf;
+                       buf_index = 1 - buf_index;
+                       next_buf = bufs[buf_index];
+               } else
+                       break;
+       }
+
+       if (*cwd && chdir(cwd))
+               die ("Could not change back to '%s'", cwd);
+
+       return buf;
+}
+
+static const char *get_pwd_cwd(void)
+{
+       static char cwd[PATH_MAX + 1];
+       char *pwd;
+       struct stat cwd_stat, pwd_stat;
+       if (getcwd(cwd, PATH_MAX) == NULL)
+               return NULL;
+       pwd = getenv("PWD");
+       if (pwd && strcmp(pwd, cwd)) {
+               stat(cwd, &cwd_stat);
+               if (!stat(pwd, &pwd_stat) &&
+                   pwd_stat.st_dev == cwd_stat.st_dev &&
+                   pwd_stat.st_ino == cwd_stat.st_ino) {
+                       strlcpy(cwd, pwd, PATH_MAX);
+               }
+       }
+       return cwd;
+}
+
+const char *make_nonrelative_path(const char *path)
+{
+       static char buf[PATH_MAX + 1];
+
+       if (is_absolute_path(path)) {
+               if (strlcpy(buf, path, PATH_MAX) >= PATH_MAX)
+                       die("Too long path: %.*s", 60, path);
+       } else {
+               const char *cwd = get_pwd_cwd();
+               if (!cwd)
+                       die("Cannot determine the current working directory");
+               if (snprintf(buf, PATH_MAX, "%s/%s", cwd, path) >= PATH_MAX)
+                       die("Too long path: %.*s", 60, path);
+       }
+       return buf;
+}
diff --git a/tools/perf/util/alias.c b/tools/perf/util/alias.c

new file mode 100644 (file)

index 0000000..9b3dd2b
--- /dev/null
+++ b/tools/perf/util/alias.c
@@ -0,0 +1,77 @@
+#include "cache.h"
+
+static const char *alias_key;
+static char *alias_val;
+
+static int alias_lookup_cb(const char *k, const char *v, void *cb)
+{
+       if (!prefixcmp(k, "alias.") && !strcmp(k+6, alias_key)) {
+               if (!v)
+                       return config_error_nonbool(k);
+               alias_val = strdup(v);
+               return 0;
+       }
+       return 0;
+}
+
+char *alias_lookup(const char *alias)
+{
+       alias_key = alias;
+       alias_val = NULL;
+       perf_config(alias_lookup_cb, NULL);
+       return alias_val;
+}
+
+int split_cmdline(char *cmdline, const char ***argv)
+{
+       int src, dst, count = 0, size = 16;
+       char quoted = 0;
+
+       *argv = malloc(sizeof(char*) * size);
+
+       /* split alias_string */
+       (*argv)[count++] = cmdline;
+       for (src = dst = 0; cmdline[src];) {
+               char c = cmdline[src];
+               if (!quoted && isspace(c)) {
+                       cmdline[dst++] = 0;
+                       while (cmdline[++src]
+                                       && isspace(cmdline[src]))
+                               ; /* skip */
+                       if (count >= size) {
+                               size += 16;
+                               *argv = realloc(*argv, sizeof(char*) * size);
+                       }
+                       (*argv)[count++] = cmdline + dst;
+               } else if (!quoted && (c == '\'' || c == '"')) {
+                       quoted = c;
+                       src++;
+               } else if (c == quoted) {
+                       quoted = 0;
+                       src++;
+               } else {
+                       if (c == '\\' && quoted != '\'') {
+                               src++;
+                               c = cmdline[src];
+                               if (!c) {
+                                       free(*argv);
+                                       *argv = NULL;
+                                       return error("cmdline ends with \\");
+                               }
+                       }
+                       cmdline[dst++] = c;
+                       src++;
+               }
+       }
+
+       cmdline[dst] = 0;
+
+       if (quoted) {
+               free(*argv);
+               *argv = NULL;
+               return error("unclosed quote");
+       }
+
+       return count;
+}
+
diff --git a/tools/perf/util/cache.h b/tools/perf/util/cache.h

new file mode 100644 (file)

index 0000000..393d614
--- /dev/null
+++ b/tools/perf/util/cache.h
@@ -0,0 +1,119 @@
+#ifndef CACHE_H
+#define CACHE_H
+
+#include "util.h"
+#include "strbuf.h"
+
+#define PERF_DIR_ENVIRONMENT "PERF_DIR"
+#define PERF_WORK_TREE_ENVIRONMENT "PERF_WORK_TREE"
+#define DEFAULT_PERF_DIR_ENVIRONMENT ".perf"
+#define DB_ENVIRONMENT "PERF_OBJECT_DIRECTORY"
+#define INDEX_ENVIRONMENT "PERF_INDEX_FILE"
+#define GRAFT_ENVIRONMENT "PERF_GRAFT_FILE"
+#define TEMPLATE_DIR_ENVIRONMENT "PERF_TEMPLATE_DIR"
+#define CONFIG_ENVIRONMENT "PERF_CONFIG"
+#define EXEC_PATH_ENVIRONMENT "PERF_EXEC_PATH"
+#define CEILING_DIRECTORIES_ENVIRONMENT "PERF_CEILING_DIRECTORIES"
+#define PERFATTRIBUTES_FILE ".perfattributes"
+#define INFOATTRIBUTES_FILE "info/attributes"
+#define ATTRIBUTE_MACRO_PREFIX "[attr]"
+
+typedef int (*config_fn_t)(const char *, const char *, void *);
+extern int perf_default_config(const char *, const char *, void *);
+extern int perf_config_from_file(config_fn_t fn, const char *, void *);
+extern int perf_config(config_fn_t fn, void *);
+extern int perf_parse_ulong(const char *, unsigned long *);
+extern int perf_config_int(const char *, const char *);
+extern unsigned long perf_config_ulong(const char *, const char *);
+extern int perf_config_bool_or_int(const char *, const char *, int *);
+extern int perf_config_bool(const char *, const char *);
+extern int perf_config_string(const char **, const char *, const char *);
+extern int perf_config_set(const char *, const char *);
+extern int perf_config_set_multivar(const char *, const char *, const char *, int);
+extern int perf_config_rename_section(const char *, const char *);
+extern const char *perf_etc_perfconfig(void);
+extern int check_repository_format_version(const char *var, const char *value, void *cb);
+extern int perf_config_system(void);
+extern int perf_config_global(void);
+extern int config_error_nonbool(const char *);
+extern const char *config_exclusive_filename;
+
+#define MAX_PERFNAME (1000)
+extern char perf_default_email[MAX_PERFNAME];
+extern char perf_default_name[MAX_PERFNAME];
+extern int user_ident_explicitly_given;
+
+extern const char *perf_log_output_encoding;
+extern const char *perf_mailmap_file;
+
+/* IO helper functions */
+extern void maybe_flush_or_die(FILE *, const char *);
+extern int copy_fd(int ifd, int ofd);
+extern int copy_file(const char *dst, const char *src, int mode);
+extern ssize_t read_in_full(int fd, void *buf, size_t count);
+extern ssize_t write_in_full(int fd, const void *buf, size_t count);
+extern void write_or_die(int fd, const void *buf, size_t count);
+extern int write_or_whine(int fd, const void *buf, size_t count, const char *msg);
+extern int write_or_whine_pipe(int fd, const void *buf, size_t count, const char *msg);
+extern void fsync_or_die(int fd, const char *);
+
+/* pager.c */
+extern void setup_pager(void);
+extern const char *pager_program;
+extern int pager_in_use(void);
+extern int pager_use_color;
+
+extern const char *editor_program;
+extern const char *excludes_file;
+
+char *alias_lookup(const char *alias);
+int split_cmdline(char *cmdline, const char ***argv);
+
+#define alloc_nr(x) (((x)+16)*3/2)
+
+/*
+ * Realloc the buffer pointed at by variable 'x' so that it can hold
+ * at least 'nr' entries; the number of entries currently allocated
+ * is 'alloc', using the standard growing factor alloc_nr() macro.
+ *
+ * DO NOT USE any expression with side-effect for 'x' or 'alloc'.
+ */
+#define ALLOC_GROW(x, nr, alloc) \
+       do { \
+               if ((nr) > alloc) { \
+                       if (alloc_nr(alloc) < (nr)) \
+                               alloc = (nr); \
+                       else \
+                               alloc = alloc_nr(alloc); \
+                       x = xrealloc((x), alloc * sizeof(*(x))); \
+               } \
+       } while(0)
+
+
+static inline int is_absolute_path(const char *path)
+{
+       return path[0] == '/';
+}
+
+const char *make_absolute_path(const char *path);
+const char *make_nonrelative_path(const char *path);
+const char *make_relative_path(const char *abs, const char *base);
+int normalize_path_copy(char *dst, const char *src);
+int longest_ancestor_length(const char *path, const char *prefix_list);
+char *strip_path_suffix(const char *path, const char *suffix);
+
+extern char *mkpath(const char *fmt, ...) __attribute__((format (printf, 1, 2)));
+extern char *perf_path(const char *fmt, ...) __attribute__((format (printf, 1, 2)));
+/* perf_mkstemp() - create tmp file honoring TMPDIR variable */
+extern int perf_mkstemp(char *path, size_t len, const char *template);
+
+extern char *mksnpath(char *buf, size_t n, const char *fmt, ...)
+       __attribute__((format (printf, 3, 4)));
+extern char *perf_snpath(char *buf, size_t n, const char *fmt, ...)
+       __attribute__((format (printf, 3, 4)));
+extern char *perf_pathdup(const char *fmt, ...)
+       __attribute__((format (printf, 1, 2)));
+
+extern size_t strlcpy(char *dest, const char *src, size_t size);
+
+#endif /* CACHE_H */
diff --git a/tools/perf/util/color.c b/tools/perf/util/color.c

new file mode 100644 (file)

index 0000000..9a8c20c
--- /dev/null
+++ b/tools/perf/util/color.c
@@ -0,0 +1,241 @@
+#include "cache.h"
+#include "color.h"
+
+int perf_use_color_default = -1;
+
+static int parse_color(const char *name, int len)
+{
+       static const char * const color_names[] = {
+               "normal", "black", "red", "green", "yellow",
+               "blue", "magenta", "cyan", "white"
+       };
+       char *end;
+       int i;
+       for (i = 0; i < ARRAY_SIZE(color_names); i++) {
+               const char *str = color_names[i];
+               if (!strncasecmp(name, str, len) && !str[len])
+                       return i - 1;
+       }
+       i = strtol(name, &end, 10);
+       if (end - name == len && i >= -1 && i <= 255)
+               return i;
+       return -2;
+}
+
+static int parse_attr(const char *name, int len)
+{
+       static const int attr_values[] = { 1, 2, 4, 5, 7 };
+       static const char * const attr_names[] = {
+               "bold", "dim", "ul", "blink", "reverse"
+       };
+       int i;
+       for (i = 0; i < ARRAY_SIZE(attr_names); i++) {
+               const char *str = attr_names[i];
+               if (!strncasecmp(name, str, len) && !str[len])
+                       return attr_values[i];
+       }
+       return -1;
+}
+
+void color_parse(const char *value, const char *var, char *dst)
+{
+       color_parse_mem(value, strlen(value), var, dst);
+}
+
+void color_parse_mem(const char *value, int value_len, const char *var,
+               char *dst)
+{
+       const char *ptr = value;
+       int len = value_len;
+       int attr = -1;
+       int fg = -2;
+       int bg = -2;
+
+       if (!strncasecmp(value, "reset", len)) {
+               strcpy(dst, PERF_COLOR_RESET);
+               return;
+       }
+
+       /* [fg [bg]] [attr] */
+       while (len > 0) {
+               const char *word = ptr;
+               int val, wordlen = 0;
+
+               while (len > 0 && !isspace(word[wordlen])) {
+                       wordlen++;
+                       len--;
+               }
+
+               ptr = word + wordlen;
+               while (len > 0 && isspace(*ptr)) {
+                       ptr++;
+                       len--;
+               }
+
+               val = parse_color(word, wordlen);
+               if (val >= -1) {
+                       if (fg == -2) {
+                               fg = val;
+                               continue;
+                       }
+                       if (bg == -2) {
+                               bg = val;
+                               continue;
+                       }
+                       goto bad;
+               }
+               val = parse_attr(word, wordlen);
+               if (val < 0 || attr != -1)
+                       goto bad;
+               attr = val;
+       }
+
+       if (attr >= 0 || fg >= 0 || bg >= 0) {
+               int sep = 0;
+
+               *dst++ = '\033';
+               *dst++ = '[';
+               if (attr >= 0) {
+                       *dst++ = '0' + attr;
+                       sep++;
+               }
+               if (fg >= 0) {
+                       if (sep++)
+                               *dst++ = ';';
+                       if (fg < 8) {
+                               *dst++ = '3';
+                               *dst++ = '0' + fg;
+                       } else {
+                               dst += sprintf(dst, "38;5;%d", fg);
+                       }
+               }
+               if (bg >= 0) {
+                       if (sep++)
+                               *dst++ = ';';
+                       if (bg < 8) {
+                               *dst++ = '4';
+                               *dst++ = '0' + bg;
+                       } else {
+                               dst += sprintf(dst, "48;5;%d", bg);
+                       }
+               }
+               *dst++ = 'm';
+       }
+       *dst = 0;
+       return;
+bad:
+       die("bad color value '%.*s' for variable '%s'", value_len, value, var);
+}
+
+int perf_config_colorbool(const char *var, const char *value, int stdout_is_tty)
+{
+       if (value) {
+               if (!strcasecmp(value, "never"))
+                       return 0;
+               if (!strcasecmp(value, "always"))
+                       return 1;
+               if (!strcasecmp(value, "auto"))
+                       goto auto_color;
+       }
+
+       /* Missing or explicit false to turn off colorization */
+       if (!perf_config_bool(var, value))
+               return 0;
+
+       /* any normal truth value defaults to 'auto' */
+ auto_color:
+       if (stdout_is_tty < 0)
+               stdout_is_tty = isatty(1);
+       if (stdout_is_tty || (pager_in_use() && pager_use_color)) {
+               char *term = getenv("TERM");
+               if (term && strcmp(term, "dumb"))
+                       return 1;
+       }
+       return 0;
+}
+
+int perf_color_default_config(const char *var, const char *value, void *cb)
+{
+       if (!strcmp(var, "color.ui")) {
+               perf_use_color_default = perf_config_colorbool(var, value, -1);
+               return 0;
+       }
+
+       return perf_default_config(var, value, cb);
+}
+
+static int color_vfprintf(FILE *fp, const char *color, const char *fmt,
+               va_list args, const char *trail)
+{
+       int r = 0;
+
+       /*
+        * Auto-detect:
+        */
+       if (perf_use_color_default < 0) {
+               if (isatty(1) || pager_in_use())
+                       perf_use_color_default = 1;
+               else
+                       perf_use_color_default = 0;
+       }
+
+       if (perf_use_color_default && *color)
+               r += fprintf(fp, "%s", color);
+       r += vfprintf(fp, fmt, args);
+       if (perf_use_color_default && *color)
+               r += fprintf(fp, "%s", PERF_COLOR_RESET);
+       if (trail)
+               r += fprintf(fp, "%s", trail);
+       return r;
+}
+
+
+
+int color_fprintf(FILE *fp, const char *color, const char *fmt, ...)
+{
+       va_list args;
+       int r;
+
+       va_start(args, fmt);
+       r = color_vfprintf(fp, color, fmt, args, NULL);
+       va_end(args);
+       return r;
+}
+
+int color_fprintf_ln(FILE *fp, const char *color, const char *fmt, ...)
+{
+       va_list args;
+       int r;
+       va_start(args, fmt);
+       r = color_vfprintf(fp, color, fmt, args, "\n");
+       va_end(args);
+       return r;
+}
+
+/*
+ * This function splits the buffer by newlines and colors the lines individually.
+ *
+ * Returns 0 on success.
+ */
+int color_fwrite_lines(FILE *fp, const char *color,
+               size_t count, const char *buf)
+{
+       if (!*color)
+               return fwrite(buf, count, 1, fp) != 1;
+       while (count) {
+               char *p = memchr(buf, '\n', count);
+               if (p != buf && (fputs(color, fp) < 0 ||
+                               fwrite(buf, p ? p - buf : count, 1, fp) != 1 ||
+                               fputs(PERF_COLOR_RESET, fp) < 0))
+                       return -1;
+               if (!p)
+                       return 0;
+               if (fputc('\n', fp) < 0)
+                       return -1;
+               count -= p + 1 - buf;
+               buf = p + 1;
+       }
+       return 0;
+}
+
+
diff --git a/tools/perf/util/color.h b/tools/perf/util/color.h

new file mode 100644 (file)

index 0000000..5abfd37
--- /dev/null
+++ b/tools/perf/util/color.h
@@ -0,0 +1,36 @@
+#ifndef COLOR_H
+#define COLOR_H
+
+/* "\033[1;38;5;2xx;48;5;2xxm\0" is 23 bytes */
+#define COLOR_MAXLEN 24
+
+#define PERF_COLOR_NORMAL      ""
+#define PERF_COLOR_RESET       "\033[m"
+#define PERF_COLOR_BOLD                "\033[1m"
+#define PERF_COLOR_RED         "\033[31m"
+#define PERF_COLOR_GREEN       "\033[32m"
+#define PERF_COLOR_YELLOW      "\033[33m"
+#define PERF_COLOR_BLUE                "\033[34m"
+#define PERF_COLOR_MAGENTA     "\033[35m"
+#define PERF_COLOR_CYAN                "\033[36m"
+#define PERF_COLOR_BG_RED      "\033[41m"
+
+/*
+ * This variable stores the value of color.ui
+ */
+extern int perf_use_color_default;
+
+
+/*
+ * Use this instead of perf_default_config if you need the value of color.ui.
+ */
+int perf_color_default_config(const char *var, const char *value, void *cb);
+
+int perf_config_colorbool(const char *var, const char *value, int stdout_is_tty);
+void color_parse(const char *value, const char *var, char *dst);
+void color_parse_mem(const char *value, int len, const char *var, char *dst);
+int color_fprintf(FILE *fp, const char *color, const char *fmt, ...);
+int color_fprintf_ln(FILE *fp, const char *color, const char *fmt, ...);
+int color_fwrite_lines(FILE *fp, const char *color, size_t count, const char *buf);
+
+#endif /* COLOR_H */
diff --git a/tools/perf/util/config.c b/tools/perf/util/config.c

new file mode 100644 (file)

index 0000000..3dd13fa
--- /dev/null
+++ b/tools/perf/util/config.c
@@ -0,0 +1,873 @@
+/*
+ * GIT - The information manager from hell
+ *
+ * Copyright (C) Linus Torvalds, 2005
+ * Copyright (C) Johannes Schindelin, 2005
+ *
+ */
+#include "util.h"
+#include "cache.h"
+#include "exec_cmd.h"
+
+#define MAXNAME (256)
+
+static FILE *config_file;
+static const char *config_file_name;
+static int config_linenr;
+static int config_file_eof;
+
+const char *config_exclusive_filename = NULL;
+
+static int get_next_char(void)
+{
+       int c;
+       FILE *f;
+
+       c = '\n';
+       if ((f = config_file) != NULL) {
+               c = fgetc(f);
+               if (c == '\r') {
+                       /* DOS like systems */
+                       c = fgetc(f);
+                       if (c != '\n') {
+                               ungetc(c, f);
+                               c = '\r';
+                       }
+               }
+               if (c == '\n')
+                       config_linenr++;
+               if (c == EOF) {
+                       config_file_eof = 1;
+                       c = '\n';
+               }
+       }
+       return c;
+}
+
+static char *parse_value(void)
+{
+       static char value[1024];
+       int quote = 0, comment = 0, len = 0, space = 0;
+
+       for (;;) {
+               int c = get_next_char();
+               if (len >= sizeof(value) - 1)
+                       return NULL;
+               if (c == '\n') {
+                       if (quote)
+                               return NULL;
+                       value[len] = 0;
+                       return value;
+               }
+               if (comment)
+                       continue;
+               if (isspace(c) && !quote) {
+                       space = 1;
+                       continue;
+               }
+               if (!quote) {
+                       if (c == ';' || c == '#') {
+                               comment = 1;
+                               continue;
+                       }
+               }
+               if (space) {
+                       if (len)
+                               value[len++] = ' ';
+                       space = 0;
+               }
+               if (c == '\\') {
+                       c = get_next_char();
+                       switch (c) {
+                       case '\n':
+                               continue;
+                       case 't':
+                               c = '\t';
+                               break;
+                       case 'b':
+                               c = '\b';
+                               break;
+                       case 'n':
+                               c = '\n';
+                               break;
+                       /* Some characters escape as themselves */
+                       case '\\': case '"':
+                               break;
+                       /* Reject unknown escape sequences */
+                       default:
+                               return NULL;
+                       }
+                       value[len++] = c;
+                       continue;
+               }
+               if (c == '"') {
+                       quote = 1-quote;
+                       continue;
+               }
+               value[len++] = c;
+       }
+}
+
+static inline int iskeychar(int c)
+{
+       return isalnum(c) || c == '-';
+}
+
+static int get_value(config_fn_t fn, void *data, char *name, unsigned int len)
+{
+       int c;
+       char *value;
+
+       /* Get the full name */
+       for (;;) {
+               c = get_next_char();
+               if (config_file_eof)
+                       break;
+               if (!iskeychar(c))
+                       break;
+               name[len++] = tolower(c);
+               if (len >= MAXNAME)
+                       return -1;
+       }
+       name[len] = 0;
+       while (c == ' ' || c == '\t')
+               c = get_next_char();
+
+       value = NULL;
+       if (c != '\n') {
+               if (c != '=')
+                       return -1;
+               value = parse_value();
+               if (!value)
+                       return -1;
+       }
+       return fn(name, value, data);
+}
+
+static int get_extended_base_var(char *name, int baselen, int c)
+{
+       do {
+               if (c == '\n')
+                       return -1;
+               c = get_next_char();
+       } while (isspace(c));
+
+       /* We require the format to be '[base "extension"]' */
+       if (c != '"')
+               return -1;
+       name[baselen++] = '.';
+
+       for (;;) {
+               int c = get_next_char();
+               if (c == '\n')
+                       return -1;
+               if (c == '"')
+                       break;
+               if (c == '\\') {
+                       c = get_next_char();
+                       if (c == '\n')
+                               return -1;
+               }
+               name[baselen++] = c;
+               if (baselen > MAXNAME / 2)
+                       return -1;
+       }
+
+       /* Final ']' */
+       if (get_next_char() != ']')
+               return -1;
+       return baselen;
+}
+
+static int get_base_var(char *name)
+{
+       int baselen = 0;
+
+       for (;;) {
+               int c = get_next_char();
+               if (config_file_eof)
+                       return -1;
+               if (c == ']')
+                       return baselen;
+               if (isspace(c))
+                       return get_extended_base_var(name, baselen, c);
+               if (!iskeychar(c) && c != '.')
+                       return -1;
+               if (baselen > MAXNAME / 2)
+                       return -1;
+               name[baselen++] = tolower(c);
+       }
+}
+
+static int perf_parse_file(config_fn_t fn, void *data)
+{
+       int comment = 0;
+       int baselen = 0;
+       static char var[MAXNAME];
+
+       /* U+FEFF Byte Order Mark in UTF8 */
+       static const unsigned char *utf8_bom = (unsigned char *) "\xef\xbb\xbf";
+       const unsigned char *bomptr = utf8_bom;
+
+       for (;;) {
+               int c = get_next_char();
+               if (bomptr && *bomptr) {
+                       /* We are at the file beginning; skip UTF8-encoded BOM
+                        * if present. Sane editors won't put this in on their
+                        * own, but e.g. Windows Notepad will do it happily. */
+                       if ((unsigned char) c == *bomptr) {
+                               bomptr++;
+                               continue;
+                       } else {
+                               /* Do not tolerate partial BOM. */
+                               if (bomptr != utf8_bom)
+                                       break;
+                               /* No BOM at file beginning. Cool. */
+                               bomptr = NULL;
+                       }
+               }
+               if (c == '\n') {
+                       if (config_file_eof)
+                               return 0;
+                       comment = 0;
+                       continue;
+               }
+               if (comment || isspace(c))
+                       continue;
+               if (c == '#' || c == ';') {
+                       comment = 1;
+                       continue;
+               }
+               if (c == '[') {
+                       baselen = get_base_var(var);
+                       if (baselen <= 0)
+                               break;
+                       var[baselen++] = '.';
+                       var[baselen] = 0;
+                       continue;
+               }
+               if (!isalpha(c))
+                       break;
+               var[baselen] = tolower(c);
+               if (get_value(fn, data, var, baselen+1) < 0)
+                       break;
+       }
+       die("bad config file line %d in %s", config_linenr, config_file_name);
+}
+
+static int parse_unit_factor(const char *end, unsigned long *val)
+{
+       if (!*end)
+               return 1;
+       else if (!strcasecmp(end, "k")) {
+               *val *= 1024;
+               return 1;
+       }
+       else if (!strcasecmp(end, "m")) {
+               *val *= 1024 * 1024;
+               return 1;
+       }
+       else if (!strcasecmp(end, "g")) {
+               *val *= 1024 * 1024 * 1024;
+               return 1;
+       }
+       return 0;
+}
+
+static int perf_parse_long(const char *value, long *ret)
+{
+       if (value && *value) {
+               char *end;
+               long val = strtol(value, &end, 0);
+               unsigned long factor = 1;
+               if (!parse_unit_factor(end, &factor))
+                       return 0;
+               *ret = val * factor;
+               return 1;
+       }
+       return 0;
+}
+
+int perf_parse_ulong(const char *value, unsigned long *ret)
+{
+       if (value && *value) {
+               char *end;
+               unsigned long val = strtoul(value, &end, 0);
+               if (!parse_unit_factor(end, &val))
+                       return 0;
+               *ret = val;
+               return 1;
+       }
+       return 0;
+}
+
+static void die_bad_config(const char *name)
+{
+       if (config_file_name)
+               die("bad config value for '%s' in %s", name, config_file_name);
+       die("bad config value for '%s'", name);
+}
+
+int perf_config_int(const char *name, const char *value)
+{
+       long ret = 0;
+       if (!perf_parse_long(value, &ret))
+               die_bad_config(name);
+       return ret;
+}
+
+unsigned long perf_config_ulong(const char *name, const char *value)
+{
+       unsigned long ret;
+       if (!perf_parse_ulong(value, &ret))
+               die_bad_config(name);
+       return ret;
+}
+
+int perf_config_bool_or_int(const char *name, const char *value, int *is_bool)
+{
+       *is_bool = 1;
+       if (!value)
+               return 1;
+       if (!*value)
+               return 0;
+       if (!strcasecmp(value, "true") || !strcasecmp(value, "yes") || !strcasecmp(value, "on"))
+               return 1;
+       if (!strcasecmp(value, "false") || !strcasecmp(value, "no") || !strcasecmp(value, "off"))
+               return 0;
+       *is_bool = 0;
+       return perf_config_int(name, value);
+}
+
+int perf_config_bool(const char *name, const char *value)
+{
+       int discard;
+       return !!perf_config_bool_or_int(name, value, &discard);
+}
+
+int perf_config_string(const char **dest, const char *var, const char *value)
+{
+       if (!value)
+               return config_error_nonbool(var);
+       *dest = strdup(value);
+       return 0;
+}
+
+static int perf_default_core_config(const char *var, const char *value)
+{
+       /* Add other config variables here and to Documentation/config.txt. */
+       return 0;
+}
+
+int perf_default_config(const char *var, const char *value, void *dummy)
+{
+       if (!prefixcmp(var, "core."))
+               return perf_default_core_config(var, value);
+
+       /* Add other config variables here and to Documentation/config.txt. */
+       return 0;
+}
+
+int perf_config_from_file(config_fn_t fn, const char *filename, void *data)
+{
+       int ret;
+       FILE *f = fopen(filename, "r");
+
+       ret = -1;
+       if (f) {
+               config_file = f;
+               config_file_name = filename;
+               config_linenr = 1;
+               config_file_eof = 0;
+               ret = perf_parse_file(fn, data);
+               fclose(f);
+               config_file_name = NULL;
+       }
+       return ret;
+}
+
+const char *perf_etc_perfconfig(void)
+{
+       static const char *system_wide;
+       if (!system_wide)
+               system_wide = system_path(ETC_PERFCONFIG);
+       return system_wide;
+}
+
+static int perf_env_bool(const char *k, int def)
+{
+       const char *v = getenv(k);
+       return v ? perf_config_bool(k, v) : def;
+}
+
+int perf_config_system(void)
+{
+       return !perf_env_bool("PERF_CONFIG_NOSYSTEM", 0);
+}
+
+int perf_config_global(void)
+{
+       return !perf_env_bool("PERF_CONFIG_NOGLOBAL", 0);
+}
+
+int perf_config(config_fn_t fn, void *data)
+{
+       int ret = 0, found = 0;
+       char *repo_config = NULL;
+       const char *home = NULL;
+
+       /* Setting $PERF_CONFIG makes perf read _only_ the given config file. */
+       if (config_exclusive_filename)
+               return perf_config_from_file(fn, config_exclusive_filename, data);
+       if (perf_config_system() && !access(perf_etc_perfconfig(), R_OK)) {
+               ret += perf_config_from_file(fn, perf_etc_perfconfig(),
+                                           data);
+               found += 1;
+       }
+
+       home = getenv("HOME");
+       if (perf_config_global() && home) {
+               char *user_config = strdup(mkpath("%s/.perfconfig", home));
+               if (!access(user_config, R_OK)) {
+                       ret += perf_config_from_file(fn, user_config, data);
+                       found += 1;
+               }
+               free(user_config);
+       }
+
+       repo_config = perf_pathdup("config");
+       if (!access(repo_config, R_OK)) {
+               ret += perf_config_from_file(fn, repo_config, data);
+               found += 1;
+       }
+       free(repo_config);
+       if (found == 0)
+               return -1;
+       return ret;
+}
+
+/*
+ * Find all the stuff for perf_config_set() below.
+ */
+
+#define MAX_MATCHES 512
+
+static struct {
+       int baselen;
+       char* key;
+       int do_not_match;
+       regex_t* value_regex;
+       int multi_replace;
+       size_t offset[MAX_MATCHES];
+       enum { START, SECTION_SEEN, SECTION_END_SEEN, KEY_SEEN } state;
+       int seen;
+} store;
+
+static int matches(const char* key, const char* value)
+{
+       return !strcmp(key, store.key) &&
+               (store.value_regex == NULL ||
+                (store.do_not_match ^
+                 !regexec(store.value_regex, value, 0, NULL, 0)));
+}
+
+static int store_aux(const char* key, const char* value, void *cb)
+{
+       const char *ep;
+       size_t section_len;
+
+       switch (store.state) {
+       case KEY_SEEN:
+               if (matches(key, value)) {
+                       if (store.seen == 1 && store.multi_replace == 0) {
+                               warning("%s has multiple values", key);
+                       } else if (store.seen >= MAX_MATCHES) {
+                               error("too many matches for %s", key);
+                               return 1;
+                       }
+
+                       store.offset[store.seen] = ftell(config_file);
+                       store.seen++;
+               }
+               break;
+       case SECTION_SEEN:
+               /*
+                * What we are looking for is in store.key (both
+                * section and var), and its section part is baselen
+                * long.  We found key (again, both section and var).
+                * We would want to know if this key is in the same
+                * section as what we are looking for.  We already
+                * know we are in the same section as what should
+                * hold store.key.
+                */
+               ep = strrchr(key, '.');
+               section_len = ep - key;
+
+               if ((section_len != store.baselen) ||
+                   memcmp(key, store.key, section_len+1)) {
+                       store.state = SECTION_END_SEEN;
+                       break;
+               }
+
+               /*
+                * Do not increment matches: this is no match, but we
+                * just made sure we are in the desired section.
+                */
+               store.offset[store.seen] = ftell(config_file);
+               /* fallthru */
+       case SECTION_END_SEEN:
+       case START:
+               if (matches(key, value)) {
+                       store.offset[store.seen] = ftell(config_file);
+                       store.state = KEY_SEEN;
+                       store.seen++;
+               } else {
+                       if (strrchr(key, '.') - key == store.baselen &&
+                             !strncmp(key, store.key, store.baselen)) {
+                                       store.state = SECTION_SEEN;
+                                       store.offset[store.seen] = ftell(config_file);
+                       }
+               }
+       }
+       return 0;
+}
+
+static int store_write_section(int fd, const char* key)
+{
+       const char *dot;
+       int i, success;
+       struct strbuf sb = STRBUF_INIT;
+
+       dot = memchr(key, '.', store.baselen);
+       if (dot) {
+               strbuf_addf(&sb, "[%.*s \"", (int)(dot - key), key);
+               for (i = dot - key + 1; i < store.baselen; i++) {
+                       if (key[i] == '"' || key[i] == '\\')
+                               strbuf_addch(&sb, '\\');
+                       strbuf_addch(&sb, key[i]);
+               }
+               strbuf_addstr(&sb, "\"]\n");
+       } else {
+               strbuf_addf(&sb, "[%.*s]\n", store.baselen, key);
+       }
+
+       success = write_in_full(fd, sb.buf, sb.len) == sb.len;
+       strbuf_release(&sb);
+
+       return success;
+}
+
+static int store_write_pair(int fd, const char* key, const char* value)
+{
+       int i, success;
+       int length = strlen(key + store.baselen + 1);
+       const char *quote = "";
+       struct strbuf sb = STRBUF_INIT;
+
+       /*
+        * Check to see if the value needs to be surrounded with a dq pair.
+        * Note that problematic characters are always backslash-quoted; this
+        * check is about not losing leading or trailing SP and strings that
+        * follow beginning-of-comment characters (i.e. ';' and '#') by the
+        * configuration parser.
+        */
+       if (value[0] == ' ')
+               quote = "\"";
+       for (i = 0; value[i]; i++)
+               if (value[i] == ';' || value[i] == '#')
+                       quote = "\"";
+       if (i && value[i - 1] == ' ')
+               quote = "\"";
+
+       strbuf_addf(&sb, "\t%.*s = %s",
+                   length, key + store.baselen + 1, quote);
+
+       for (i = 0; value[i]; i++)
+               switch (value[i]) {
+               case '\n':
+                       strbuf_addstr(&sb, "\\n");
+                       break;
+               case '\t':
+                       strbuf_addstr(&sb, "\\t");
+                       break;
+               case '"':
+               case '\\':
+                       strbuf_addch(&sb, '\\');
+               default:
+                       strbuf_addch(&sb, value[i]);
+                       break;
+               }
+       strbuf_addf(&sb, "%s\n", quote);
+
+       success = write_in_full(fd, sb.buf, sb.len) == sb.len;
+       strbuf_release(&sb);
+
+       return success;
+}
+
+static ssize_t find_beginning_of_line(const char* contents, size_t size,
+       size_t offset_, int* found_bracket)
+{
+       size_t equal_offset = size, bracket_offset = size;
+       ssize_t offset;
+
+contline:
+       for (offset = offset_-2; offset > 0
+                       && contents[offset] != '\n'; offset--)
+               switch (contents[offset]) {
+                       case '=': equal_offset = offset; break;
+                       case ']': bracket_offset = offset; break;
+               }
+       if (offset > 0 && contents[offset-1] == '\\') {
+               offset_ = offset;
+               goto contline;
+       }
+       if (bracket_offset < equal_offset) {
+               *found_bracket = 1;
+               offset = bracket_offset+1;
+       } else
+               offset++;
+
+       return offset;
+}
+
+int perf_config_set(const char* key, const char* value)
+{
+       return perf_config_set_multivar(key, value, NULL, 0);
+}
+
+/*
+ * If value==NULL, unset in (remove from) config,
+ * if value_regex!=NULL, disregard key/value pairs where value does not match.
+ * if multi_replace==0, nothing, or only one matching key/value is replaced,
+ *     else all matching key/values (regardless how many) are removed,
+ *     before the new pair is written.
+ *
+ * Returns 0 on success.
+ *
+ * This function does this:
+ *
+ * - it locks the config file by creating ".perf/config.lock"
+ *
+ * - it then parses the config using store_aux() as validator to find
+ *   the position on the key/value pair to replace. If it is to be unset,
+ *   it must be found exactly once.
+ *
+ * - the config file is mmap()ed and the part before the match (if any) is
+ *   written to the lock file, then the changed part and the rest.
+ *
+ * - the config file is removed and the lock file rename()d to it.
+ *
+ */
+int perf_config_set_multivar(const char* key, const char* value,
+       const char* value_regex, int multi_replace)
+{
+       int i, dot;
+       int fd = -1, in_fd;
+       int ret = 0;
+       char* config_filename;
+       const char* last_dot = strrchr(key, '.');
+
+       if (config_exclusive_filename)
+               config_filename = strdup(config_exclusive_filename);
+       else
+               config_filename = perf_pathdup("config");
+
+       /*
+        * Since "key" actually contains the section name and the real
+        * key name separated by a dot, we have to know where the dot is.
+        */
+
+       if (last_dot == NULL) {
+               error("key does not contain a section: %s", key);
+               ret = 2;
+               goto out_free;
+       }
+       store.baselen = last_dot - key;
+
+       store.multi_replace = multi_replace;
+
+       /*
+        * Validate the key and while at it, lower case it for matching.
+        */
+       store.key = malloc(strlen(key) + 1);
+       dot = 0;
+       for (i = 0; key[i]; i++) {
+               unsigned char c = key[i];
+               if (c == '.')
+                       dot = 1;
+               /* Leave the extended basename untouched.. */
+               if (!dot || i > store.baselen) {
+                       if (!iskeychar(c) || (i == store.baselen+1 && !isalpha(c))) {
+                               error("invalid key: %s", key);
+                               free(store.key);
+                               ret = 1;
+                               goto out_free;
+                       }
+                       c = tolower(c);
+               } else if (c == '\n') {
+                       error("invalid key (newline): %s", key);
+                       free(store.key);
+                       ret = 1;
+                       goto out_free;
+               }
+               store.key[i] = c;
+       }
+       store.key[i] = 0;
+
+       /*
+        * If .perf/config does not exist yet, write a minimal version.
+        */
+       in_fd = open(config_filename, O_RDONLY);
+       if ( in_fd < 0 ) {
+               free(store.key);
+
+               if ( ENOENT != errno ) {
+                       error("opening %s: %s", config_filename,
+                             strerror(errno));
+                       ret = 3; /* same as "invalid config file" */
+                       goto out_free;
+               }
+               /* if nothing to unset, error out */
+               if (value == NULL) {
+                       ret = 5;
+                       goto out_free;
+               }
+
+               store.key = (char*)key;
+               if (!store_write_section(fd, key) ||
+                   !store_write_pair(fd, key, value))
+                       goto write_err_out;
+       } else {
+               struct stat st;
+               char* contents;
+               size_t contents_sz, copy_begin, copy_end;
+               int i, new_line = 0;
+
+               if (value_regex == NULL)
+                       store.value_regex = NULL;
+               else {
+                       if (value_regex[0] == '!') {
+                               store.do_not_match = 1;
+                               value_regex++;
+                       } else
+                               store.do_not_match = 0;
+
+                       store.value_regex = (regex_t*)malloc(sizeof(regex_t));
+                       if (regcomp(store.value_regex, value_regex,
+                                       REG_EXTENDED)) {
+                               error("invalid pattern: %s", value_regex);
+                               free(store.value_regex);
+                               ret = 6;
+                               goto out_free;
+                       }
+               }
+
+               store.offset[0] = 0;
+               store.state = START;
+               store.seen = 0;
+
+               /*
+                * After this, store.offset will contain the *end* offset
+                * of the last match, or remain at 0 if no match was found.
+                * As a side effect, we make sure to transform only a valid
+                * existing config file.
+                */
+               if (perf_config_from_file(store_aux, config_filename, NULL)) {
+                       error("invalid config file %s", config_filename);
+                       free(store.key);
+                       if (store.value_regex != NULL) {
+                               regfree(store.value_regex);
+                               free(store.value_regex);
+                       }
+                       ret = 3;
+                       goto out_free;
+               }
+
+               free(store.key);
+               if (store.value_regex != NULL) {
+                       regfree(store.value_regex);
+                       free(store.value_regex);
+               }
+
+               /* if nothing to unset, or too many matches, error out */
+               if ((store.seen == 0 && value == NULL) ||
+                               (store.seen > 1 && multi_replace == 0)) {
+                       ret = 5;
+                       goto out_free;
+               }
+
+               fstat(in_fd, &st);
+               contents_sz = xsize_t(st.st_size);
+               contents = mmap(NULL, contents_sz, PROT_READ,
+                       MAP_PRIVATE, in_fd, 0);
+               close(in_fd);
+
+               if (store.seen == 0)
+                       store.seen = 1;
+
+               for (i = 0, copy_begin = 0; i < store.seen; i++) {
+                       if (store.offset[i] == 0) {
+                               store.offset[i] = copy_end = contents_sz;
+                       } else if (store.state != KEY_SEEN) {
+                               copy_end = store.offset[i];
+                       } else
+                               copy_end = find_beginning_of_line(
+                                       contents, contents_sz,
+                                       store.offset[i]-2, &new_line);
+
+                       if (copy_end > 0 && contents[copy_end-1] != '\n')
+                               new_line = 1;
+
+                       /* write the first part of the config */
+                       if (copy_end > copy_begin) {
+                               if (write_in_full(fd, contents + copy_begin,
+                                                 copy_end - copy_begin) <
+                                   copy_end - copy_begin)
+                                       goto write_err_out;
+                               if (new_line &&
+                                   write_in_full(fd, "\n", 1) != 1)
+                                       goto write_err_out;
+                       }
+                       copy_begin = store.offset[i];
+               }
+
+               /* write the pair (value == NULL means unset) */
+               if (value != NULL) {
+                       if (store.state == START) {
+                               if (!store_write_section(fd, key))
+                                       goto write_err_out;
+                       }
+                       if (!store_write_pair(fd, key, value))
+                               goto write_err_out;
+               }
+
+               /* write the rest of the config */
+               if (copy_begin < contents_sz)
+                       if (write_in_full(fd, contents + copy_begin,
+                                         contents_sz - copy_begin) <
+                           contents_sz - copy_begin)
+                               goto write_err_out;
+
+               munmap(contents, contents_sz);
+       }
+
+       ret = 0;
+
+out_free:
+       free(config_filename);
+       return ret;
+
+write_err_out:
+       goto out_free;
+
+}
+
+/*
+ * Call this to report error for your variable that should not
+ * get a boolean value (i.e. "[my] var" means "true").
+ */
+int config_error_nonbool(const char *var)
+{
+       return error("Missing value for '%s'", var);
+}
diff --git a/tools/perf/util/ctype.c b/tools/perf/util/ctype.c

new file mode 100644 (file)

index 0000000..b90ec00
--- /dev/null
+++ b/tools/perf/util/ctype.c
@@ -0,0 +1,26 @@
+/*
+ * Sane locale-independent, ASCII ctype.
+ *
+ * No surprises, and works with signed and unsigned chars.
+ */
+#include "cache.h"
+
+enum {
+       S = GIT_SPACE,
+       A = GIT_ALPHA,
+       D = GIT_DIGIT,
+       G = GIT_GLOB_SPECIAL,   /* *, ?, [, \\ */
+       R = GIT_REGEX_SPECIAL,  /* $, (, ), +, ., ^, {, | * */
+};
+
+unsigned char sane_ctype[256] = {
+       0, 0, 0, 0, 0, 0, 0, 0, 0, S, S, 0, 0, S, 0, 0,         /*   0.. 15 */
+       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,         /*  16.. 31 */
+       S, 0, 0, 0, R, 0, 0, 0, R, R, G, R, 0, 0, R, 0,         /*  32.. 47 */
+       D, D, D, D, D, D, D, D, D, D, 0, 0, 0, 0, 0, G,         /*  48.. 63 */
+       0, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,         /*  64.. 79 */
+       A, A, A, A, A, A, A, A, A, A, A, G, G, 0, R, 0,         /*  80.. 95 */
+       0, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,         /*  96..111 */
+       A, A, A, A, A, A, A, A, A, A, A, R, R, 0, 0, 0,         /* 112..127 */
+       /* Nothing in the 128.. range */
+};
diff --git a/tools/perf/util/environment.c b/tools/perf/util/environment.c

new file mode 100644 (file)

index 0000000..275b0ee
--- /dev/null
+++ b/tools/perf/util/environment.c
@@ -0,0 +1,9 @@
+/*
+ * We put all the perf config variables in this same object
+ * file, so that programs can link against the config parser
+ * without having to link against all the rest of perf.
+ */
+#include "cache.h"
+
+const char *pager_program;
+int pager_use_color = 1;
diff --git a/tools/perf/util/exec_cmd.c b/tools/perf/util/exec_cmd.c

new file mode 100644 (file)

index 0000000..d392922
--- /dev/null
+++ b/tools/perf/util/exec_cmd.c
@@ -0,0 +1,165 @@
+#include "cache.h"
+#include "exec_cmd.h"
+#include "quote.h"
+#define MAX_ARGS       32
+
+extern char **environ;
+static const char *argv_exec_path;
+static const char *argv0_path;
+
+const char *system_path(const char *path)
+{
+#ifdef RUNTIME_PREFIX
+       static const char *prefix;
+#else
+       static const char *prefix = PREFIX;
+#endif
+       struct strbuf d = STRBUF_INIT;
+
+       if (is_absolute_path(path))
+               return path;
+
+#ifdef RUNTIME_PREFIX
+       assert(argv0_path);
+       assert(is_absolute_path(argv0_path));
+
+       if (!prefix &&
+           !(prefix = strip_path_suffix(argv0_path, PERF_EXEC_PATH)) &&
+           !(prefix = strip_path_suffix(argv0_path, BINDIR)) &&
+           !(prefix = strip_path_suffix(argv0_path, "perf"))) {
+               prefix = PREFIX;
+               fprintf(stderr, "RUNTIME_PREFIX requested, "
+                               "but prefix computation failed.  "
+                               "Using static fallback '%s'.\n", prefix);
+       }
+#endif
+
+       strbuf_addf(&d, "%s/%s", prefix, path);
+       path = strbuf_detach(&d, NULL);
+       return path;
+}
+
+const char *perf_extract_argv0_path(const char *argv0)
+{
+       const char *slash;
+
+       if (!argv0 || !*argv0)
+               return NULL;
+       slash = argv0 + strlen(argv0);
+
+       while (argv0 <= slash && !is_dir_sep(*slash))
+               slash--;
+
+       if (slash >= argv0) {
+               argv0_path = strndup(argv0, slash - argv0);
+               return slash + 1;
+       }
+
+       return argv0;
+}
+
+void perf_set_argv_exec_path(const char *exec_path)
+{
+       argv_exec_path = exec_path;
+       /*
+        * Propagate this setting to external programs.
+        */
+       setenv(EXEC_PATH_ENVIRONMENT, exec_path, 1);
+}
+
+
+/* Returns the highest-priority, location to look for perf programs. */
+const char *perf_exec_path(void)
+{
+       const char *env;
+
+       if (argv_exec_path)
+               return argv_exec_path;
+
+       env = getenv(EXEC_PATH_ENVIRONMENT);
+       if (env && *env) {
+               return env;
+       }
+
+       return system_path(PERF_EXEC_PATH);
+}
+
+static void add_path(struct strbuf *out, const char *path)
+{
+       if (path && *path) {
+               if (is_absolute_path(path))
+                       strbuf_addstr(out, path);
+               else
+                       strbuf_addstr(out, make_nonrelative_path(path));
+
+               strbuf_addch(out, PATH_SEP);
+       }
+}
+
+void setup_path(void)
+{
+       const char *old_path = getenv("PATH");
+       struct strbuf new_path = STRBUF_INIT;
+
+       add_path(&new_path, perf_exec_path());
+       add_path(&new_path, argv0_path);
+
+       if (old_path)
+               strbuf_addstr(&new_path, old_path);
+       else
+               strbuf_addstr(&new_path, "/usr/local/bin:/usr/bin:/bin");
+
+       setenv("PATH", new_path.buf, 1);
+
+       strbuf_release(&new_path);
+}
+
+const char **prepare_perf_cmd(const char **argv)
+{
+       int argc;
+       const char **nargv;
+
+       for (argc = 0; argv[argc]; argc++)
+               ; /* just counting */
+       nargv = malloc(sizeof(*nargv) * (argc + 2));
+
+       nargv[0] = "perf";
+       for (argc = 0; argv[argc]; argc++)
+               nargv[argc + 1] = argv[argc];
+       nargv[argc + 1] = NULL;
+       return nargv;
+}
+
+int execv_perf_cmd(const char **argv) {
+       const char **nargv = prepare_perf_cmd(argv);
+
+       /* execvp() can only ever return if it fails */
+       execvp("perf", (char **)nargv);
+
+       free(nargv);
+       return -1;
+}
+
+
+int execl_perf_cmd(const char *cmd,...)
+{
+       int argc;
+       const char *argv[MAX_ARGS + 1];
+       const char *arg;
+       va_list param;
+
+       va_start(param, cmd);
+       argv[0] = cmd;
+       argc = 1;
+       while (argc < MAX_ARGS) {
+               arg = argv[argc++] = va_arg(param, char *);
+               if (!arg)
+                       break;
+       }
+       va_end(param);
+       if (MAX_ARGS <= argc)
+               return error("too many args to run %s", cmd);
+
+       argv[argc] = NULL;
+       return execv_perf_cmd(argv);
+}
diff --git a/tools/perf/util/exec_cmd.h b/tools/perf/util/exec_cmd.h

new file mode 100644 (file)

index 0000000..effe25e
--- /dev/null
+++ b/tools/perf/util/exec_cmd.h
@@ -0,0 +1,13 @@
+#ifndef PERF_EXEC_CMD_H
+#define PERF_EXEC_CMD_H
+
+extern void perf_set_argv_exec_path(const char *exec_path);
+extern const char *perf_extract_argv0_path(const char *path);
+extern const char *perf_exec_path(void);
+extern void setup_path(void);
+extern const char **prepare_perf_cmd(const char **argv);
+extern int execv_perf_cmd(const char **argv); /* NULL terminated */
+extern int execl_perf_cmd(const char *cmd, ...);
+extern const char *system_path(const char *path);
+
+#endif /* PERF_EXEC_CMD_H */
diff --git a/tools/perf/util/generate-cmdlist.sh b/tools/perf/util/generate-cmdlist.sh

new file mode 100755 (executable)

index 0000000..f06f6fd
--- /dev/null
+++ b/tools/perf/util/generate-cmdlist.sh
@@ -0,0 +1,24 @@
+#!/bin/sh
+
+echo "/* Automatically generated by $0 */
+struct cmdname_help
+{
+    char name[16];
+    char help[80];
+};
+
+static struct cmdname_help common_cmds[] = {"
+
+sed -n -e 's/^perf-\([^        ]*\)[   ].* common.*/\1/p' command-list.txt |
+sort |
+while read cmd
+do
+     sed -n '
+     /^NAME/,/perf-'"$cmd"'/H
+     ${
+            x
+            s/.*perf-'"$cmd"' - \(.*\)/  {"'"$cmd"'", "\1"},/
+           p
+     }' "Documentation/perf-$cmd.txt"
+done
+echo "};"
diff --git a/tools/perf/util/help.c b/tools/perf/util/help.c

new file mode 100644 (file)

index 0000000..6653f7d
--- /dev/null
+++ b/tools/perf/util/help.c
@@ -0,0 +1,367 @@
+#include "cache.h"
+#include "../builtin.h"
+#include "exec_cmd.h"
+#include "levenshtein.h"
+#include "help.h"
+
+/* most GUI terminals set COLUMNS (although some don't export it) */
+static int term_columns(void)
+{
+       char *col_string = getenv("COLUMNS");
+       int n_cols;
+
+       if (col_string && (n_cols = atoi(col_string)) > 0)
+               return n_cols;
+
+#ifdef TIOCGWINSZ
+       {
+               struct winsize ws;
+               if (!ioctl(1, TIOCGWINSZ, &ws)) {
+                       if (ws.ws_col)
+                               return ws.ws_col;
+               }
+       }
+#endif
+
+       return 80;
+}
+
+void add_cmdname(struct cmdnames *cmds, const char *name, int len)
+{
+       struct cmdname *ent = malloc(sizeof(*ent) + len + 1);
+
+       ent->len = len;
+       memcpy(ent->name, name, len);
+       ent->name[len] = 0;
+
+       ALLOC_GROW(cmds->names, cmds->cnt + 1, cmds->alloc);
+       cmds->names[cmds->cnt++] = ent;
+}
+
+static void clean_cmdnames(struct cmdnames *cmds)
+{
+       int i;
+       for (i = 0; i < cmds->cnt; ++i)
+               free(cmds->names[i]);
+       free(cmds->names);
+       cmds->cnt = 0;
+       cmds->alloc = 0;
+}
+
+static int cmdname_compare(const void *a_, const void *b_)
+{
+       struct cmdname *a = *(struct cmdname **)a_;
+       struct cmdname *b = *(struct cmdname **)b_;
+       return strcmp(a->name, b->name);
+}
+
+static void uniq(struct cmdnames *cmds)
+{
+       int i, j;
+
+       if (!cmds->cnt)
+               return;
+
+       for (i = j = 1; i < cmds->cnt; i++)
+               if (strcmp(cmds->names[i]->name, cmds->names[i-1]->name))
+                       cmds->names[j++] = cmds->names[i];
+
+       cmds->cnt = j;
+}
+
+void exclude_cmds(struct cmdnames *cmds, struct cmdnames *excludes)
+{
+       int ci, cj, ei;
+       int cmp;
+
+       ci = cj = ei = 0;
+       while (ci < cmds->cnt && ei < excludes->cnt) {
+               cmp = strcmp(cmds->names[ci]->name, excludes->names[ei]->name);
+               if (cmp < 0)
+                       cmds->names[cj++] = cmds->names[ci++];
+               else if (cmp == 0)
+                       ci++, ei++;
+               else if (cmp > 0)
+                       ei++;
+       }
+
+       while (ci < cmds->cnt)
+               cmds->names[cj++] = cmds->names[ci++];
+
+       cmds->cnt = cj;
+}
+
+static void pretty_print_string_list(struct cmdnames *cmds, int longest)
+{
+       int cols = 1, rows;
+       int space = longest + 1; /* min 1 SP between words */
+       int max_cols = term_columns() - 1; /* don't print *on* the edge */
+       int i, j;
+
+       if (space < max_cols)
+               cols = max_cols / space;
+       rows = (cmds->cnt + cols - 1) / cols;
+
+       for (i = 0; i < rows; i++) {
+               printf("  ");
+
+               for (j = 0; j < cols; j++) {
+                       int n = j * rows + i;
+                       int size = space;
+                       if (n >= cmds->cnt)
+                               break;
+                       if (j == cols-1 || n + rows >= cmds->cnt)
+                               size = 1;
+                       printf("%-*s", size, cmds->names[n]->name);
+               }
+               putchar('\n');
+       }
+}
+
+static int is_executable(const char *name)
+{
+       struct stat st;
+
+       if (stat(name, &st) || /* stat, not lstat */
+           !S_ISREG(st.st_mode))
+               return 0;
+
+#ifdef __MINGW32__
+       /* cannot trust the executable bit, peek into the file instead */
+       char buf[3] = { 0 };
+       int n;
+       int fd = open(name, O_RDONLY);
+       st.st_mode &= ~S_IXUSR;
+       if (fd >= 0) {
+               n = read(fd, buf, 2);
+               if (n == 2)
+                       /* DOS executables start with "MZ" */
+                       if (!strcmp(buf, "#!") || !strcmp(buf, "MZ"))
+                               st.st_mode |= S_IXUSR;
+               close(fd);
+       }
+#endif
+       return st.st_mode & S_IXUSR;
+}
+
+static void list_commands_in_dir(struct cmdnames *cmds,
+                                        const char *path,
+                                        const char *prefix)
+{
+       int prefix_len;
+       DIR *dir = opendir(path);
+       struct dirent *de;
+       struct strbuf buf = STRBUF_INIT;
+       int len;
+
+       if (!dir)
+               return;
+       if (!prefix)
+               prefix = "perf-";
+       prefix_len = strlen(prefix);
+
+       strbuf_addf(&buf, "%s/", path);
+       len = buf.len;
+
+       while ((de = readdir(dir)) != NULL) {
+               int entlen;
+
+               if (prefixcmp(de->d_name, prefix))
+                       continue;
+
+               strbuf_setlen(&buf, len);
+               strbuf_addstr(&buf, de->d_name);
+               if (!is_executable(buf.buf))
+                       continue;
+
+               entlen = strlen(de->d_name) - prefix_len;
+               if (has_extension(de->d_name, ".exe"))
+                       entlen -= 4;
+
+               add_cmdname(cmds, de->d_name + prefix_len, entlen);
+       }
+       closedir(dir);
+       strbuf_release(&buf);
+}
+
+void load_command_list(const char *prefix,
+               struct cmdnames *main_cmds,
+               struct cmdnames *other_cmds)
+{
+       const char *env_path = getenv("PATH");
+       const char *exec_path = perf_exec_path();
+
+       if (exec_path) {
+               list_commands_in_dir(main_cmds, exec_path, prefix);
+               qsort(main_cmds->names, main_cmds->cnt,
+                     sizeof(*main_cmds->names), cmdname_compare);
+               uniq(main_cmds);
+       }
+
+       if (env_path) {
+               char *paths, *path, *colon;
+               path = paths = strdup(env_path);
+               while (1) {
+                       if ((colon = strchr(path, PATH_SEP)))
+                               *colon = 0;
+                       if (!exec_path || strcmp(path, exec_path))
+                               list_commands_in_dir(other_cmds, path, prefix);
+
+                       if (!colon)
+                               break;
+                       path = colon + 1;
+               }
+               free(paths);
+
+               qsort(other_cmds->names, other_cmds->cnt,
+                     sizeof(*other_cmds->names), cmdname_compare);
+               uniq(other_cmds);
+       }
+       exclude_cmds(other_cmds, main_cmds);
+}
+
+void list_commands(const char *title, struct cmdnames *main_cmds,
+                  struct cmdnames *other_cmds)
+{
+       int i, longest = 0;
+
+       for (i = 0; i < main_cmds->cnt; i++)
+               if (longest < main_cmds->names[i]->len)
+                       longest = main_cmds->names[i]->len;
+       for (i = 0; i < other_cmds->cnt; i++)
+               if (longest < other_cmds->names[i]->len)
+                       longest = other_cmds->names[i]->len;
+
+       if (main_cmds->cnt) {
+               const char *exec_path = perf_exec_path();
+               printf("available %s in '%s'\n", title, exec_path);
+               printf("----------------");
+               mput_char('-', strlen(title) + strlen(exec_path));
+               putchar('\n');
+               pretty_print_string_list(main_cmds, longest);
+               putchar('\n');
+       }
+
+       if (other_cmds->cnt) {
+               printf("%s available from elsewhere on your $PATH\n", title);
+               printf("---------------------------------------");
+               mput_char('-', strlen(title));
+               putchar('\n');
+               pretty_print_string_list(other_cmds, longest);
+               putchar('\n');
+       }
+}
+
+int is_in_cmdlist(struct cmdnames *c, const char *s)
+{
+       int i;
+       for (i = 0; i < c->cnt; i++)
+               if (!strcmp(s, c->names[i]->name))
+                       return 1;
+       return 0;
+}
+
+static int autocorrect;
+static struct cmdnames aliases;
+
+static int perf_unknown_cmd_config(const char *var, const char *value, void *cb)
+{
+       if (!strcmp(var, "help.autocorrect"))
+               autocorrect = perf_config_int(var,value);
+       /* Also use aliases for command lookup */
+       if (!prefixcmp(var, "alias."))
+               add_cmdname(&aliases, var + 6, strlen(var + 6));
+
+       return perf_default_config(var, value, cb);
+}
+
+static int levenshtein_compare(const void *p1, const void *p2)
+{
+       const struct cmdname *const *c1 = p1, *const *c2 = p2;
+       const char *s1 = (*c1)->name, *s2 = (*c2)->name;
+       int l1 = (*c1)->len;
+       int l2 = (*c2)->len;
+       return l1 != l2 ? l1 - l2 : strcmp(s1, s2);
+}
+
+static void add_cmd_list(struct cmdnames *cmds, struct cmdnames *old)
+{
+       int i;
+       ALLOC_GROW(cmds->names, cmds->cnt + old->cnt, cmds->alloc);
+
+       for (i = 0; i < old->cnt; i++)
+               cmds->names[cmds->cnt++] = old->names[i];
+       free(old->names);
+       old->cnt = 0;
+       old->names = NULL;
+}
+
+const char *help_unknown_cmd(const char *cmd)
+{
+       int i, n = 0, best_similarity = 0;
+       struct cmdnames main_cmds, other_cmds;
+
+       memset(&main_cmds, 0, sizeof(main_cmds));
+       memset(&other_cmds, 0, sizeof(main_cmds));
+       memset(&aliases, 0, sizeof(aliases));
+
+       perf_config(perf_unknown_cmd_config, NULL);
+
+       load_command_list("perf-", &main_cmds, &other_cmds);
+
+       add_cmd_list(&main_cmds, &aliases);
+       add_cmd_list(&main_cmds, &other_cmds);
+       qsort(main_cmds.names, main_cmds.cnt,
+             sizeof(main_cmds.names), cmdname_compare);
+       uniq(&main_cmds);
+
+       if (main_cmds.cnt) {
+               /* This reuses cmdname->len for similarity index */
+               for (i = 0; i < main_cmds.cnt; ++i)
+                       main_cmds.names[i]->len =
+                               levenshtein(cmd, main_cmds.names[i]->name, 0, 2, 1, 4);
+
+               qsort(main_cmds.names, main_cmds.cnt,
+                     sizeof(*main_cmds.names), levenshtein_compare);
+
+               best_similarity = main_cmds.names[0]->len;
+               n = 1;
+               while (n < main_cmds.cnt && best_similarity == main_cmds.names[n]->len)
+                       ++n;
+       }
+
+       if (autocorrect && n == 1) {
+               const char *assumed = main_cmds.names[0]->name;
+
+               main_cmds.names[0] = NULL;
+               clean_cmdnames(&main_cmds);
+               fprintf(stderr, "WARNING: You called a Git program named '%s', "
+                       "which does not exist.\n"
+                       "Continuing under the assumption that you meant '%s'\n",
+                       cmd, assumed);
+               if (autocorrect > 0) {
+                       fprintf(stderr, "in %0.1f seconds automatically...\n",
+                               (float)autocorrect/10.0);
+                       poll(NULL, 0, autocorrect * 100);
+               }
+               return assumed;
+       }
+
+       fprintf(stderr, "perf: '%s' is not a perf-command. See 'perf --help'.\n", cmd);
+
+       if (main_cmds.cnt && best_similarity < 6) {
+               fprintf(stderr, "\nDid you mean %s?\n",
+                       n < 2 ? "this": "one of these");
+
+               for (i = 0; i < n; i++)
+                       fprintf(stderr, "\t%s\n", main_cmds.names[i]->name);
+       }
+
+       exit(1);
+}
+
+int cmd_version(int argc, const char **argv, const char *prefix)
+{
+       printf("perf version %s\n", perf_version_string);
+       return 0;
+}
diff --git a/tools/perf/util/help.h b/tools/perf/util/help.h

new file mode 100644 (file)

index 0000000..56bc154
--- /dev/null
+++ b/tools/perf/util/help.h
@@ -0,0 +1,29 @@
+#ifndef HELP_H
+#define HELP_H
+
+struct cmdnames {
+       int alloc;
+       int cnt;
+       struct cmdname {
+               size_t len; /* also used for similarity index in help.c */
+               char name[FLEX_ARRAY];
+       } **names;
+};
+
+static inline void mput_char(char c, unsigned int num)
+{
+       while(num--)
+               putchar(c);
+}
+
+void load_command_list(const char *prefix,
+               struct cmdnames *main_cmds,
+               struct cmdnames *other_cmds);
+void add_cmdname(struct cmdnames *cmds, const char *name, int len);
+/* Here we require that excludes is a sorted list. */
+void exclude_cmds(struct cmdnames *cmds, struct cmdnames *excludes);
+int is_in_cmdlist(struct cmdnames *c, const char *s);
+void list_commands(const char *title, struct cmdnames *main_cmds,
+                  struct cmdnames *other_cmds);
+
+#endif /* HELP_H */
diff --git a/tools/perf/util/levenshtein.c b/tools/perf/util/levenshtein.c

new file mode 100644 (file)

index 0000000..e521d15
--- /dev/null
+++ b/tools/perf/util/levenshtein.c
@@ -0,0 +1,84 @@
+#include "cache.h"
+#include "levenshtein.h"
+
+/*
+ * This function implements the Damerau-Levenshtein algorithm to
+ * calculate a distance between strings.
+ *
+ * Basically, it says how many letters need to be swapped, substituted,
+ * deleted from, or added to string1, at least, to get string2.
+ *
+ * The idea is to build a distance matrix for the substrings of both
+ * strings.  To avoid a large space complexity, only the last three rows
+ * are kept in memory (if swaps had the same or higher cost as one deletion
+ * plus one insertion, only two rows would be needed).
+ *
+ * At any stage, "i + 1" denotes the length of the current substring of
+ * string1 that the distance is calculated for.
+ *
+ * row2 holds the current row, row1 the previous row (i.e. for the substring
+ * of string1 of length "i"), and row0 the row before that.
+ *
+ * In other words, at the start of the big loop, row2[j + 1] contains the
+ * Damerau-Levenshtein distance between the substring of string1 of length
+ * "i" and the substring of string2 of length "j + 1".
+ *
+ * All the big loop does is determine the partial minimum-cost paths.
+ *
+ * It does so by calculating the costs of the path ending in characters
+ * i (in string1) and j (in string2), respectively, given that the last
+ * operation is a substition, a swap, a deletion, or an insertion.
+ *
+ * This implementation allows the costs to be weighted:
+ *
+ * - w (as in "sWap")
+ * - s (as in "Substitution")
+ * - a (for insertion, AKA "Add")
+ * - d (as in "Deletion")
+ *
+ * Note that this algorithm calculates a distance _iff_ d == a.
+ */
+int levenshtein(const char *string1, const char *string2,
+               int w, int s, int a, int d)
+{
+       int len1 = strlen(string1), len2 = strlen(string2);
+       int *row0 = malloc(sizeof(int) * (len2 + 1));
+       int *row1 = malloc(sizeof(int) * (len2 + 1));
+       int *row2 = malloc(sizeof(int) * (len2 + 1));
+       int i, j;
+
+       for (j = 0; j <= len2; j++)
+               row1[j] = j * a;
+       for (i = 0; i < len1; i++) {
+               int *dummy;
+
+               row2[0] = (i + 1) * d;
+               for (j = 0; j < len2; j++) {
+                       /* substitution */
+                       row2[j + 1] = row1[j] + s * (string1[i] != string2[j]);
+                       /* swap */
+                       if (i > 0 && j > 0 && string1[i - 1] == string2[j] &&
+                                       string1[i] == string2[j - 1] &&
+                                       row2[j + 1] > row0[j - 1] + w)
+                               row2[j + 1] = row0[j - 1] + w;
+                       /* deletion */
+                       if (row2[j + 1] > row1[j + 1] + d)
+                               row2[j + 1] = row1[j + 1] + d;
+                       /* insertion */
+                       if (row2[j + 1] > row2[j] + a)
+                               row2[j + 1] = row2[j] + a;
+               }
+
+               dummy = row0;
+               row0 = row1;
+               row1 = row2;
+               row2 = dummy;
+       }
+
+       i = row1[len2];
+       free(row0);
+       free(row1);
+       free(row2);
+
+       return i;
+}
diff --git a/tools/perf/util/levenshtein.h b/tools/perf/util/levenshtein.h

new file mode 100644 (file)

index 0000000..0173abe
--- /dev/null
+++ b/tools/perf/util/levenshtein.h
@@ -0,0 +1,8 @@
+#ifndef LEVENSHTEIN_H
+#define LEVENSHTEIN_H
+
+int levenshtein(const char *string1, const char *string2,
+       int swap_penalty, int substition_penalty,
+       int insertion_penalty, int deletion_penalty);
+
+#endif
diff --git a/tools/perf/util/list.h b/tools/perf/util/list.h

new file mode 100644 (file)

index 0000000..e2548e8
--- /dev/null
+++ b/tools/perf/util/list.h
@@ -0,0 +1,603 @@
+#ifndef _LINUX_LIST_H
+#define _LINUX_LIST_H
+/*
+  Copyright (C) Cast of dozens, comes from the Linux kernel
+
+  This program is free software; you can redistribute it and/or modify it
+  under the terms of version 2 of the GNU General Public License as
+  published by the Free Software Foundation.
+*/
+
+#include <stddef.h>
+
+/*
+ * These are non-NULL pointers that will result in page faults
+ * under normal circumstances, used to verify that nobody uses
+ * non-initialized list entries.
+ */
+#define LIST_POISON1 ((void *)0x00100100)
+#define LIST_POISON2 ((void *)0x00200200)
+
+/**
+ * container_of - cast a member of a structure out to the containing structure
+ * @ptr:       the pointer to the member.
+ * @type:      the type of the container struct this is embedded in.
+ * @member:    the name of the member within the struct.
+ *
+ */
+#define container_of(ptr, type, member) ({                     \
+        const typeof( ((type *)0)->member ) *__mptr = (ptr);   \
+        (type *)( (char *)__mptr - offsetof(type,member) );})
+
+/*
+ * Simple doubly linked list implementation.
+ *
+ * Some of the internal functions ("__xxx") are useful when
+ * manipulating whole lists rather than single entries, as
+ * sometimes we already know the next/prev entries and we can
+ * generate better code by using them directly rather than
+ * using the generic single-entry routines.
+ */
+
+struct list_head {
+       struct list_head *next, *prev;
+};
+
+#define LIST_HEAD_INIT(name) { &(name), &(name) }
+
+#define LIST_HEAD(name) \
+       struct list_head name = LIST_HEAD_INIT(name)
+
+static inline void INIT_LIST_HEAD(struct list_head *list)
+{
+       list->next = list;
+       list->prev = list;
+}
+
+/*
+ * Insert a new entry between two known consecutive entries.
+ *
+ * This is only for internal list manipulation where we know
+ * the prev/next entries already!
+ */
+static inline void __list_add(struct list_head *new,
+                             struct list_head *prev,
+                             struct list_head *next)
+{
+       next->prev = new;
+       new->next = next;
+       new->prev = prev;
+       prev->next = new;
+}
+
+/**
+ * list_add - add a new entry
+ * @new: new entry to be added
+ * @head: list head to add it after
+ *
+ * Insert a new entry after the specified head.
+ * This is good for implementing stacks.
+ */
+static inline void list_add(struct list_head *new, struct list_head *head)
+{
+       __list_add(new, head, head->next);
+}
+
+/**
+ * list_add_tail - add a new entry
+ * @new: new entry to be added
+ * @head: list head to add it before
+ *
+ * Insert a new entry before the specified head.
+ * This is useful for implementing queues.
+ */
+static inline void list_add_tail(struct list_head *new, struct list_head *head)
+{
+       __list_add(new, head->prev, head);
+}
+
+/*
+ * Delete a list entry by making the prev/next entries
+ * point to each other.
+ *
+ * This is only for internal list manipulation where we know
+ * the prev/next entries already!
+ */
+static inline void __list_del(struct list_head * prev, struct list_head * next)
+{
+       next->prev = prev;
+       prev->next = next;
+}
+
+/**
+ * list_del - deletes entry from list.
+ * @entry: the element to delete from the list.
+ * Note: list_empty on entry does not return true after this, the entry is
+ * in an undefined state.
+ */
+static inline void list_del(struct list_head *entry)
+{
+       __list_del(entry->prev, entry->next);
+       entry->next = LIST_POISON1;
+       entry->prev = LIST_POISON2;
+}
+
+/**
+ * list_del_range - deletes range of entries from list.
+ * @beging: first element in the range to delete from the list.
+ * @beging: first element in the range to delete from the list.
+ * Note: list_empty on the range of entries does not return true after this,
+ * the entries is in an undefined state.
+ */
+static inline void list_del_range(struct list_head *begin,
+                                 struct list_head *end)
+{
+       begin->prev->next = end->next;
+       end->next->prev = begin->prev;
+}
+
+/**
+ * list_replace - replace old entry by new one
+ * @old : the element to be replaced
+ * @new : the new element to insert
+ * Note: if 'old' was empty, it will be overwritten.
+ */
+static inline void list_replace(struct list_head *old,
+                               struct list_head *new)
+{
+       new->next = old->next;
+       new->next->prev = new;
+       new->prev = old->prev;
+       new->prev->next = new;
+}
+
+static inline void list_replace_init(struct list_head *old,
+                                       struct list_head *new)
+{
+       list_replace(old, new);
+       INIT_LIST_HEAD(old);
+}
+
+/**
+ * list_del_init - deletes entry from list and reinitialize it.
+ * @entry: the element to delete from the list.
+ */
+static inline void list_del_init(struct list_head *entry)
+{
+       __list_del(entry->prev, entry->next);
+       INIT_LIST_HEAD(entry);
+}
+
+/**
+ * list_move - delete from one list and add as another's head
+ * @list: the entry to move
+ * @head: the head that will precede our entry
+ */
+static inline void list_move(struct list_head *list, struct list_head *head)
+{
+        __list_del(list->prev, list->next);
+        list_add(list, head);
+}
+
+/**
+ * list_move_tail - delete from one list and add as another's tail
+ * @list: the entry to move
+ * @head: the head that will follow our entry
+ */
+static inline void list_move_tail(struct list_head *list,
+                                 struct list_head *head)
+{
+        __list_del(list->prev, list->next);
+        list_add_tail(list, head);
+}
+
+/**
+ * list_is_last - tests whether @list is the last entry in list @head
+ * @list: the entry to test
+ * @head: the head of the list
+ */
+static inline int list_is_last(const struct list_head *list,
+                               const struct list_head *head)
+{
+       return list->next == head;
+}
+
+/**
+ * list_empty - tests whether a list is empty
+ * @head: the list to test.
+ */
+static inline int list_empty(const struct list_head *head)
+{
+       return head->next == head;
+}
+
+/**
+ * list_empty_careful - tests whether a list is empty and not being modified
+ * @head: the list to test
+ *
+ * Description:
+ * tests whether a list is empty _and_ checks that no other CPU might be
+ * in the process of modifying either member (next or prev)
+ *
+ * NOTE: using list_empty_careful() without synchronization
+ * can only be safe if the only activity that can happen
+ * to the list entry is list_del_init(). Eg. it cannot be used
+ * if another CPU could re-list_add() it.
+ */
+static inline int list_empty_careful(const struct list_head *head)
+{
+       struct list_head *next = head->next;
+       return (next == head) && (next == head->prev);
+}
+
+static inline void __list_splice(struct list_head *list,
+                                struct list_head *head)
+{
+       struct list_head *first = list->next;
+       struct list_head *last = list->prev;
+       struct list_head *at = head->next;
+
+       first->prev = head;
+       head->next = first;
+
+       last->next = at;
+       at->prev = last;
+}
+
+/**
+ * list_splice - join two lists
+ * @list: the new list to add.
+ * @head: the place to add it in the first list.
+ */
+static inline void list_splice(struct list_head *list, struct list_head *head)
+{
+       if (!list_empty(list))
+               __list_splice(list, head);
+}
+
+/**
+ * list_splice_init - join two lists and reinitialise the emptied list.
+ * @list: the new list to add.
+ * @head: the place to add it in the first list.
+ *
+ * The list at @list is reinitialised
+ */
+static inline void list_splice_init(struct list_head *list,
+                                   struct list_head *head)
+{
+       if (!list_empty(list)) {
+               __list_splice(list, head);
+               INIT_LIST_HEAD(list);
+       }
+}
+
+/**
+ * list_entry - get the struct for this entry
+ * @ptr:       the &struct list_head pointer.
+ * @type:      the type of the struct this is embedded in.
+ * @member:    the name of the list_struct within the struct.
+ */
+#define list_entry(ptr, type, member) \
+       container_of(ptr, type, member)
+
+/**
+ * list_first_entry - get the first element from a list
+ * @ptr:       the list head to take the element from.
+ * @type:      the type of the struct this is embedded in.
+ * @member:    the name of the list_struct within the struct.
+ *
+ * Note, that list is expected to be not empty.
+ */
+#define list_first_entry(ptr, type, member) \
+       list_entry((ptr)->next, type, member)
+
+/**
+ * list_for_each       -       iterate over a list
+ * @pos:       the &struct list_head to use as a loop cursor.
+ * @head:      the head for your list.
+ */
+#define list_for_each(pos, head) \
+       for (pos = (head)->next; pos != (head); \
+               pos = pos->next)
+
+/**
+ * __list_for_each     -       iterate over a list
+ * @pos:       the &struct list_head to use as a loop cursor.
+ * @head:      the head for your list.
+ *
+ * This variant differs from list_for_each() in that it's the
+ * simplest possible list iteration code, no prefetching is done.
+ * Use this for code that knows the list to be very short (empty
+ * or 1 entry) most of the time.
+ */
+#define __list_for_each(pos, head) \
+       for (pos = (head)->next; pos != (head); pos = pos->next)
+
+/**
+ * list_for_each_prev  -       iterate over a list backwards
+ * @pos:       the &struct list_head to use as a loop cursor.
+ * @head:      the head for your list.
+ */
+#define list_for_each_prev(pos, head) \
+       for (pos = (head)->prev; pos != (head); \
+               pos = pos->prev)
+
+/**
+ * list_for_each_safe - iterate over a list safe against removal of list entry
+ * @pos:       the &struct list_head to use as a loop cursor.
+ * @n:         another &struct list_head to use as temporary storage
+ * @head:      the head for your list.
+ */
+#define list_for_each_safe(pos, n, head) \
+       for (pos = (head)->next, n = pos->next; pos != (head); \
+               pos = n, n = pos->next)
+
+/**
+ * list_for_each_entry -       iterate over list of given type
+ * @pos:       the type * to use as a loop cursor.
+ * @head:      the head for your list.
+ * @member:    the name of the list_struct within the struct.
+ */
+#define list_for_each_entry(pos, head, member)                         \
+       for (pos = list_entry((head)->next, typeof(*pos), member);      \
+            &pos->member != (head);    \
+            pos = list_entry(pos->member.next, typeof(*pos), member))
+
+/**
+ * list_for_each_entry_reverse - iterate backwards over list of given type.
+ * @pos:       the type * to use as a loop cursor.
+ * @head:      the head for your list.
+ * @member:    the name of the list_struct within the struct.
+ */
+#define list_for_each_entry_reverse(pos, head, member)                 \
+       for (pos = list_entry((head)->prev, typeof(*pos), member);      \
+            &pos->member != (head);    \
+            pos = list_entry(pos->member.prev, typeof(*pos), member))
+
+/**
+ * list_prepare_entry - prepare a pos entry for use in list_for_each_entry_continue
+ * @pos:       the type * to use as a start point
+ * @head:      the head of the list
+ * @member:    the name of the list_struct within the struct.
+ *
+ * Prepares a pos entry for use as a start point in list_for_each_entry_continue.
+ */
+#define list_prepare_entry(pos, head, member) \
+       ((pos) ? : list_entry(head, typeof(*pos), member))
+
+/**
+ * list_for_each_entry_continue - continue iteration over list of given type
+ * @pos:       the type * to use as a loop cursor.
+ * @head:      the head for your list.
+ * @member:    the name of the list_struct within the struct.
+ *
+ * Continue to iterate over list of given type, continuing after
+ * the current position.
+ */
+#define list_for_each_entry_continue(pos, head, member)                \
+       for (pos = list_entry(pos->member.next, typeof(*pos), member);  \
+            &pos->member != (head);    \
+            pos = list_entry(pos->member.next, typeof(*pos), member))
+
+/**
+ * list_for_each_entry_from - iterate over list of given type from the current point
+ * @pos:       the type * to use as a loop cursor.
+ * @head:      the head for your list.
+ * @member:    the name of the list_struct within the struct.
+ *
+ * Iterate over list of given type, continuing from current position.
+ */
+#define list_for_each_entry_from(pos, head, member)                    \
+       for (; &pos->member != (head);  \
+            pos = list_entry(pos->member.next, typeof(*pos), member))
+
+/**
+ * list_for_each_entry_safe - iterate over list of given type safe against removal of list entry
+ * @pos:       the type * to use as a loop cursor.
+ * @n:         another type * to use as temporary storage
+ * @head:      the head for your list.
+ * @member:    the name of the list_struct within the struct.
+ */
+#define list_for_each_entry_safe(pos, n, head, member)                 \
+       for (pos = list_entry((head)->next, typeof(*pos), member),      \
+               n = list_entry(pos->member.next, typeof(*pos), member); \
+            &pos->member != (head);                                    \
+            pos = n, n = list_entry(n->member.next, typeof(*n), member))
+
+/**
+ * list_for_each_entry_safe_continue
+ * @pos:       the type * to use as a loop cursor.
+ * @n:         another type * to use as temporary storage
+ * @head:      the head for your list.
+ * @member:    the name of the list_struct within the struct.
+ *
+ * Iterate over list of given type, continuing after current point,
+ * safe against removal of list entry.
+ */
+#define list_for_each_entry_safe_continue(pos, n, head, member)                \
+       for (pos = list_entry(pos->member.next, typeof(*pos), member),          \
+               n = list_entry(pos->member.next, typeof(*pos), member);         \
+            &pos->member != (head);                                            \
+            pos = n, n = list_entry(n->member.next, typeof(*n), member))
+
+/**
+ * list_for_each_entry_safe_from
+ * @pos:       the type * to use as a loop cursor.
+ * @n:         another type * to use as temporary storage
+ * @head:      the head for your list.
+ * @member:    the name of the list_struct within the struct.
+ *
+ * Iterate over list of given type from current point, safe against
+ * removal of list entry.
+ */
+#define list_for_each_entry_safe_from(pos, n, head, member)                    \
+       for (n = list_entry(pos->member.next, typeof(*pos), member);            \
+            &pos->member != (head);                                            \
+            pos = n, n = list_entry(n->member.next, typeof(*n), member))
+
+/**
+ * list_for_each_entry_safe_reverse
+ * @pos:       the type * to use as a loop cursor.
+ * @n:         another type * to use as temporary storage
+ * @head:      the head for your list.
+ * @member:    the name of the list_struct within the struct.
+ *
+ * Iterate backwards over list of given type, safe against removal
+ * of list entry.
+ */
+#define list_for_each_entry_safe_reverse(pos, n, head, member)         \
+       for (pos = list_entry((head)->prev, typeof(*pos), member),      \
+               n = list_entry(pos->member.prev, typeof(*pos), member); \
+            &pos->member != (head);                                    \
+            pos = n, n = list_entry(n->member.prev, typeof(*n), member))
+
+/*
+ * Double linked lists with a single pointer list head.
+ * Mostly useful for hash tables where the two pointer list head is
+ * too wasteful.
+ * You lose the ability to access the tail in O(1).
+ */
+
+struct hlist_head {
+       struct hlist_node *first;
+};
+
+struct hlist_node {
+       struct hlist_node *next, **pprev;
+};
+
+#define HLIST_HEAD_INIT { .first = NULL }
+#define HLIST_HEAD(name) struct hlist_head name = {  .first = NULL }
+#define INIT_HLIST_HEAD(ptr) ((ptr)->first = NULL)
+static inline void INIT_HLIST_NODE(struct hlist_node *h)
+{
+       h->next = NULL;
+       h->pprev = NULL;
+}
+
+static inline int hlist_unhashed(const struct hlist_node *h)
+{
+       return !h->pprev;
+}
+
+static inline int hlist_empty(const struct hlist_head *h)
+{
+       return !h->first;
+}
+
+static inline void __hlist_del(struct hlist_node *n)
+{
+       struct hlist_node *next = n->next;
+       struct hlist_node **pprev = n->pprev;
+       *pprev = next;
+       if (next)
+               next->pprev = pprev;
+}
+
+static inline void hlist_del(struct hlist_node *n)
+{
+       __hlist_del(n);
+       n->next = LIST_POISON1;
+       n->pprev = LIST_POISON2;
+}
+
+static inline void hlist_del_init(struct hlist_node *n)
+{
+       if (!hlist_unhashed(n)) {
+               __hlist_del(n);
+               INIT_HLIST_NODE(n);
+       }
+}
+
+static inline void hlist_add_head(struct hlist_node *n, struct hlist_head *h)
+{
+       struct hlist_node *first = h->first;
+       n->next = first;
+       if (first)
+               first->pprev = &n->next;
+       h->first = n;
+       n->pprev = &h->first;
+}
+
+/* next must be != NULL */
+static inline void hlist_add_before(struct hlist_node *n,
+                                       struct hlist_node *next)
+{
+       n->pprev = next->pprev;
+       n->next = next;
+       next->pprev = &n->next;
+       *(n->pprev) = n;
+}
+
+static inline void hlist_add_after(struct hlist_node *n,
+                                       struct hlist_node *next)
+{
+       next->next = n->next;
+       n->next = next;
+       next->pprev = &n->next;
+
+       if(next->next)
+               next->next->pprev  = &next->next;
+}
+
+#define hlist_entry(ptr, type, member) container_of(ptr,type,member)
+
+#define hlist_for_each(pos, head) \
+       for (pos = (head)->first; pos; \
+            pos = pos->next)
+
+#define hlist_for_each_safe(pos, n, head) \
+       for (pos = (head)->first; pos && ({ n = pos->next; 1; }); \
+            pos = n)
+
+/**
+ * hlist_for_each_entry        - iterate over list of given type
+ * @tpos:      the type * to use as a loop cursor.
+ * @pos:       the &struct hlist_node to use as a loop cursor.
+ * @head:      the head for your list.
+ * @member:    the name of the hlist_node within the struct.
+ */
+#define hlist_for_each_entry(tpos, pos, head, member)                   \
+       for (pos = (head)->first;                                        \
+            pos &&                      \
+               ({ tpos = hlist_entry(pos, typeof(*tpos), member); 1;}); \
+            pos = pos->next)
+
+/**
+ * hlist_for_each_entry_continue - iterate over a hlist continuing after current point
+ * @tpos:      the type * to use as a loop cursor.
+ * @pos:       the &struct hlist_node to use as a loop cursor.
+ * @member:    the name of the hlist_node within the struct.
+ */
+#define hlist_for_each_entry_continue(tpos, pos, member)                \
+       for (pos = (pos)->next;                                          \
+            pos &&                      \
+               ({ tpos = hlist_entry(pos, typeof(*tpos), member); 1;}); \
+            pos = pos->next)
+
+/**
+ * hlist_for_each_entry_from - iterate over a hlist continuing from current point
+ * @tpos:      the type * to use as a loop cursor.
+ * @pos:       the &struct hlist_node to use as a loop cursor.
+ * @member:    the name of the hlist_node within the struct.
+ */
+#define hlist_for_each_entry_from(tpos, pos, member)                    \
+       for (; pos &&                    \
+               ({ tpos = hlist_entry(pos, typeof(*tpos), member); 1;}); \
+            pos = pos->next)
+
+/**
+ * hlist_for_each_entry_safe - iterate over list of given type safe against removal of list entry
+ * @tpos:      the type * to use as a loop cursor.
+ * @pos:       the &struct hlist_node to use as a loop cursor.
+ * @n:         another &struct hlist_node to use as temporary storage
+ * @head:      the head for your list.
+ * @member:    the name of the hlist_node within the struct.
+ */
+#define hlist_for_each_entry_safe(tpos, pos, n, head, member)           \
+       for (pos = (head)->first;                                        \
+            pos && ({ n = pos->next; 1; }) &&                           \
+               ({ tpos = hlist_entry(pos, typeof(*tpos), member); 1;}); \
+            pos = n)
+
+#endif
diff --git a/tools/perf/util/pager.c b/tools/perf/util/pager.c

new file mode 100644 (file)

index 0000000..a28bcca
--- /dev/null
+++ b/tools/perf/util/pager.c
@@ -0,0 +1,99 @@
+#include "cache.h"
+#include "run-command.h"
+#include "sigchain.h"
+
+/*
+ * This is split up from the rest of git so that we can do
+ * something different on Windows.
+ */
+
+static int spawned_pager;
+
+#ifndef __MINGW32__
+static void pager_preexec(void)
+{
+       /*
+        * Work around bug in "less" by not starting it until we
+        * have real input
+        */
+       fd_set in;
+
+       FD_ZERO(&in);
+       FD_SET(0, &in);
+       select(1, &in, NULL, &in, NULL);
+
+       setenv("LESS", "FRSX", 0);
+}
+#endif
+
+static const char *pager_argv[] = { "sh", "-c", NULL, NULL };
+static struct child_process pager_process;
+
+static void wait_for_pager(void)
+{
+       fflush(stdout);
+       fflush(stderr);
+       /* signal EOF to pager */
+       close(1);
+       close(2);
+       finish_command(&pager_process);
+}
+
+static void wait_for_pager_signal(int signo)
+{
+       wait_for_pager();
+       sigchain_pop(signo);
+       raise(signo);
+}
+
+void setup_pager(void)
+{
+       const char *pager = getenv("PERF_PAGER");
+
+       if (!isatty(1))
+               return;
+       if (!pager) {
+               if (!pager_program)
+                       perf_config(perf_default_config, NULL);
+               pager = pager_program;
+       }
+       if (!pager)
+               pager = getenv("PAGER");
+       if (!pager)
+               pager = "less";
+       else if (!*pager || !strcmp(pager, "cat"))
+               return;
+
+       spawned_pager = 1; /* means we are emitting to terminal */
+
+       /* spawn the pager */
+       pager_argv[2] = pager;
+       pager_process.argv = pager_argv;
+       pager_process.in = -1;
+#ifndef __MINGW32__
+       pager_process.preexec_cb = pager_preexec;
+#endif
+       if (start_command(&pager_process))
+               return;
+
+       /* original process continues, but writes to the pipe */
+       dup2(pager_process.in, 1);
+       if (isatty(2))
+               dup2(pager_process.in, 2);
+       close(pager_process.in);
+
+       /* this makes sure that the parent terminates after the pager */
+       sigchain_push_common(wait_for_pager_signal);
+       atexit(wait_for_pager);
+}
+
+int pager_in_use(void)
+{
+       const char *env;
+
+       if (spawned_pager)
+               return 1;
+
+       env = getenv("PERF_PAGER_IN_USE");
+       return env ? perf_config_bool("PERF_PAGER_IN_USE", env) : 0;
+}
diff --git a/tools/perf/util/parse-events.c b/tools/perf/util/parse-events.c

new file mode 100644 (file)

index 0000000..9d5f1ca
--- /dev/null
+++ b/tools/perf/util/parse-events.c
@@ -0,0 +1,316 @@
+
+#include "../perf.h"
+#include "util.h"
+#include "parse-options.h"
+#include "parse-events.h"
+#include "exec_cmd.h"
+#include "string.h"
+
+extern char *strcasestr(const char *haystack, const char *needle);
+
+int                                    nr_counters;
+
+struct perf_counter_attr               attrs[MAX_COUNTERS];
+
+struct event_symbol {
+       __u8    type;
+       __u64   config;
+       char    *symbol;
+};
+
+#define C(x, y) .type = PERF_TYPE_##x, .config = PERF_COUNT_##y
+#define CR(x, y) .type = PERF_TYPE_##x, .config = y
+
+static struct event_symbol event_symbols[] = {
+  { C(HARDWARE, HW_CPU_CYCLES),                "cpu-cycles",           },
+  { C(HARDWARE, HW_CPU_CYCLES),                "cycles",               },
+  { C(HARDWARE, HW_INSTRUCTIONS),      "instructions",         },
+  { C(HARDWARE, HW_CACHE_REFERENCES),  "cache-references",     },
+  { C(HARDWARE, HW_CACHE_MISSES),      "cache-misses",         },
+  { C(HARDWARE, HW_BRANCH_INSTRUCTIONS),"branch-instructions", },
+  { C(HARDWARE, HW_BRANCH_INSTRUCTIONS),"branches",            },
+  { C(HARDWARE, HW_BRANCH_MISSES),     "branch-misses",        },
+  { C(HARDWARE, HW_BUS_CYCLES),                "bus-cycles",           },
+
+  { C(SOFTWARE, SW_CPU_CLOCK),         "cpu-clock",            },
+  { C(SOFTWARE, SW_TASK_CLOCK),                "task-clock",           },
+  { C(SOFTWARE, SW_PAGE_FAULTS),       "page-faults",          },
+  { C(SOFTWARE, SW_PAGE_FAULTS),       "faults",               },
+  { C(SOFTWARE, SW_PAGE_FAULTS_MIN),   "minor-faults",         },
+  { C(SOFTWARE, SW_PAGE_FAULTS_MAJ),   "major-faults",         },
+  { C(SOFTWARE, SW_CONTEXT_SWITCHES),  "context-switches",     },
+  { C(SOFTWARE, SW_CONTEXT_SWITCHES),  "cs",                   },
+  { C(SOFTWARE, SW_CPU_MIGRATIONS),    "cpu-migrations",       },
+  { C(SOFTWARE, SW_CPU_MIGRATIONS),    "migrations",           },
+};
+
+#define __PERF_COUNTER_FIELD(config, name) \
+       ((config & PERF_COUNTER_##name##_MASK) >> PERF_COUNTER_##name##_SHIFT)
+
+#define PERF_COUNTER_RAW(config)       __PERF_COUNTER_FIELD(config, RAW)
+#define PERF_COUNTER_CONFIG(config)    __PERF_COUNTER_FIELD(config, CONFIG)
+#define PERF_COUNTER_TYPE(config)      __PERF_COUNTER_FIELD(config, TYPE)
+#define PERF_COUNTER_ID(config)                __PERF_COUNTER_FIELD(config, EVENT)
+
+static char *hw_event_names[] = {
+       "cycles",
+       "instructions",
+       "cache-references",
+       "cache-misses",
+       "branches",
+       "branch-misses",
+       "bus-cycles",
+};
+
+static char *sw_event_names[] = {
+       "cpu-clock-ticks",
+       "task-clock-ticks",
+       "page-faults",
+       "context-switches",
+       "CPU-migrations",
+       "minor-faults",
+       "major-faults",
+};
+
+#define MAX_ALIASES 8
+
+static char *hw_cache [][MAX_ALIASES] = {
+       { "L1-data"             , "l1-d", "l1d", "l1"                           },
+       { "L1-instruction"      , "l1-i", "l1i"                                 },
+       { "L2"                  , "l2"                                          },
+       { "Data-TLB"            , "dtlb", "d-tlb"                               },
+       { "Instruction-TLB"     , "itlb", "i-tlb"                               },
+       { "Branch"              , "bpu" , "btb", "bpc"                          },
+};
+
+static char *hw_cache_op [][MAX_ALIASES] = {
+       { "Load"                , "read"                                        },
+       { "Store"               , "write"                                       },
+       { "Prefetch"            , "speculative-read", "speculative-load"        },
+};
+
+static char *hw_cache_result [][MAX_ALIASES] = {
+       { "Reference"           , "ops", "access"                               },
+       { "Miss"                                                                },
+};
+
+char *event_name(int counter)
+{
+       __u64 config = attrs[counter].config;
+       int type = attrs[counter].type;
+       static char buf[32];
+
+       if (attrs[counter].type == PERF_TYPE_RAW) {
+               sprintf(buf, "raw 0x%llx", config);
+               return buf;
+       }
+
+       switch (type) {
+       case PERF_TYPE_HARDWARE:
+               if (config < PERF_COUNT_HW_MAX)
+                       return hw_event_names[config];
+               return "unknown-hardware";
+
+       case PERF_TYPE_HW_CACHE: {
+               __u8 cache_type, cache_op, cache_result;
+               static char name[100];
+
+               cache_type   = (config >>  0) & 0xff;
+               if (cache_type > PERF_COUNT_HW_CACHE_MAX)
+                       return "unknown-ext-hardware-cache-type";
+
+               cache_op     = (config >>  8) & 0xff;
+               if (cache_op > PERF_COUNT_HW_CACHE_OP_MAX)
+                       return "unknown-ext-hardware-cache-op";
+
+               cache_result = (config >> 16) & 0xff;
+               if (cache_result > PERF_COUNT_HW_CACHE_RESULT_MAX)
+                       return "unknown-ext-hardware-cache-result";
+
+               sprintf(name, "%s-Cache-%s-%ses",
+                       hw_cache[cache_type][0],
+                       hw_cache_op[cache_op][0],
+                       hw_cache_result[cache_result][0]);
+
+               return name;
+       }
+
+       case PERF_TYPE_SOFTWARE:
+               if (config < PERF_COUNT_SW_MAX)
+                       return sw_event_names[config];
+               return "unknown-software";
+
+       default:
+               break;
+       }
+
+       return "unknown";
+}
+
+static int parse_aliases(const char *str, char *names[][MAX_ALIASES], int size)
+{
+       int i, j;
+
+       for (i = 0; i < size; i++) {
+               for (j = 0; j < MAX_ALIASES; j++) {
+                       if (!names[i][j])
+                               break;
+                       if (strcasestr(str, names[i][j]))
+                               return i;
+               }
+       }
+
+       return -1;
+}
+
+static int parse_generic_hw_symbols(const char *str, struct perf_counter_attr *attr)
+{
+       int cache_type = -1, cache_op = 0, cache_result = 0;
+
+       cache_type = parse_aliases(str, hw_cache, PERF_COUNT_HW_CACHE_MAX);
+       /*
+        * No fallback - if we cannot get a clear cache type
+        * then bail out:
+        */
+       if (cache_type == -1)
+               return -EINVAL;
+
+       cache_op = parse_aliases(str, hw_cache_op, PERF_COUNT_HW_CACHE_OP_MAX);
+       /*
+        * Fall back to reads:
+        */
+       if (cache_op == -1)
+               cache_op = PERF_COUNT_HW_CACHE_OP_READ;
+
+       cache_result = parse_aliases(str, hw_cache_result,
+                                       PERF_COUNT_HW_CACHE_RESULT_MAX);
+       /*
+        * Fall back to accesses:
+        */
+       if (cache_result == -1)
+               cache_result = PERF_COUNT_HW_CACHE_RESULT_ACCESS;
+
+       attr->config = cache_type | (cache_op << 8) | (cache_result << 16);
+       attr->type = PERF_TYPE_HW_CACHE;
+
+       return 0;
+}
+
+/*
+ * Each event can have multiple symbolic names.
+ * Symbolic names are (almost) exactly matched.
+ */
+static int parse_event_symbols(const char *str, struct perf_counter_attr *attr)
+{
+       __u64 config, id;
+       int type;
+       unsigned int i;
+       const char *sep, *pstr;
+
+       if (str[0] == 'r' && hex2u64(str + 1, &config) > 0) {
+               attr->type = PERF_TYPE_RAW;
+               attr->config = config;
+
+               return 0;
+       }
+
+       pstr = str;
+       sep = strchr(pstr, ':');
+       if (sep) {
+               type = atoi(pstr);
+               pstr = sep + 1;
+               id = atoi(pstr);
+               sep = strchr(pstr, ':');
+               if (sep) {
+                       pstr = sep + 1;
+                       if (strchr(pstr, 'k'))
+                               attr->exclude_user = 1;
+                       if (strchr(pstr, 'u'))
+                               attr->exclude_kernel = 1;
+               }
+               attr->type = type;
+               attr->config = id;
+
+               return 0;
+       }
+
+       for (i = 0; i < ARRAY_SIZE(event_symbols); i++) {
+               if (!strncmp(str, event_symbols[i].symbol,
+                            strlen(event_symbols[i].symbol))) {
+
+                       attr->type = event_symbols[i].type;
+                       attr->config = event_symbols[i].config;
+
+                       return 0;
+               }
+       }
+
+       return parse_generic_hw_symbols(str, attr);
+}
+
+int parse_events(const struct option *opt, const char *str, int unset)
+{
+       struct perf_counter_attr attr;
+       int ret;
+
+       memset(&attr, 0, sizeof(attr));
+again:
+       if (nr_counters == MAX_COUNTERS)
+               return -1;
+
+       ret = parse_event_symbols(str, &attr);
+       if (ret < 0)
+               return ret;
+
+       attrs[nr_counters] = attr;
+       nr_counters++;
+
+       str = strstr(str, ",");
+       if (str) {
+               str++;
+               goto again;
+       }
+
+       return 0;
+}
+
+static const char * const event_type_descriptors[] = {
+       "",
+       "Hardware event",
+       "Software event",
+       "Tracepoint event",
+       "Hardware cache event",
+};
+
+/*
+ * Print the help text for the event symbols:
+ */
+void print_events(void)
+{
+       struct event_symbol *syms = event_symbols;
+       unsigned int i, type, prev_type = -1;
+
+       fprintf(stderr, "\n");
+       fprintf(stderr, "List of pre-defined events (to be used in -e):\n");
+
+       for (i = 0; i < ARRAY_SIZE(event_symbols); i++, syms++) {
+               type = syms->type + 1;
+               if (type > ARRAY_SIZE(event_type_descriptors))
+                       type = 0;
+
+               if (type != prev_type)
+                       fprintf(stderr, "\n");
+
+               fprintf(stderr, "  %-30s [%s]\n", syms->symbol,
+                       event_type_descriptors[type]);
+
+               prev_type = type;
+       }
+
+       fprintf(stderr, "\n");
+       fprintf(stderr, "  %-30s [raw hardware event descriptor]\n",
+               "rNNN");
+       fprintf(stderr, "\n");
+
+       exit(129);
+}
diff --git a/tools/perf/util/parse-events.h b/tools/perf/util/parse-events.h

new file mode 100644 (file)

index 0000000..e3d5529
--- /dev/null
+++ b/tools/perf/util/parse-events.h
@@ -0,0 +1,17 @@
+
+/*
+ * Parse symbolic events/counts passed in as options:
+ */
+
+extern int                     nr_counters;
+
+extern struct perf_counter_attr attrs[MAX_COUNTERS];
+
+extern char *event_name(int ctr);
+
+extern int parse_events(const struct option *opt, const char *str, int unset);
+
+#define EVENTS_HELP_MAX (128*1024)
+
+extern void print_events(void);
+
diff --git a/tools/perf/util/parse-options.c b/tools/perf/util/parse-options.c

new file mode 100644 (file)

index 0000000..b3affb1
--- /dev/null
+++ b/tools/perf/util/parse-options.c
@@ -0,0 +1,508 @@
+#include "util.h"
+#include "parse-options.h"
+#include "cache.h"
+
+#define OPT_SHORT 1
+#define OPT_UNSET 2
+
+static int opterror(const struct option *opt, const char *reason, int flags)
+{
+       if (flags & OPT_SHORT)
+               return error("switch `%c' %s", opt->short_name, reason);
+       if (flags & OPT_UNSET)
+               return error("option `no-%s' %s", opt->long_name, reason);
+       return error("option `%s' %s", opt->long_name, reason);
+}
+
+static int get_arg(struct parse_opt_ctx_t *p, const struct option *opt,
+                  int flags, const char **arg)
+{
+       if (p->opt) {
+               *arg = p->opt;
+               p->opt = NULL;
+       } else if (p->argc == 1 && (opt->flags & PARSE_OPT_LASTARG_DEFAULT)) {
+               *arg = (const char *)opt->defval;
+       } else if (p->argc > 1) {
+               p->argc--;
+               *arg = *++p->argv;
+       } else
+               return opterror(opt, "requires a value", flags);
+       return 0;
+}
+
+static int get_value(struct parse_opt_ctx_t *p,
+                    const struct option *opt, int flags)
+{
+       const char *s, *arg = NULL;
+       const int unset = flags & OPT_UNSET;
+
+       if (unset && p->opt)
+               return opterror(opt, "takes no value", flags);
+       if (unset && (opt->flags & PARSE_OPT_NONEG))
+               return opterror(opt, "isn't available", flags);
+
+       if (!(flags & OPT_SHORT) && p->opt) {
+               switch (opt->type) {
+               case OPTION_CALLBACK:
+                       if (!(opt->flags & PARSE_OPT_NOARG))
+                               break;
+                       /* FALLTHROUGH */
+               case OPTION_BOOLEAN:
+               case OPTION_BIT:
+               case OPTION_SET_INT:
+               case OPTION_SET_PTR:
+                       return opterror(opt, "takes no value", flags);
+               default:
+                       break;
+               }
+       }
+
+       switch (opt->type) {
+       case OPTION_BIT:
+               if (unset)
+                       *(int *)opt->value &= ~opt->defval;
+               else
+                       *(int *)opt->value |= opt->defval;
+               return 0;
+
+       case OPTION_BOOLEAN:
+               *(int *)opt->value = unset ? 0 : *(int *)opt->value + 1;
+               return 0;
+
+       case OPTION_SET_INT:
+               *(int *)opt->value = unset ? 0 : opt->defval;
+               return 0;
+
+       case OPTION_SET_PTR:
+               *(void **)opt->value = unset ? NULL : (void *)opt->defval;
+               return 0;
+
+       case OPTION_STRING:
+               if (unset)
+                       *(const char **)opt->value = NULL;
+               else if (opt->flags & PARSE_OPT_OPTARG && !p->opt)
+                       *(const char **)opt->value = (const char *)opt->defval;
+               else
+                       return get_arg(p, opt, flags, (const char **)opt->value);
+               return 0;
+
+       case OPTION_CALLBACK:
+               if (unset)
+                       return (*opt->callback)(opt, NULL, 1) ? (-1) : 0;
+               if (opt->flags & PARSE_OPT_NOARG)
+                       return (*opt->callback)(opt, NULL, 0) ? (-1) : 0;
+               if (opt->flags & PARSE_OPT_OPTARG && !p->opt)
+                       return (*opt->callback)(opt, NULL, 0) ? (-1) : 0;
+               if (get_arg(p, opt, flags, &arg))
+                       return -1;
+               return (*opt->callback)(opt, arg, 0) ? (-1) : 0;
+
+       case OPTION_INTEGER:
+               if (unset) {
+                       *(int *)opt->value = 0;
+                       return 0;
+               }
+               if (opt->flags & PARSE_OPT_OPTARG && !p->opt) {
+                       *(int *)opt->value = opt->defval;
+                       return 0;
+               }
+               if (get_arg(p, opt, flags, &arg))
+                       return -1;
+               *(int *)opt->value = strtol(arg, (char **)&s, 10);
+               if (*s)
+                       return opterror(opt, "expects a numerical value", flags);
+               return 0;
+
+       case OPTION_LONG:
+               if (unset) {
+                       *(long *)opt->value = 0;
+                       return 0;
+               }
+               if (opt->flags & PARSE_OPT_OPTARG && !p->opt) {
+                       *(long *)opt->value = opt->defval;
+                       return 0;
+               }
+               if (get_arg(p, opt, flags, &arg))
+                       return -1;
+               *(long *)opt->value = strtol(arg, (char **)&s, 10);
+               if (*s)
+                       return opterror(opt, "expects a numerical value", flags);
+               return 0;
+
+       default:
+               die("should not happen, someone must be hit on the forehead");
+       }
+}
+
+static int parse_short_opt(struct parse_opt_ctx_t *p, const struct option *options)
+{
+       for (; options->type != OPTION_END; options++) {
+               if (options->short_name == *p->opt) {
+                       p->opt = p->opt[1] ? p->opt + 1 : NULL;
+                       return get_value(p, options, OPT_SHORT);
+               }
+       }
+       return -2;
+}
+
+static int parse_long_opt(struct parse_opt_ctx_t *p, const char *arg,
+                          const struct option *options)
+{
+       const char *arg_end = strchr(arg, '=');
+       const struct option *abbrev_option = NULL, *ambiguous_option = NULL;
+       int abbrev_flags = 0, ambiguous_flags = 0;
+
+       if (!arg_end)
+               arg_end = arg + strlen(arg);
+
+       for (; options->type != OPTION_END; options++) {
+               const char *rest;
+               int flags = 0;
+
+               if (!options->long_name)
+                       continue;
+
+               rest = skip_prefix(arg, options->long_name);
+               if (options->type == OPTION_ARGUMENT) {
+                       if (!rest)
+                               continue;
+                       if (*rest == '=')
+                               return opterror(options, "takes no value", flags);
+                       if (*rest)
+                               continue;
+                       p->out[p->cpidx++] = arg - 2;
+                       return 0;
+               }
+               if (!rest) {
+                       /* abbreviated? */
+                       if (!strncmp(options->long_name, arg, arg_end - arg)) {
+is_abbreviated:
+                               if (abbrev_option) {
+                                       /*
+                                        * If this is abbreviated, it is
+                                        * ambiguous. So when there is no
+                                        * exact match later, we need to
+                                        * error out.
+                                        */
+                                       ambiguous_option = abbrev_option;
+                                       ambiguous_flags = abbrev_flags;
+                               }
+                               if (!(flags & OPT_UNSET) && *arg_end)
+                                       p->opt = arg_end + 1;
+                               abbrev_option = options;
+                               abbrev_flags = flags;
+                               continue;
+                       }
+                       /* negated and abbreviated very much? */
+                       if (!prefixcmp("no-", arg)) {
+                               flags |= OPT_UNSET;
+                               goto is_abbreviated;
+                       }
+                       /* negated? */
+                       if (strncmp(arg, "no-", 3))
+                               continue;
+                       flags |= OPT_UNSET;
+                       rest = skip_prefix(arg + 3, options->long_name);
+                       /* abbreviated and negated? */
+                       if (!rest && !prefixcmp(options->long_name, arg + 3))
+                               goto is_abbreviated;
+                       if (!rest)
+                               continue;
+               }
+               if (*rest) {
+                       if (*rest != '=')
+                               continue;
+                       p->opt = rest + 1;
+               }
+               return get_value(p, options, flags);
+       }
+
+       if (ambiguous_option)
+               return error("Ambiguous option: %s "
+                       "(could be --%s%s or --%s%s)",
+                       arg,
+                       (ambiguous_flags & OPT_UNSET) ?  "no-" : "",
+                       ambiguous_option->long_name,
+                       (abbrev_flags & OPT_UNSET) ?  "no-" : "",
+                       abbrev_option->long_name);
+       if (abbrev_option)
+               return get_value(p, abbrev_option, abbrev_flags);
+       return -2;
+}
+
+static void check_typos(const char *arg, const struct option *options)
+{
+       if (strlen(arg) < 3)
+               return;
+
+       if (!prefixcmp(arg, "no-")) {
+               error ("did you mean `--%s` (with two dashes ?)", arg);
+               exit(129);
+       }
+
+       for (; options->type != OPTION_END; options++) {
+               if (!options->long_name)
+                       continue;
+               if (!prefixcmp(options->long_name, arg)) {
+                       error ("did you mean `--%s` (with two dashes ?)", arg);
+                       exit(129);
+               }
+       }
+}
+
+void parse_options_start(struct parse_opt_ctx_t *ctx,
+                        int argc, const char **argv, int flags)
+{
+       memset(ctx, 0, sizeof(*ctx));
+       ctx->argc = argc - 1;
+       ctx->argv = argv + 1;
+       ctx->out  = argv;
+       ctx->cpidx = ((flags & PARSE_OPT_KEEP_ARGV0) != 0);
+       ctx->flags = flags;
+       if ((flags & PARSE_OPT_KEEP_UNKNOWN) &&
+           (flags & PARSE_OPT_STOP_AT_NON_OPTION))
+               die("STOP_AT_NON_OPTION and KEEP_UNKNOWN don't go together");
+}
+
+static int usage_with_options_internal(const char * const *,
+                                      const struct option *, int);
+
+int parse_options_step(struct parse_opt_ctx_t *ctx,
+                      const struct option *options,
+                      const char * const usagestr[])
+{
+       int internal_help = !(ctx->flags & PARSE_OPT_NO_INTERNAL_HELP);
+
+       /* we must reset ->opt, unknown short option leave it dangling */
+       ctx->opt = NULL;
+
+       for (; ctx->argc; ctx->argc--, ctx->argv++) {
+               const char *arg = ctx->argv[0];
+
+               if (*arg != '-' || !arg[1]) {
+                       if (ctx->flags & PARSE_OPT_STOP_AT_NON_OPTION)
+                               break;
+                       ctx->out[ctx->cpidx++] = ctx->argv[0];
+                       continue;
+               }
+
+               if (arg[1] != '-') {
+                       ctx->opt = arg + 1;
+                       if (internal_help && *ctx->opt == 'h')
+                               return parse_options_usage(usagestr, options);
+                       switch (parse_short_opt(ctx, options)) {
+                       case -1:
+                               return parse_options_usage(usagestr, options);
+                       case -2:
+                               goto unknown;
+                       }
+                       if (ctx->opt)
+                               check_typos(arg + 1, options);
+                       while (ctx->opt) {
+                               if (internal_help && *ctx->opt == 'h')
+                                       return parse_options_usage(usagestr, options);
+                               switch (parse_short_opt(ctx, options)) {
+                               case -1:
+                                       return parse_options_usage(usagestr, options);
+                               case -2:
+                                       /* fake a short option thing to hide the fact that we may have
+                                        * started to parse aggregated stuff
+                                        *
+                                        * This is leaky, too bad.
+                                        */
+                                       ctx->argv[0] = strdup(ctx->opt - 1);
+                                       *(char *)ctx->argv[0] = '-';
+                                       goto unknown;
+                               }
+                       }
+                       continue;
+               }
+
+               if (!arg[2]) { /* "--" */
+                       if (!(ctx->flags & PARSE_OPT_KEEP_DASHDASH)) {
+                               ctx->argc--;
+                               ctx->argv++;
+                       }
+                       break;
+               }
+
+               if (internal_help && !strcmp(arg + 2, "help-all"))
+                       return usage_with_options_internal(usagestr, options, 1);
+               if (internal_help && !strcmp(arg + 2, "help"))
+                       return parse_options_usage(usagestr, options);
+               switch (parse_long_opt(ctx, arg + 2, options)) {
+               case -1:
+                       return parse_options_usage(usagestr, options);
+               case -2:
+                       goto unknown;
+               }
+               continue;
+unknown:
+               if (!(ctx->flags & PARSE_OPT_KEEP_UNKNOWN))
+                       return PARSE_OPT_UNKNOWN;
+               ctx->out[ctx->cpidx++] = ctx->argv[0];
+               ctx->opt = NULL;
+       }
+       return PARSE_OPT_DONE;
+}
+
+int parse_options_end(struct parse_opt_ctx_t *ctx)
+{
+       memmove(ctx->out + ctx->cpidx, ctx->argv, ctx->argc * sizeof(*ctx->out));
+       ctx->out[ctx->cpidx + ctx->argc] = NULL;
+       return ctx->cpidx + ctx->argc;
+}
+
+int parse_options(int argc, const char **argv, const struct option *options,
+                 const char * const usagestr[], int flags)
+{
+       struct parse_opt_ctx_t ctx;
+
+       parse_options_start(&ctx, argc, argv, flags);
+       switch (parse_options_step(&ctx, options, usagestr)) {
+       case PARSE_OPT_HELP:
+               exit(129);
+       case PARSE_OPT_DONE:
+               break;
+       default: /* PARSE_OPT_UNKNOWN */
+               if (ctx.argv[0][1] == '-') {
+                       error("unknown option `%s'", ctx.argv[0] + 2);
+               } else {
+                       error("unknown switch `%c'", *ctx.opt);
+               }
+               usage_with_options(usagestr, options);
+       }
+
+       return parse_options_end(&ctx);
+}
+
+#define USAGE_OPTS_WIDTH 24
+#define USAGE_GAP         2
+
+int usage_with_options_internal(const char * const *usagestr,
+                               const struct option *opts, int full)
+{
+       if (!usagestr)
+               return PARSE_OPT_HELP;
+
+       fprintf(stderr, "\n usage: %s\n", *usagestr++);
+       while (*usagestr && **usagestr)
+               fprintf(stderr, "    or: %s\n", *usagestr++);
+       while (*usagestr) {
+               fprintf(stderr, "%s%s\n",
+                               **usagestr ? "    " : "",
+                               *usagestr);
+               usagestr++;
+       }
+
+       if (opts->type != OPTION_GROUP)
+               fputc('\n', stderr);
+
+       for (; opts->type != OPTION_END; opts++) {
+               size_t pos;
+               int pad;
+
+               if (opts->type == OPTION_GROUP) {
+                       fputc('\n', stderr);
+                       if (*opts->help)
+                               fprintf(stderr, "%s\n", opts->help);
+                       continue;
+               }
+               if (!full && (opts->flags & PARSE_OPT_HIDDEN))
+                       continue;
+
+               pos = fprintf(stderr, "    ");
+               if (opts->short_name)
+                       pos += fprintf(stderr, "-%c", opts->short_name);
+               if (opts->long_name && opts->short_name)
+                       pos += fprintf(stderr, ", ");
+               if (opts->long_name)
+                       pos += fprintf(stderr, "--%s", opts->long_name);
+
+               switch (opts->type) {
+               case OPTION_ARGUMENT:
+                       break;
+               case OPTION_INTEGER:
+                       if (opts->flags & PARSE_OPT_OPTARG)
+                               if (opts->long_name)
+                                       pos += fprintf(stderr, "[=<n>]");
+                               else
+                                       pos += fprintf(stderr, "[<n>]");
+                       else
+                               pos += fprintf(stderr, " <n>");
+                       break;
+               case OPTION_CALLBACK:
+                       if (opts->flags & PARSE_OPT_NOARG)
+                               break;
+                       /* FALLTHROUGH */
+               case OPTION_STRING:
+                       if (opts->argh) {
+                               if (opts->flags & PARSE_OPT_OPTARG)
+                                       if (opts->long_name)
+                                               pos += fprintf(stderr, "[=<%s>]", opts->argh);
+                                       else
+                                               pos += fprintf(stderr, "[<%s>]", opts->argh);
+                               else
+                                       pos += fprintf(stderr, " <%s>", opts->argh);
+                       } else {
+                               if (opts->flags & PARSE_OPT_OPTARG)
+                                       if (opts->long_name)
+                                               pos += fprintf(stderr, "[=...]");
+                                       else
+                                               pos += fprintf(stderr, "[...]");
+                               else
+                                       pos += fprintf(stderr, " ...");
+                       }
+                       break;
+               default: /* OPTION_{BIT,BOOLEAN,SET_INT,SET_PTR} */
+                       break;
+               }
+
+               if (pos <= USAGE_OPTS_WIDTH)
+                       pad = USAGE_OPTS_WIDTH - pos;
+               else {
+                       fputc('\n', stderr);
+                       pad = USAGE_OPTS_WIDTH;
+               }
+               fprintf(stderr, "%*s%s\n", pad + USAGE_GAP, "", opts->help);
+       }
+       fputc('\n', stderr);
+
+       return PARSE_OPT_HELP;
+}
+
+void usage_with_options(const char * const *usagestr,
+                       const struct option *opts)
+{
+       usage_with_options_internal(usagestr, opts, 0);
+       exit(129);
+}
+
+int parse_options_usage(const char * const *usagestr,
+                       const struct option *opts)
+{
+       return usage_with_options_internal(usagestr, opts, 0);
+}
+
+
+int parse_opt_verbosity_cb(const struct option *opt, const char *arg,
+                          int unset)
+{
+       int *target = opt->value;
+
+       if (unset)
+               /* --no-quiet, --no-verbose */
+               *target = 0;
+       else if (opt->short_name == 'v') {
+               if (*target >= 0)
+                       (*target)++;
+               else
+                       *target = 1;
+       } else {
+               if (*target <= 0)
+                       (*target)--;
+               else
+                       *target = -1;
+       }
+       return 0;
+}
diff --git a/tools/perf/util/parse-options.h b/tools/perf/util/parse-options.h

new file mode 100644 (file)

index 0000000..a1039a6
--- /dev/null
+++ b/tools/perf/util/parse-options.h
@@ -0,0 +1,174 @@
+#ifndef PARSE_OPTIONS_H
+#define PARSE_OPTIONS_H
+
+enum parse_opt_type {
+       /* special types */
+       OPTION_END,
+       OPTION_ARGUMENT,
+       OPTION_GROUP,
+       /* options with no arguments */
+       OPTION_BIT,
+       OPTION_BOOLEAN, /* _INCR would have been a better name */
+       OPTION_SET_INT,
+       OPTION_SET_PTR,
+       /* options with arguments (usually) */
+       OPTION_STRING,
+       OPTION_INTEGER,
+       OPTION_LONG,
+       OPTION_CALLBACK,
+};
+
+enum parse_opt_flags {
+       PARSE_OPT_KEEP_DASHDASH = 1,
+       PARSE_OPT_STOP_AT_NON_OPTION = 2,
+       PARSE_OPT_KEEP_ARGV0 = 4,
+       PARSE_OPT_KEEP_UNKNOWN = 8,
+       PARSE_OPT_NO_INTERNAL_HELP = 16,
+};
+
+enum parse_opt_option_flags {
+       PARSE_OPT_OPTARG  = 1,
+       PARSE_OPT_NOARG   = 2,
+       PARSE_OPT_NONEG   = 4,
+       PARSE_OPT_HIDDEN  = 8,
+       PARSE_OPT_LASTARG_DEFAULT = 16,
+};
+
+struct option;
+typedef int parse_opt_cb(const struct option *, const char *arg, int unset);
+
+/*
+ * `type`::
+ *   holds the type of the option, you must have an OPTION_END last in your
+ *   array.
+ *
+ * `short_name`::
+ *   the character to use as a short option name, '\0' if none.
+ *
+ * `long_name`::
+ *   the long option name, without the leading dashes, NULL if none.
+ *
+ * `value`::
+ *   stores pointers to the values to be filled.
+ *
+ * `argh`::
+ *   token to explain the kind of argument this option wants. Keep it
+ *   homogenous across the repository.
+ *
+ * `help`::
+ *   the short help associated to what the option does.
+ *   Must never be NULL (except for OPTION_END).
+ *   OPTION_GROUP uses this pointer to store the group header.
+ *
+ * `flags`::
+ *   mask of parse_opt_option_flags.
+ *   PARSE_OPT_OPTARG: says that the argument is optionnal (not for BOOLEANs)
+ *   PARSE_OPT_NOARG: says that this option takes no argument, for CALLBACKs
+ *   PARSE_OPT_NONEG: says that this option cannot be negated
+ *   PARSE_OPT_HIDDEN this option is skipped in the default usage, showed in
+ *                    the long one.
+ *
+ * `callback`::
+ *   pointer to the callback to use for OPTION_CALLBACK.
+ *
+ * `defval`::
+ *   default value to fill (*->value) with for PARSE_OPT_OPTARG.
+ *   OPTION_{BIT,SET_INT,SET_PTR} store the {mask,integer,pointer} to put in
+ *   the value when met.
+ *   CALLBACKS can use it like they want.
+ */
+struct option {
+       enum parse_opt_type type;
+       int short_name;
+       const char *long_name;
+       void *value;
+       const char *argh;
+       const char *help;
+
+       int flags;
+       parse_opt_cb *callback;
+       intptr_t defval;
+};
+
+#define OPT_END()                   { OPTION_END }
+#define OPT_ARGUMENT(l, h)          { OPTION_ARGUMENT, 0, (l), NULL, NULL, (h) }
+#define OPT_GROUP(h)                { OPTION_GROUP, 0, NULL, NULL, NULL, (h) }
+#define OPT_BIT(s, l, v, h, b)      { OPTION_BIT, (s), (l), (v), NULL, (h), 0, NULL, (b) }
+#define OPT_BOOLEAN(s, l, v, h)     { OPTION_BOOLEAN, (s), (l), (v), NULL, (h) }
+#define OPT_SET_INT(s, l, v, h, i)  { OPTION_SET_INT, (s), (l), (v), NULL, (h), 0, NULL, (i) }
+#define OPT_SET_PTR(s, l, v, h, p)  { OPTION_SET_PTR, (s), (l), (v), NULL, (h), 0, NULL, (p) }
+#define OPT_INTEGER(s, l, v, h)     { OPTION_INTEGER, (s), (l), (v), NULL, (h) }
+#define OPT_LONG(s, l, v, h)        { OPTION_LONG, (s), (l), (v), NULL, (h) }
+#define OPT_STRING(s, l, v, a, h)   { OPTION_STRING,  (s), (l), (v), (a), (h) }
+#define OPT_DATE(s, l, v, h) \
+       { OPTION_CALLBACK, (s), (l), (v), "time",(h), 0, \
+         parse_opt_approxidate_cb }
+#define OPT_CALLBACK(s, l, v, a, h, f) \
+       { OPTION_CALLBACK, (s), (l), (v), (a), (h), 0, (f) }
+
+/* parse_options() will filter out the processed options and leave the
+ * non-option argments in argv[].
+ * Returns the number of arguments left in argv[].
+ */
+extern int parse_options(int argc, const char **argv,
+                         const struct option *options,
+                         const char * const usagestr[], int flags);
+
+extern NORETURN void usage_with_options(const char * const *usagestr,
+                                        const struct option *options);
+
+/*----- incremantal advanced APIs -----*/
+
+enum {
+       PARSE_OPT_HELP = -1,
+       PARSE_OPT_DONE,
+       PARSE_OPT_UNKNOWN,
+};
+
+/*
+ * It's okay for the caller to consume argv/argc in the usual way.
+ * Other fields of that structure are private to parse-options and should not
+ * be modified in any way.
+ */
+struct parse_opt_ctx_t {
+       const char **argv;
+       const char **out;
+       int argc, cpidx;
+       const char *opt;
+       int flags;
+};
+
+extern int parse_options_usage(const char * const *usagestr,
+                              const struct option *opts);
+
+extern void parse_options_start(struct parse_opt_ctx_t *ctx,
+                               int argc, const char **argv, int flags);
+
+extern int parse_options_step(struct parse_opt_ctx_t *ctx,
+                             const struct option *options,
+                             const char * const usagestr[]);
+
+extern int parse_options_end(struct parse_opt_ctx_t *ctx);
+
+
+/*----- some often used options -----*/
+extern int parse_opt_abbrev_cb(const struct option *, const char *, int);
+extern int parse_opt_approxidate_cb(const struct option *, const char *, int);
+extern int parse_opt_verbosity_cb(const struct option *, const char *, int);
+
+#define OPT__VERBOSE(var)  OPT_BOOLEAN('v', "verbose", (var), "be verbose")
+#define OPT__QUIET(var)    OPT_BOOLEAN('q', "quiet",   (var), "be quiet")
+#define OPT__VERBOSITY(var) \
+       { OPTION_CALLBACK, 'v', "verbose", (var), NULL, "be more verbose", \
+         PARSE_OPT_NOARG, &parse_opt_verbosity_cb, 0 }, \
+       { OPTION_CALLBACK, 'q', "quiet", (var), NULL, "be more quiet", \
+         PARSE_OPT_NOARG, &parse_opt_verbosity_cb, 0 }
+#define OPT__DRY_RUN(var)  OPT_BOOLEAN('n', "dry-run", (var), "dry run")
+#define OPT__ABBREV(var)  \
+       { OPTION_CALLBACK, 0, "abbrev", (var), "n", \
+         "use <n> digits to display SHA-1s", \
+         PARSE_OPT_OPTARG, &parse_opt_abbrev_cb, 0 }
+
+extern const char *parse_options_fix_filename(const char *prefix, const char *file);
+
+#endif
diff --git a/tools/perf/util/path.c b/tools/perf/util/path.c

new file mode 100644 (file)

index 0000000..a501a40
--- /dev/null
+++ b/tools/perf/util/path.c
@@ -0,0 +1,353 @@
+/*
+ * I'm tired of doing "vsnprintf()" etc just to open a
+ * file, so here's a "return static buffer with printf"
+ * interface for paths.
+ *
+ * It's obviously not thread-safe. Sue me. But it's quite
+ * useful for doing things like
+ *
+ *   f = open(mkpath("%s/%s.perf", base, name), O_RDONLY);
+ *
+ * which is what it's designed for.
+ */
+#include "cache.h"
+
+static char bad_path[] = "/bad-path/";
+/*
+ * Two hacks:
+ */
+
+static char *get_perf_dir(void)
+{
+       return ".";
+}
+
+size_t strlcpy(char *dest, const char *src, size_t size)
+{
+       size_t ret = strlen(src);
+
+       if (size) {
+               size_t len = (ret >= size) ? size - 1 : ret;
+               memcpy(dest, src, len);
+               dest[len] = '\0';
+       }
+       return ret;
+}
+
+
+static char *get_pathname(void)
+{
+       static char pathname_array[4][PATH_MAX];
+       static int index;
+       return pathname_array[3 & ++index];
+}
+
+static char *cleanup_path(char *path)
+{
+       /* Clean it up */
+       if (!memcmp(path, "./", 2)) {
+               path += 2;
+               while (*path == '/')
+                       path++;
+       }
+       return path;
+}
+
+char *mksnpath(char *buf, size_t n, const char *fmt, ...)
+{
+       va_list args;
+       unsigned len;
+
+       va_start(args, fmt);
+       len = vsnprintf(buf, n, fmt, args);
+       va_end(args);
+       if (len >= n) {
+               strlcpy(buf, bad_path, n);
+               return buf;
+       }
+       return cleanup_path(buf);
+}
+
+static char *perf_vsnpath(char *buf, size_t n, const char *fmt, va_list args)
+{
+       const char *perf_dir = get_perf_dir();
+       size_t len;
+
+       len = strlen(perf_dir);
+       if (n < len + 1)
+               goto bad;
+       memcpy(buf, perf_dir, len);
+       if (len && !is_dir_sep(perf_dir[len-1]))
+               buf[len++] = '/';
+       len += vsnprintf(buf + len, n - len, fmt, args);
+       if (len >= n)
+               goto bad;
+       return cleanup_path(buf);
+bad:
+       strlcpy(buf, bad_path, n);
+       return buf;
+}
+
+char *perf_snpath(char *buf, size_t n, const char *fmt, ...)
+{
+       va_list args;
+       va_start(args, fmt);
+       (void)perf_vsnpath(buf, n, fmt, args);
+       va_end(args);
+       return buf;
+}
+
+char *perf_pathdup(const char *fmt, ...)
+{
+       char path[PATH_MAX];
+       va_list args;
+       va_start(args, fmt);
+       (void)perf_vsnpath(path, sizeof(path), fmt, args);
+       va_end(args);
+       return xstrdup(path);
+}
+
+char *mkpath(const char *fmt, ...)
+{
+       va_list args;
+       unsigned len;
+       char *pathname = get_pathname();
+
+       va_start(args, fmt);
+       len = vsnprintf(pathname, PATH_MAX, fmt, args);
+       va_end(args);
+       if (len >= PATH_MAX)
+               return bad_path;
+       return cleanup_path(pathname);
+}
+
+char *perf_path(const char *fmt, ...)
+{
+       const char *perf_dir = get_perf_dir();
+       char *pathname = get_pathname();
+       va_list args;
+       unsigned len;
+
+       len = strlen(perf_dir);
+       if (len > PATH_MAX-100)
+               return bad_path;
+       memcpy(pathname, perf_dir, len);
+       if (len && perf_dir[len-1] != '/')
+               pathname[len++] = '/';
+       va_start(args, fmt);
+       len += vsnprintf(pathname + len, PATH_MAX - len, fmt, args);
+       va_end(args);
+       if (len >= PATH_MAX)
+               return bad_path;
+       return cleanup_path(pathname);
+}
+
+
+/* perf_mkstemp() - create tmp file honoring TMPDIR variable */
+int perf_mkstemp(char *path, size_t len, const char *template)
+{
+       const char *tmp;
+       size_t n;
+
+       tmp = getenv("TMPDIR");
+       if (!tmp)
+               tmp = "/tmp";
+       n = snprintf(path, len, "%s/%s", tmp, template);
+       if (len <= n) {
+               errno = ENAMETOOLONG;
+               return -1;
+       }
+       return mkstemp(path);
+}
+
+
+const char *make_relative_path(const char *abs, const char *base)
+{
+       static char buf[PATH_MAX + 1];
+       int baselen;
+       if (!base)
+               return abs;
+       baselen = strlen(base);
+       if (prefixcmp(abs, base))
+               return abs;
+       if (abs[baselen] == '/')
+               baselen++;
+       else if (base[baselen - 1] != '/')
+               return abs;
+       strcpy(buf, abs + baselen);
+       return buf;
+}
+
+/*
+ * It is okay if dst == src, but they should not overlap otherwise.
+ *
+ * Performs the following normalizations on src, storing the result in dst:
+ * - Ensures that components are separated by '/' (Windows only)
+ * - Squashes sequences of '/'.
+ * - Removes "." components.
+ * - Removes ".." components, and the components the precede them.
+ * Returns failure (non-zero) if a ".." component appears as first path
+ * component anytime during the normalization. Otherwise, returns success (0).
+ *
+ * Note that this function is purely textual.  It does not follow symlinks,
+ * verify the existence of the path, or make any system calls.
+ */
+int normalize_path_copy(char *dst, const char *src)
+{
+       char *dst0;
+
+       if (has_dos_drive_prefix(src)) {
+               *dst++ = *src++;
+               *dst++ = *src++;
+       }
+       dst0 = dst;
+
+       if (is_dir_sep(*src)) {
+               *dst++ = '/';
+               while (is_dir_sep(*src))
+                       src++;
+       }
+
+       for (;;) {
+               char c = *src;
+
+               /*
+                * A path component that begins with . could be
+                * special:
+                * (1) "." and ends   -- ignore and terminate.
+                * (2) "./"           -- ignore them, eat slash and continue.
+                * (3) ".." and ends  -- strip one and terminate.
+                * (4) "../"          -- strip one, eat slash and continue.
+                */
+               if (c == '.') {
+                       if (!src[1]) {
+                               /* (1) */
+                               src++;
+                       } else if (is_dir_sep(src[1])) {
+                               /* (2) */
+                               src += 2;
+                               while (is_dir_sep(*src))
+                                       src++;
+                               continue;
+                       } else if (src[1] == '.') {
+                               if (!src[2]) {
+                                       /* (3) */
+                                       src += 2;
+                                       goto up_one;
+                               } else if (is_dir_sep(src[2])) {
+                                       /* (4) */
+                                       src += 3;
+                                       while (is_dir_sep(*src))
+                                               src++;
+                                       goto up_one;
+                               }
+                       }
+               }
+
+               /* copy up to the next '/', and eat all '/' */
+               while ((c = *src++) != '\0' && !is_dir_sep(c))
+                       *dst++ = c;
+               if (is_dir_sep(c)) {
+                       *dst++ = '/';
+                       while (is_dir_sep(c))
+                               c = *src++;
+                       src--;
+               } else if (!c)
+                       break;
+               continue;
+
+       up_one:
+               /*
+                * dst0..dst is prefix portion, and dst[-1] is '/';
+                * go up one level.
+                */
+               dst--;  /* go to trailing '/' */
+               if (dst <= dst0)
+                       return -1;
+               /* Windows: dst[-1] cannot be backslash anymore */
+               while (dst0 < dst && dst[-1] != '/')
+                       dst--;
+       }
+       *dst = '\0';
+       return 0;
+}
+
+/*
+ * path = Canonical absolute path
+ * prefix_list = Colon-separated list of absolute paths
+ *
+ * Determines, for each path in prefix_list, whether the "prefix" really
+ * is an ancestor directory of path.  Returns the length of the longest
+ * ancestor directory, excluding any trailing slashes, or -1 if no prefix
+ * is an ancestor.  (Note that this means 0 is returned if prefix_list is
+ * "/".) "/foo" is not considered an ancestor of "/foobar".  Directories
+ * are not considered to be their own ancestors.  path must be in a
+ * canonical form: empty components, or "." or ".." components are not
+ * allowed.  prefix_list may be null, which is like "".
+ */
+int longest_ancestor_length(const char *path, const char *prefix_list)
+{
+       char buf[PATH_MAX+1];
+       const char *ceil, *colon;
+       int len, max_len = -1;
+
+       if (prefix_list == NULL || !strcmp(path, "/"))
+               return -1;
+
+       for (colon = ceil = prefix_list; *colon; ceil = colon+1) {
+               for (colon = ceil; *colon && *colon != PATH_SEP; colon++);
+               len = colon - ceil;
+               if (len == 0 || len > PATH_MAX || !is_absolute_path(ceil))
+                       continue;
+               strlcpy(buf, ceil, len+1);
+               if (normalize_path_copy(buf, buf) < 0)
+                       continue;
+               len = strlen(buf);
+               if (len > 0 && buf[len-1] == '/')
+                       buf[--len] = '\0';
+
+               if (!strncmp(path, buf, len) &&
+                   path[len] == '/' &&
+                   len > max_len) {
+                       max_len = len;
+               }
+       }
+
+       return max_len;
+}
+
+/* strip arbitrary amount of directory separators at end of path */
+static inline int chomp_trailing_dir_sep(const char *path, int len)
+{
+       while (len && is_dir_sep(path[len - 1]))
+               len--;
+       return len;
+}
+
+/*
+ * If path ends with suffix (complete path components), returns the
+ * part before suffix (sans trailing directory separators).
+ * Otherwise returns NULL.
+ */
+char *strip_path_suffix(const char *path, const char *suffix)
+{
+       int path_len = strlen(path), suffix_len = strlen(suffix);
+
+       while (suffix_len) {
+               if (!path_len)
+                       return NULL;
+
+               if (is_dir_sep(path[path_len - 1])) {
+                       if (!is_dir_sep(suffix[suffix_len - 1]))
+                               return NULL;
+                       path_len = chomp_trailing_dir_sep(path, path_len);
+                       suffix_len = chomp_trailing_dir_sep(suffix, suffix_len);
+               }
+               else if (path[--path_len] != suffix[--suffix_len])
+                       return NULL;
+       }
+
+       if (path_len && !is_dir_sep(path[path_len - 1]))
+               return NULL;
+       return xstrndup(path, chomp_trailing_dir_sep(path, path_len));
+}
diff --git a/tools/perf/util/quote.c b/tools/perf/util/quote.c

new file mode 100644 (file)

index 0000000..f18c521
--- /dev/null
+++ b/tools/perf/util/quote.c
@@ -0,0 +1,481 @@
+#include "cache.h"
+#include "quote.h"
+
+int quote_path_fully = 1;
+
+/* Help to copy the thing properly quoted for the shell safety.
+ * any single quote is replaced with '\'', any exclamation point
+ * is replaced with '\!', and the whole thing is enclosed in a
+ *
+ * E.g.
+ *  original     sq_quote     result
+ *  name     ==> name      ==> 'name'
+ *  a b      ==> a b       ==> 'a b'
+ *  a'b      ==> a'\''b    ==> 'a'\''b'
+ *  a!b      ==> a'\!'b    ==> 'a'\!'b'
+ */
+static inline int need_bs_quote(char c)
+{
+       return (c == '\'' || c == '!');
+}
+
+void sq_quote_buf(struct strbuf *dst, const char *src)
+{
+       char *to_free = NULL;
+
+       if (dst->buf == src)
+               to_free = strbuf_detach(dst, NULL);
+
+       strbuf_addch(dst, '\'');
+       while (*src) {
+               size_t len = strcspn(src, "'!");
+               strbuf_add(dst, src, len);
+               src += len;
+               while (need_bs_quote(*src)) {
+                       strbuf_addstr(dst, "'\\");
+                       strbuf_addch(dst, *src++);
+                       strbuf_addch(dst, '\'');
+               }
+       }
+       strbuf_addch(dst, '\'');
+       free(to_free);
+}
+
+void sq_quote_print(FILE *stream, const char *src)
+{
+       char c;
+
+       fputc('\'', stream);
+       while ((c = *src++)) {
+               if (need_bs_quote(c)) {
+                       fputs("'\\", stream);
+                       fputc(c, stream);
+                       fputc('\'', stream);
+               } else {
+                       fputc(c, stream);
+               }
+       }
+       fputc('\'', stream);
+}
+
+void sq_quote_argv(struct strbuf *dst, const char** argv, size_t maxlen)
+{
+       int i;
+
+       /* Copy into destination buffer. */
+       strbuf_grow(dst, 255);
+       for (i = 0; argv[i]; ++i) {
+               strbuf_addch(dst, ' ');
+               sq_quote_buf(dst, argv[i]);
+               if (maxlen && dst->len > maxlen)
+                       die("Too many or long arguments");
+       }
+}
+
+char *sq_dequote_step(char *arg, char **next)
+{
+       char *dst = arg;
+       char *src = arg;
+       char c;
+
+       if (*src != '\'')
+               return NULL;
+       for (;;) {
+               c = *++src;
+               if (!c)
+                       return NULL;
+               if (c != '\'') {
+                       *dst++ = c;
+                       continue;
+               }
+               /* We stepped out of sq */
+               switch (*++src) {
+               case '\0':
+                       *dst = 0;
+                       if (next)
+                               *next = NULL;
+                       return arg;
+               case '\\':
+                       c = *++src;
+                       if (need_bs_quote(c) && *++src == '\'') {
+                               *dst++ = c;
+                               continue;
+                       }
+               /* Fallthrough */
+               default:
+                       if (!next || !isspace(*src))
+                               return NULL;
+                       do {
+                               c = *++src;
+                       } while (isspace(c));
+                       *dst = 0;
+                       *next = src;
+                       return arg;
+               }
+       }
+}
+
+char *sq_dequote(char *arg)
+{
+       return sq_dequote_step(arg, NULL);
+}
+
+int sq_dequote_to_argv(char *arg, const char ***argv, int *nr, int *alloc)
+{
+       char *next = arg;
+
+       if (!*arg)
+               return 0;
+       do {
+               char *dequoted = sq_dequote_step(next, &next);
+               if (!dequoted)
+                       return -1;
+               ALLOC_GROW(*argv, *nr + 1, *alloc);
+               (*argv)[(*nr)++] = dequoted;
+       } while (next);
+
+       return 0;
+}
+
+/* 1 means: quote as octal
+ * 0 means: quote as octal if (quote_path_fully)
+ * -1 means: never quote
+ * c: quote as "\\c"
+ */
+#define X8(x)   x, x, x, x, x, x, x, x
+#define X16(x)  X8(x), X8(x)
+static signed char const sq_lookup[256] = {
+       /*           0    1    2    3    4    5    6    7 */
+       /* 0x00 */   1,   1,   1,   1,   1,   1,   1, 'a',
+       /* 0x08 */ 'b', 't', 'n', 'v', 'f', 'r',   1,   1,
+       /* 0x10 */ X16(1),
+       /* 0x20 */  -1,  -1, '"',  -1,  -1,  -1,  -1,  -1,
+       /* 0x28 */ X16(-1), X16(-1), X16(-1),
+       /* 0x58 */  -1,  -1,  -1,  -1,'\\',  -1,  -1,  -1,
+       /* 0x60 */ X16(-1), X8(-1),
+       /* 0x78 */  -1,  -1,  -1,  -1,  -1,  -1,  -1,   1,
+       /* 0x80 */ /* set to 0 */
+};
+
+static inline int sq_must_quote(char c)
+{
+       return sq_lookup[(unsigned char)c] + quote_path_fully > 0;
+}
+
+/* returns the longest prefix not needing a quote up to maxlen if positive.
+   This stops at the first \0 because it's marked as a character needing an
+   escape */
+static size_t next_quote_pos(const char *s, ssize_t maxlen)
+{
+       size_t len;
+       if (maxlen < 0) {
+               for (len = 0; !sq_must_quote(s[len]); len++);
+       } else {
+               for (len = 0; len < maxlen && !sq_must_quote(s[len]); len++);
+       }
+       return len;
+}
+
+/*
+ * C-style name quoting.
+ *
+ * (1) if sb and fp are both NULL, inspect the input name and counts the
+ *     number of bytes that are needed to hold c_style quoted version of name,
+ *     counting the double quotes around it but not terminating NUL, and
+ *     returns it.
+ *     However, if name does not need c_style quoting, it returns 0.
+ *
+ * (2) if sb or fp are not NULL, it emits the c_style quoted version
+ *     of name, enclosed with double quotes if asked and needed only.
+ *     Return value is the same as in (1).
+ */
+static size_t quote_c_style_counted(const char *name, ssize_t maxlen,
+                                    struct strbuf *sb, FILE *fp, int no_dq)
+{
+#undef EMIT
+#define EMIT(c)                                 \
+       do {                                        \
+               if (sb) strbuf_addch(sb, (c));          \
+               if (fp) fputc((c), fp);                 \
+               count++;                                \
+       } while (0)
+#define EMITBUF(s, l)                           \
+       do {                                        \
+               int __ret;                              \
+               if (sb) strbuf_add(sb, (s), (l));       \
+               if (fp) __ret = fwrite((s), (l), 1, fp);        \
+               count += (l);                           \
+       } while (0)
+
+       size_t len, count = 0;
+       const char *p = name;
+
+       for (;;) {
+               int ch;
+
+               len = next_quote_pos(p, maxlen);
+               if (len == maxlen || !p[len])
+                       break;
+
+               if (!no_dq && p == name)
+                       EMIT('"');
+
+               EMITBUF(p, len);
+               EMIT('\\');
+               p += len;
+               ch = (unsigned char)*p++;
+               if (sq_lookup[ch] >= ' ') {
+                       EMIT(sq_lookup[ch]);
+               } else {
+                       EMIT(((ch >> 6) & 03) + '0');
+                       EMIT(((ch >> 3) & 07) + '0');
+                       EMIT(((ch >> 0) & 07) + '0');
+               }
+       }
+
+       EMITBUF(p, len);
+       if (p == name)   /* no ending quote needed */
+               return 0;
+
+       if (!no_dq)
+               EMIT('"');
+       return count;
+}
+
+size_t quote_c_style(const char *name, struct strbuf *sb, FILE *fp, int nodq)
+{
+       return quote_c_style_counted(name, -1, sb, fp, nodq);
+}
+
+void quote_two_c_style(struct strbuf *sb, const char *prefix, const char *path, int nodq)
+{
+       if (quote_c_style(prefix, NULL, NULL, 0) ||
+           quote_c_style(path, NULL, NULL, 0)) {
+               if (!nodq)
+                       strbuf_addch(sb, '"');
+               quote_c_style(prefix, sb, NULL, 1);
+               quote_c_style(path, sb, NULL, 1);
+               if (!nodq)
+                       strbuf_addch(sb, '"');
+       } else {
+               strbuf_addstr(sb, prefix);
+               strbuf_addstr(sb, path);
+       }
+}
+
+void write_name_quoted(const char *name, FILE *fp, int terminator)
+{
+       if (terminator) {
+               quote_c_style(name, NULL, fp, 0);
+       } else {
+               fputs(name, fp);
+       }
+       fputc(terminator, fp);
+}
+
+extern void write_name_quotedpfx(const char *pfx, size_t pfxlen,
+                                 const char *name, FILE *fp, int terminator)
+{
+       int needquote = 0;
+
+       if (terminator) {
+               needquote = next_quote_pos(pfx, pfxlen) < pfxlen
+                       || name[next_quote_pos(name, -1)];
+       }
+       if (needquote) {
+               fputc('"', fp);
+               quote_c_style_counted(pfx, pfxlen, NULL, fp, 1);
+               quote_c_style(name, NULL, fp, 1);
+               fputc('"', fp);
+       } else {
+               int ret;
+
+               ret = fwrite(pfx, pfxlen, 1, fp);
+               fputs(name, fp);
+       }
+       fputc(terminator, fp);
+}
+
+/* quote path as relative to the given prefix */
+char *quote_path_relative(const char *in, int len,
+                         struct strbuf *out, const char *prefix)
+{
+       int needquote;
+
+       if (len < 0)
+               len = strlen(in);
+
+       /* "../" prefix itself does not need quoting, but "in" might. */
+       needquote = next_quote_pos(in, len) < len;
+       strbuf_setlen(out, 0);
+       strbuf_grow(out, len);
+
+       if (needquote)
+               strbuf_addch(out, '"');
+       if (prefix) {
+               int off = 0;
+               while (prefix[off] && off < len && prefix[off] == in[off])
+                       if (prefix[off] == '/') {
+                               prefix += off + 1;
+                               in += off + 1;
+                               len -= off + 1;
+                               off = 0;
+                       } else
+                               off++;
+
+               for (; *prefix; prefix++)
+                       if (*prefix == '/')
+                               strbuf_addstr(out, "../");
+       }
+
+       quote_c_style_counted (in, len, out, NULL, 1);
+
+       if (needquote)
+               strbuf_addch(out, '"');
+       if (!out->len)
+               strbuf_addstr(out, "./");
+
+       return out->buf;
+}
+
+/*
+ * C-style name unquoting.
+ *
+ * Quoted should point at the opening double quote.
+ * + Returns 0 if it was able to unquote the string properly, and appends the
+ *   result in the strbuf `sb'.
+ * + Returns -1 in case of error, and doesn't touch the strbuf. Though note
+ *   that this function will allocate memory in the strbuf, so calling
+ *   strbuf_release is mandatory whichever result unquote_c_style returns.
+ *
+ * Updates endp pointer to point at one past the ending double quote if given.
+ */
+int unquote_c_style(struct strbuf *sb, const char *quoted, const char **endp)
+{
+       size_t oldlen = sb->len, len;
+       int ch, ac;
+
+       if (*quoted++ != '"')
+               return -1;
+
+       for (;;) {
+               len = strcspn(quoted, "\"\\");
+               strbuf_add(sb, quoted, len);
+               quoted += len;
+
+               switch (*quoted++) {
+                 case '"':
+                       if (endp)
+                               *endp = quoted;
+                       return 0;
+                 case '\\':
+                       break;
+                 default:
+                       goto error;
+               }
+
+               switch ((ch = *quoted++)) {
+               case 'a': ch = '\a'; break;
+               case 'b': ch = '\b'; break;
+               case 'f': ch = '\f'; break;
+               case 'n': ch = '\n'; break;
+               case 'r': ch = '\r'; break;
+               case 't': ch = '\t'; break;
+               case 'v': ch = '\v'; break;
+
+               case '\\': case '"':
+                       break; /* verbatim */
+
+               /* octal values with first digit over 4 overflow */
+               case '0': case '1': case '2': case '3':
+                                       ac = ((ch - '0') << 6);
+                       if ((ch = *quoted++) < '0' || '7' < ch)
+                               goto error;
+                                       ac |= ((ch - '0') << 3);
+                       if ((ch = *quoted++) < '0' || '7' < ch)
+                               goto error;
+                                       ac |= (ch - '0');
+                                       ch = ac;
+                                       break;
+                               default:
+                       goto error;
+                       }
+               strbuf_addch(sb, ch);
+               }
+
+  error:
+       strbuf_setlen(sb, oldlen);
+       return -1;
+}
+
+/* quoting as a string literal for other languages */
+
+void perl_quote_print(FILE *stream, const char *src)
+{
+       const char sq = '\'';
+       const char bq = '\\';
+       char c;
+
+       fputc(sq, stream);
+       while ((c = *src++)) {
+               if (c == sq || c == bq)
+                       fputc(bq, stream);
+               fputc(c, stream);
+       }
+       fputc(sq, stream);
+}
+
+void python_quote_print(FILE *stream, const char *src)
+{
+       const char sq = '\'';
+       const char bq = '\\';
+       const char nl = '\n';
+       char c;
+
+       fputc(sq, stream);
+       while ((c = *src++)) {
+               if (c == nl) {
+                       fputc(bq, stream);
+                       fputc('n', stream);
+                       continue;
+               }
+               if (c == sq || c == bq)
+                       fputc(bq, stream);
+               fputc(c, stream);
+       }
+       fputc(sq, stream);
+}
+
+void tcl_quote_print(FILE *stream, const char *src)
+{
+       char c;
+
+       fputc('"', stream);
+       while ((c = *src++)) {
+               switch (c) {
+               case '[': case ']':
+               case '{': case '}':
+               case '$': case '\\': case '"':
+                       fputc('\\', stream);
+               default:
+                       fputc(c, stream);
+                       break;
+               case '\f':
+                       fputs("\\f", stream);
+                       break;
+               case '\r':
+                       fputs("\\r", stream);
+                       break;
+               case '\n':
+                       fputs("\\n", stream);
+                       break;
+               case '\t':
+                       fputs("\\t", stream);
+                       break;
+               case '\v':
+                       fputs("\\v", stream);
+                       break;
+               }
+       }
+       fputc('"', stream);
+}
diff --git a/tools/perf/util/quote.h b/tools/perf/util/quote.h

new file mode 100644 (file)

index 0000000..5dfad89
--- /dev/null
+++ b/tools/perf/util/quote.h
@@ -0,0 +1,68 @@
+#ifndef QUOTE_H
+#define QUOTE_H
+
+#include <stddef.h>
+#include <stdio.h>
+
+/* Help to copy the thing properly quoted for the shell safety.
+ * any single quote is replaced with '\'', any exclamation point
+ * is replaced with '\!', and the whole thing is enclosed in a
+ * single quote pair.
+ *
+ * For example, if you are passing the result to system() as an
+ * argument:
+ *
+ * sprintf(cmd, "foobar %s %s", sq_quote(arg0), sq_quote(arg1))
+ *
+ * would be appropriate.  If the system() is going to call ssh to
+ * run the command on the other side:
+ *
+ * sprintf(cmd, "git-diff-tree %s %s", sq_quote(arg0), sq_quote(arg1));
+ * sprintf(rcmd, "ssh %s %s", sq_util/quote.host), sq_quote(cmd));
+ *
+ * Note that the above examples leak memory!  Remember to free result from
+ * sq_quote() in a real application.
+ *
+ * sq_quote_buf() writes to an existing buffer of specified size; it
+ * will return the number of characters that would have been written
+ * excluding the final null regardless of the buffer size.
+ */
+
+extern void sq_quote_print(FILE *stream, const char *src);
+
+extern void sq_quote_buf(struct strbuf *, const char *src);
+extern void sq_quote_argv(struct strbuf *, const char **argv, size_t maxlen);
+
+/* This unwraps what sq_quote() produces in place, but returns
+ * NULL if the input does not look like what sq_quote would have
+ * produced.
+ */
+extern char *sq_dequote(char *);
+
+/*
+ * Same as the above, but can be used to unwrap many arguments in the
+ * same string separated by space. "next" is changed to point to the
+ * next argument that should be passed as first parameter. When there
+ * is no more argument to be dequoted, "next" is updated to point to NULL.
+ */
+extern char *sq_dequote_step(char *arg, char **next);
+extern int sq_dequote_to_argv(char *arg, const char ***argv, int *nr, int *alloc);
+
+extern int unquote_c_style(struct strbuf *, const char *quoted, const char **endp);
+extern size_t quote_c_style(const char *name, struct strbuf *, FILE *, int no_dq);
+extern void quote_two_c_style(struct strbuf *, const char *, const char *, int);
+
+extern void write_name_quoted(const char *name, FILE *, int terminator);
+extern void write_name_quotedpfx(const char *pfx, size_t pfxlen,
+                                 const char *name, FILE *, int terminator);
+
+/* quote path as relative to the given prefix */
+char *quote_path_relative(const char *in, int len,
+                         struct strbuf *out, const char *prefix);
+
+/* quoting as a string literal for other languages */
+extern void perl_quote_print(FILE *stream, const char *src);
+extern void python_quote_print(FILE *stream, const char *src);
+extern void tcl_quote_print(FILE *stream, const char *src);
+
+#endif
diff --git a/tools/perf/util/rbtree.c b/tools/perf/util/rbtree.c

new file mode 100644 (file)

index 0000000..b15ba9c
--- /dev/null
+++ b/tools/perf/util/rbtree.c
@@ -0,0 +1,383 @@
+/*
+  Red Black Trees
+  (C) 1999  Andrea Arcangeli <andrea@suse.de>
+  (C) 2002  David Woodhouse <dwmw2@infradead.org>
+  
+  This program is free software; you can redistribute it and/or modify
+  it under the terms of the GNU General Public License as published by
+  the Free Software Foundation; either version 2 of the License, or
+  (at your option) any later version.
+
+  This program is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+  GNU General Public License for more details.
+
+  You should have received a copy of the GNU General Public License
+  along with this program; if not, write to the Free Software
+  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+
+  linux/lib/rbtree.c
+*/
+
+#include "rbtree.h"
+
+static void __rb_rotate_left(struct rb_node *node, struct rb_root *root)
+{
+       struct rb_node *right = node->rb_right;
+       struct rb_node *parent = rb_parent(node);
+
+       if ((node->rb_right = right->rb_left))
+               rb_set_parent(right->rb_left, node);
+       right->rb_left = node;
+
+       rb_set_parent(right, parent);
+
+       if (parent)
+       {
+               if (node == parent->rb_left)
+                       parent->rb_left = right;
+               else
+                       parent->rb_right = right;
+       }
+       else
+               root->rb_node = right;
+       rb_set_parent(node, right);
+}
+
+static void __rb_rotate_right(struct rb_node *node, struct rb_root *root)
+{
+       struct rb_node *left = node->rb_left;
+       struct rb_node *parent = rb_parent(node);
+
+       if ((node->rb_left = left->rb_right))
+               rb_set_parent(left->rb_right, node);
+       left->rb_right = node;
+
+       rb_set_parent(left, parent);
+
+       if (parent)
+       {
+               if (node == parent->rb_right)
+                       parent->rb_right = left;
+               else
+                       parent->rb_left = left;
+       }
+       else
+               root->rb_node = left;
+       rb_set_parent(node, left);
+}
+
+void rb_insert_color(struct rb_node *node, struct rb_root *root)
+{
+       struct rb_node *parent, *gparent;
+
+       while ((parent = rb_parent(node)) && rb_is_red(parent))
+       {
+               gparent = rb_parent(parent);
+
+               if (parent == gparent->rb_left)
+               {
+                       {
+                               register struct rb_node *uncle = gparent->rb_right;
+                               if (uncle && rb_is_red(uncle))
+                               {
+                                       rb_set_black(uncle);
+                                       rb_set_black(parent);
+                                       rb_set_red(gparent);
+                                       node = gparent;
+                                       continue;
+                               }
+                       }
+
+                       if (parent->rb_right == node)
+                       {
+                               register struct rb_node *tmp;
+                               __rb_rotate_left(parent, root);
+                               tmp = parent;
+                               parent = node;
+                               node = tmp;
+                       }
+
+                       rb_set_black(parent);
+                       rb_set_red(gparent);
+                       __rb_rotate_right(gparent, root);
+               } else {
+                       {
+                               register struct rb_node *uncle = gparent->rb_left;
+                               if (uncle && rb_is_red(uncle))
+                               {
+                                       rb_set_black(uncle);
+                                       rb_set_black(parent);
+                                       rb_set_red(gparent);
+                                       node = gparent;
+                                       continue;
+                               }
+                       }
+
+                       if (parent->rb_left == node)
+                       {
+                               register struct rb_node *tmp;
+                               __rb_rotate_right(parent, root);
+                               tmp = parent;
+                               parent = node;
+                               node = tmp;
+                       }
+
+                       rb_set_black(parent);
+                       rb_set_red(gparent);
+                       __rb_rotate_left(gparent, root);
+               }
+       }
+
+       rb_set_black(root->rb_node);
+}
+
+static void __rb_erase_color(struct rb_node *node, struct rb_node *parent,
+                            struct rb_root *root)
+{
+       struct rb_node *other;
+
+       while ((!node || rb_is_black(node)) && node != root->rb_node)
+       {
+               if (parent->rb_left == node)
+               {
+                       other = parent->rb_right;
+                       if (rb_is_red(other))
+                       {
+                               rb_set_black(other);
+                               rb_set_red(parent);
+                               __rb_rotate_left(parent, root);
+                               other = parent->rb_right;
+                       }
+                       if ((!other->rb_left || rb_is_black(other->rb_left)) &&
+                           (!other->rb_right || rb_is_black(other->rb_right)))
+                       {
+                               rb_set_red(other);
+                               node = parent;
+                               parent = rb_parent(node);
+                       }
+                       else
+                       {
+                               if (!other->rb_right || rb_is_black(other->rb_right))
+                               {
+                                       rb_set_black(other->rb_left);
+                                       rb_set_red(other);
+                                       __rb_rotate_right(other, root);
+                                       other = parent->rb_right;
+                               }
+                               rb_set_color(other, rb_color(parent));
+                               rb_set_black(parent);
+                               rb_set_black(other->rb_right);
+                               __rb_rotate_left(parent, root);
+                               node = root->rb_node;
+                               break;
+                       }
+               }
+               else
+               {
+                       other = parent->rb_left;
+                       if (rb_is_red(other))
+                       {
+                               rb_set_black(other);
+                               rb_set_red(parent);
+                               __rb_rotate_right(parent, root);
+                               other = parent->rb_left;
+                       }
+                       if ((!other->rb_left || rb_is_black(other->rb_left)) &&
+                           (!other->rb_right || rb_is_black(other->rb_right)))
+                       {
+                               rb_set_red(other);
+                               node = parent;
+                               parent = rb_parent(node);
+                       }
+                       else
+                       {
+                               if (!other->rb_left || rb_is_black(other->rb_left))
+                               {
+                                       rb_set_black(other->rb_right);
+                                       rb_set_red(other);
+                                       __rb_rotate_left(other, root);
+                                       other = parent->rb_left;
+                               }
+                               rb_set_color(other, rb_color(parent));
+                               rb_set_black(parent);
+                               rb_set_black(other->rb_left);
+                               __rb_rotate_right(parent, root);
+                               node = root->rb_node;
+                               break;
+                       }
+               }
+       }
+       if (node)
+               rb_set_black(node);
+}
+
+void rb_erase(struct rb_node *node, struct rb_root *root)
+{
+       struct rb_node *child, *parent;
+       int color;
+
+       if (!node->rb_left)
+               child = node->rb_right;
+       else if (!node->rb_right)
+               child = node->rb_left;
+       else
+       {
+               struct rb_node *old = node, *left;
+
+               node = node->rb_right;
+               while ((left = node->rb_left) != NULL)
+                       node = left;
+               child = node->rb_right;
+               parent = rb_parent(node);
+               color = rb_color(node);
+
+               if (child)
+                       rb_set_parent(child, parent);
+               if (parent == old) {
+                       parent->rb_right = child;
+                       parent = node;
+               } else
+                       parent->rb_left = child;
+
+               node->rb_parent_color = old->rb_parent_color;
+               node->rb_right = old->rb_right;
+               node->rb_left = old->rb_left;
+
+               if (rb_parent(old))
+               {
+                       if (rb_parent(old)->rb_left == old)
+                               rb_parent(old)->rb_left = node;
+                       else
+                               rb_parent(old)->rb_right = node;
+               } else
+                       root->rb_node = node;
+
+               rb_set_parent(old->rb_left, node);
+               if (old->rb_right)
+                       rb_set_parent(old->rb_right, node);
+               goto color;
+       }
+
+       parent = rb_parent(node);
+       color = rb_color(node);
+
+       if (child)
+               rb_set_parent(child, parent);
+       if (parent)
+       {
+               if (parent->rb_left == node)
+                       parent->rb_left = child;
+               else
+                       parent->rb_right = child;
+       }
+       else
+               root->rb_node = child;
+
+ color:
+       if (color == RB_BLACK)
+               __rb_erase_color(child, parent, root);
+}
+
+/*
+ * This function returns the first node (in sort order) of the tree.
+ */
+struct rb_node *rb_first(const struct rb_root *root)
+{
+       struct rb_node  *n;
+
+       n = root->rb_node;
+       if (!n)
+               return NULL;
+       while (n->rb_left)
+               n = n->rb_left;
+       return n;
+}
+
+struct rb_node *rb_last(const struct rb_root *root)
+{
+       struct rb_node  *n;
+
+       n = root->rb_node;
+       if (!n)
+               return NULL;
+       while (n->rb_right)
+               n = n->rb_right;
+       return n;
+}
+
+struct rb_node *rb_next(const struct rb_node *node)
+{
+       struct rb_node *parent;
+
+       if (rb_parent(node) == node)
+               return NULL;
+
+       /* If we have a right-hand child, go down and then left as far
+          as we can. */
+       if (node->rb_right) {
+               node = node->rb_right; 
+               while (node->rb_left)
+                       node=node->rb_left;
+               return (struct rb_node *)node;
+       }
+
+       /* No right-hand children.  Everything down and left is
+          smaller than us, so any 'next' node must be in the general
+          direction of our parent. Go up the tree; any time the
+          ancestor is a right-hand child of its parent, keep going
+          up. First time it's a left-hand child of its parent, said
+          parent is our 'next' node. */
+       while ((parent = rb_parent(node)) && node == parent->rb_right)
+               node = parent;
+
+       return parent;
+}
+
+struct rb_node *rb_prev(const struct rb_node *node)
+{
+       struct rb_node *parent;
+
+       if (rb_parent(node) == node)
+               return NULL;
+
+       /* If we have a left-hand child, go down and then right as far
+          as we can. */
+       if (node->rb_left) {
+               node = node->rb_left; 
+               while (node->rb_right)
+                       node=node->rb_right;
+               return (struct rb_node *)node;
+       }
+
+       /* No left-hand children. Go up till we find an ancestor which
+          is a right-hand child of its parent */
+       while ((parent = rb_parent(node)) && node == parent->rb_left)
+               node = parent;
+
+       return parent;
+}
+
+void rb_replace_node(struct rb_node *victim, struct rb_node *new,
+                    struct rb_root *root)
+{
+       struct rb_node *parent = rb_parent(victim);
+
+       /* Set the surrounding nodes to point to the replacement */
+       if (parent) {
+               if (victim == parent->rb_left)
+                       parent->rb_left = new;
+               else
+                       parent->rb_right = new;
+       } else {
+               root->rb_node = new;
+       }
+       if (victim->rb_left)
+               rb_set_parent(victim->rb_left, new);
+       if (victim->rb_right)
+               rb_set_parent(victim->rb_right, new);
+
+       /* Copy the pointers/colour from the victim to the replacement */
+       *new = *victim;
+}
diff --git a/tools/perf/util/rbtree.h b/tools/perf/util/rbtree.h

new file mode 100644 (file)

index 0000000..6bdc488
--- /dev/null
+++ b/tools/perf/util/rbtree.h
@@ -0,0 +1,171 @@
+/*
+  Red Black Trees
+  (C) 1999  Andrea Arcangeli <andrea@suse.de>
+  
+  This program is free software; you can redistribute it and/or modify
+  it under the terms of the GNU General Public License as published by
+  the Free Software Foundation; either version 2 of the License, or
+  (at your option) any later version.
+
+  This program is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+  GNU General Public License for more details.
+
+  You should have received a copy of the GNU General Public License
+  along with this program; if not, write to the Free Software
+  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+
+  linux/include/linux/rbtree.h
+
+  To use rbtrees you'll have to implement your own insert and search cores.
+  This will avoid us to use callbacks and to drop drammatically performances.
+  I know it's not the cleaner way,  but in C (not in C++) to get
+  performances and genericity...
+
+  Some example of insert and search follows here. The search is a plain
+  normal search over an ordered tree. The insert instead must be implemented
+  int two steps: as first thing the code must insert the element in
+  order as a red leaf in the tree, then the support library function
+  rb_insert_color() must be called. Such function will do the
+  not trivial work to rebalance the rbtree if necessary.
+
+-----------------------------------------------------------------------
+static inline struct page * rb_search_page_cache(struct inode * inode,
+                                                unsigned long offset)
+{
+       struct rb_node * n = inode->i_rb_page_cache.rb_node;
+       struct page * page;
+
+       while (n)
+       {
+               page = rb_entry(n, struct page, rb_page_cache);
+
+               if (offset < page->offset)
+                       n = n->rb_left;
+               else if (offset > page->offset)
+                       n = n->rb_right;
+               else
+                       return page;
+       }
+       return NULL;
+}
+
+static inline struct page * __rb_insert_page_cache(struct inode * inode,
+                                                  unsigned long offset,
+                                                  struct rb_node * node)
+{
+       struct rb_node ** p = &inode->i_rb_page_cache.rb_node;
+       struct rb_node * parent = NULL;
+       struct page * page;
+
+       while (*p)
+       {
+               parent = *p;
+               page = rb_entry(parent, struct page, rb_page_cache);
+
+               if (offset < page->offset)
+                       p = &(*p)->rb_left;
+               else if (offset > page->offset)
+                       p = &(*p)->rb_right;
+               else
+                       return page;
+       }
+
+       rb_link_node(node, parent, p);
+
+       return NULL;
+}
+
+static inline struct page * rb_insert_page_cache(struct inode * inode,
+                                                unsigned long offset,
+                                                struct rb_node * node)
+{
+       struct page * ret;
+       if ((ret = __rb_insert_page_cache(inode, offset, node)))
+               goto out;
+       rb_insert_color(node, &inode->i_rb_page_cache);
+ out:
+       return ret;
+}
+-----------------------------------------------------------------------
+*/
+
+#ifndef        _LINUX_RBTREE_H
+#define        _LINUX_RBTREE_H
+
+#include <stddef.h>
+
+/**
+ * container_of - cast a member of a structure out to the containing structure
+ * @ptr:       the pointer to the member.
+ * @type:      the type of the container struct this is embedded in.
+ * @member:    the name of the member within the struct.
+ *
+ */
+#define container_of(ptr, type, member) ({                     \
+       const typeof( ((type *)0)->member ) *__mptr = (ptr);    \
+       (type *)( (char *)__mptr - offsetof(type,member) );})
+
+struct rb_node
+{
+       unsigned long  rb_parent_color;
+#define        RB_RED          0
+#define        RB_BLACK        1
+       struct rb_node *rb_right;
+       struct rb_node *rb_left;
+} __attribute__((aligned(sizeof(long))));
+    /* The alignment might seem pointless, but allegedly CRIS needs it */
+
+struct rb_root
+{
+       struct rb_node *rb_node;
+};
+
+
+#define rb_parent(r)   ((struct rb_node *)((r)->rb_parent_color & ~3))
+#define rb_color(r)   ((r)->rb_parent_color & 1)
+#define rb_is_red(r)   (!rb_color(r))
+#define rb_is_black(r) rb_color(r)
+#define rb_set_red(r)  do { (r)->rb_parent_color &= ~1; } while (0)
+#define rb_set_black(r)  do { (r)->rb_parent_color |= 1; } while (0)
+
+static inline void rb_set_parent(struct rb_node *rb, struct rb_node *p)
+{
+       rb->rb_parent_color = (rb->rb_parent_color & 3) | (unsigned long)p;
+}
+static inline void rb_set_color(struct rb_node *rb, int color)
+{
+       rb->rb_parent_color = (rb->rb_parent_color & ~1) | color;
+}
+
+#define RB_ROOT        (struct rb_root) { NULL, }
+#define        rb_entry(ptr, type, member) container_of(ptr, type, member)
+
+#define RB_EMPTY_ROOT(root)    ((root)->rb_node == NULL)
+#define RB_EMPTY_NODE(node)    (rb_parent(node) == node)
+#define RB_CLEAR_NODE(node)    (rb_set_parent(node, node))
+
+extern void rb_insert_color(struct rb_node *, struct rb_root *);
+extern void rb_erase(struct rb_node *, struct rb_root *);
+
+/* Find logical next and previous nodes in a tree */
+extern struct rb_node *rb_next(const struct rb_node *);
+extern struct rb_node *rb_prev(const struct rb_node *);
+extern struct rb_node *rb_first(const struct rb_root *);
+extern struct rb_node *rb_last(const struct rb_root *);
+
+/* Fast replacement of a single node without remove/rebalance/add/rebalance */
+extern void rb_replace_node(struct rb_node *victim, struct rb_node *new, 
+                           struct rb_root *root);
+
+static inline void rb_link_node(struct rb_node * node, struct rb_node * parent,
+                               struct rb_node ** rb_link)
+{
+       node->rb_parent_color = (unsigned long )parent;
+       node->rb_left = node->rb_right = NULL;
+
+       *rb_link = node;
+}
+
+#endif /* _LINUX_RBTREE_H */
diff --git a/tools/perf/util/run-command.c b/tools/perf/util/run-command.c

new file mode 100644 (file)

index 0000000..b2f5e85
--- /dev/null
+++ b/tools/perf/util/run-command.c
@@ -0,0 +1,395 @@
+#include "cache.h"
+#include "run-command.h"
+#include "exec_cmd.h"
+
+static inline void close_pair(int fd[2])
+{
+       close(fd[0]);
+       close(fd[1]);
+}
+
+static inline void dup_devnull(int to)
+{
+       int fd = open("/dev/null", O_RDWR);
+       dup2(fd, to);
+       close(fd);
+}
+
+int start_command(struct child_process *cmd)
+{
+       int need_in, need_out, need_err;
+       int fdin[2], fdout[2], fderr[2];
+
+       /*
+        * In case of errors we must keep the promise to close FDs
+        * that have been passed in via ->in and ->out.
+        */
+
+       need_in = !cmd->no_stdin && cmd->in < 0;
+       if (need_in) {
+               if (pipe(fdin) < 0) {
+                       if (cmd->out > 0)
+                               close(cmd->out);
+                       return -ERR_RUN_COMMAND_PIPE;
+               }
+               cmd->in = fdin[1];
+       }
+
+       need_out = !cmd->no_stdout
+               && !cmd->stdout_to_stderr
+               && cmd->out < 0;
+       if (need_out) {
+               if (pipe(fdout) < 0) {
+                       if (need_in)
+                               close_pair(fdin);
+                       else if (cmd->in)
+                               close(cmd->in);
+                       return -ERR_RUN_COMMAND_PIPE;
+               }
+               cmd->out = fdout[0];
+       }
+
+       need_err = !cmd->no_stderr && cmd->err < 0;
+       if (need_err) {
+               if (pipe(fderr) < 0) {
+                       if (need_in)
+                               close_pair(fdin);
+                       else if (cmd->in)
+                               close(cmd->in);
+                       if (need_out)
+                               close_pair(fdout);
+                       else if (cmd->out)
+                               close(cmd->out);
+                       return -ERR_RUN_COMMAND_PIPE;
+               }
+               cmd->err = fderr[0];
+       }
+
+#ifndef __MINGW32__
+       fflush(NULL);
+       cmd->pid = fork();
+       if (!cmd->pid) {
+               if (cmd->no_stdin)
+                       dup_devnull(0);
+               else if (need_in) {
+                       dup2(fdin[0], 0);
+                       close_pair(fdin);
+               } else if (cmd->in) {
+                       dup2(cmd->in, 0);
+                       close(cmd->in);
+               }
+
+               if (cmd->no_stderr)
+                       dup_devnull(2);
+               else if (need_err) {
+                       dup2(fderr[1], 2);
+                       close_pair(fderr);
+               }
+
+               if (cmd->no_stdout)
+                       dup_devnull(1);
+               else if (cmd->stdout_to_stderr)
+                       dup2(2, 1);
+               else if (need_out) {
+                       dup2(fdout[1], 1);
+                       close_pair(fdout);
+               } else if (cmd->out > 1) {
+                       dup2(cmd->out, 1);
+                       close(cmd->out);
+               }
+
+               if (cmd->dir && chdir(cmd->dir))
+                       die("exec %s: cd to %s failed (%s)", cmd->argv[0],
+                           cmd->dir, strerror(errno));
+               if (cmd->env) {
+                       for (; *cmd->env; cmd->env++) {
+                               if (strchr(*cmd->env, '='))
+                                       putenv((char*)*cmd->env);
+                               else
+                                       unsetenv(*cmd->env);
+                       }
+               }
+               if (cmd->preexec_cb)
+                       cmd->preexec_cb();
+               if (cmd->perf_cmd) {
+                       execv_perf_cmd(cmd->argv);
+               } else {
+                       execvp(cmd->argv[0], (char *const*) cmd->argv);
+               }
+               exit(127);
+       }
+#else
+       int s0 = -1, s1 = -1, s2 = -1;  /* backups of stdin, stdout, stderr */
+       const char **sargv = cmd->argv;
+       char **env = environ;
+
+       if (cmd->no_stdin) {
+               s0 = dup(0);
+               dup_devnull(0);
+       } else if (need_in) {
+               s0 = dup(0);
+               dup2(fdin[0], 0);
+       } else if (cmd->in) {
+               s0 = dup(0);
+               dup2(cmd->in, 0);
+       }
+
+       if (cmd->no_stderr) {
+               s2 = dup(2);
+               dup_devnull(2);
+       } else if (need_err) {
+               s2 = dup(2);
+               dup2(fderr[1], 2);
+       }
+
+       if (cmd->no_stdout) {
+               s1 = dup(1);
+               dup_devnull(1);
+       } else if (cmd->stdout_to_stderr) {
+               s1 = dup(1);
+               dup2(2, 1);
+       } else if (need_out) {
+               s1 = dup(1);
+               dup2(fdout[1], 1);
+       } else if (cmd->out > 1) {
+               s1 = dup(1);
+               dup2(cmd->out, 1);
+       }
+
+       if (cmd->dir)
+               die("chdir in start_command() not implemented");
+       if (cmd->env) {
+               env = copy_environ();
+               for (; *cmd->env; cmd->env++)
+                       env = env_setenv(env, *cmd->env);
+       }
+
+       if (cmd->perf_cmd) {
+               cmd->argv = prepare_perf_cmd(cmd->argv);
+       }
+
+       cmd->pid = mingw_spawnvpe(cmd->argv[0], cmd->argv, env);
+
+       if (cmd->env)
+               free_environ(env);
+       if (cmd->perf_cmd)
+               free(cmd->argv);
+
+       cmd->argv = sargv;
+       if (s0 >= 0)
+               dup2(s0, 0), close(s0);
+       if (s1 >= 0)
+               dup2(s1, 1), close(s1);
+       if (s2 >= 0)
+               dup2(s2, 2), close(s2);
+#endif
+
+       if (cmd->pid < 0) {
+               int err = errno;
+               if (need_in)
+                       close_pair(fdin);
+               else if (cmd->in)
+                       close(cmd->in);
+               if (need_out)
+                       close_pair(fdout);
+               else if (cmd->out)
+                       close(cmd->out);
+               if (need_err)
+                       close_pair(fderr);
+               return err == ENOENT ?
+                       -ERR_RUN_COMMAND_EXEC :
+                       -ERR_RUN_COMMAND_FORK;
+       }
+
+       if (need_in)
+               close(fdin[0]);
+       else if (cmd->in)
+               close(cmd->in);
+
+       if (need_out)
+               close(fdout[1]);
+       else if (cmd->out)
+               close(cmd->out);
+
+       if (need_err)
+               close(fderr[1]);
+
+       return 0;
+}
+
+static int wait_or_whine(pid_t pid)
+{
+       for (;;) {
+               int status, code;
+               pid_t waiting = waitpid(pid, &status, 0);
+
+               if (waiting < 0) {
+                       if (errno == EINTR)
+                               continue;
+                       error("waitpid failed (%s)", strerror(errno));
+                       return -ERR_RUN_COMMAND_WAITPID;
+               }
+               if (waiting != pid)
+                       return -ERR_RUN_COMMAND_WAITPID_WRONG_PID;
+               if (WIFSIGNALED(status))
+                       return -ERR_RUN_COMMAND_WAITPID_SIGNAL;
+
+               if (!WIFEXITED(status))
+                       return -ERR_RUN_COMMAND_WAITPID_NOEXIT;
+               code = WEXITSTATUS(status);
+               switch (code) {
+               case 127:
+                       return -ERR_RUN_COMMAND_EXEC;
+               case 0:
+                       return 0;
+               default:
+                       return -code;
+               }
+       }
+}
+
+int finish_command(struct child_process *cmd)
+{
+       return wait_or_whine(cmd->pid);
+}
+
+int run_command(struct child_process *cmd)
+{
+       int code = start_command(cmd);
+       if (code)
+               return code;
+       return finish_command(cmd);
+}
+
+static void prepare_run_command_v_opt(struct child_process *cmd,
+                                     const char **argv,
+                                     int opt)
+{
+       memset(cmd, 0, sizeof(*cmd));
+       cmd->argv = argv;
+       cmd->no_stdin = opt & RUN_COMMAND_NO_STDIN ? 1 : 0;
+       cmd->perf_cmd = opt & RUN_PERF_CMD ? 1 : 0;
+       cmd->stdout_to_stderr = opt & RUN_COMMAND_STDOUT_TO_STDERR ? 1 : 0;
+}
+
+int run_command_v_opt(const char **argv, int opt)
+{
+       struct child_process cmd;
+       prepare_run_command_v_opt(&cmd, argv, opt);
+       return run_command(&cmd);
+}
+
+int run_command_v_opt_cd_env(const char **argv, int opt, const char *dir, const char *const *env)
+{
+       struct child_process cmd;
+       prepare_run_command_v_opt(&cmd, argv, opt);
+       cmd.dir = dir;
+       cmd.env = env;
+       return run_command(&cmd);
+}
+
+#ifdef __MINGW32__
+static __stdcall unsigned run_thread(void *data)
+{
+       struct async *async = data;
+       return async->proc(async->fd_for_proc, async->data);
+}
+#endif
+
+int start_async(struct async *async)
+{
+       int pipe_out[2];
+
+       if (pipe(pipe_out) < 0)
+               return error("cannot create pipe: %s", strerror(errno));
+       async->out = pipe_out[0];
+
+#ifndef __MINGW32__
+       /* Flush stdio before fork() to avoid cloning buffers */
+       fflush(NULL);
+
+       async->pid = fork();
+       if (async->pid < 0) {
+               error("fork (async) failed: %s", strerror(errno));
+               close_pair(pipe_out);
+               return -1;
+       }
+       if (!async->pid) {
+               close(pipe_out[0]);
+               exit(!!async->proc(pipe_out[1], async->data));
+       }
+       close(pipe_out[1]);
+#else
+       async->fd_for_proc = pipe_out[1];
+       async->tid = (HANDLE) _beginthreadex(NULL, 0, run_thread, async, 0, NULL);
+       if (!async->tid) {
+               error("cannot create thread: %s", strerror(errno));
+               close_pair(pipe_out);
+               return -1;
+       }
+#endif
+       return 0;
+}
+
+int finish_async(struct async *async)
+{
+#ifndef __MINGW32__
+       int ret = 0;
+
+       if (wait_or_whine(async->pid))
+               ret = error("waitpid (async) failed");
+#else
+       DWORD ret = 0;
+       if (WaitForSingleObject(async->tid, INFINITE) != WAIT_OBJECT_0)
+               ret = error("waiting for thread failed: %lu", GetLastError());
+       else if (!GetExitCodeThread(async->tid, &ret))
+               ret = error("cannot get thread exit code: %lu", GetLastError());
+       CloseHandle(async->tid);
+#endif
+       return ret;
+}
+
+int run_hook(const char *index_file, const char *name, ...)
+{
+       struct child_process hook;
+       const char **argv = NULL, *env[2];
+       char index[PATH_MAX];
+       va_list args;
+       int ret;
+       size_t i = 0, alloc = 0;
+
+       if (access(perf_path("hooks/%s", name), X_OK) < 0)
+               return 0;
+
+       va_start(args, name);
+       ALLOC_GROW(argv, i + 1, alloc);
+       argv[i++] = perf_path("hooks/%s", name);
+       while (argv[i-1]) {
+               ALLOC_GROW(argv, i + 1, alloc);
+               argv[i++] = va_arg(args, const char *);
+       }
+       va_end(args);
+
+       memset(&hook, 0, sizeof(hook));
+       hook.argv = argv;
+       hook.no_stdin = 1;
+       hook.stdout_to_stderr = 1;
+       if (index_file) {
+               snprintf(index, sizeof(index), "PERF_INDEX_FILE=%s", index_file);
+               env[0] = index;
+               env[1] = NULL;
+               hook.env = env;
+       }
+
+       ret = start_command(&hook);
+       free(argv);
+       if (ret) {
+               warning("Could not spawn %s", argv[0]);
+               return ret;
+       }
+       ret = finish_command(&hook);
+       if (ret == -ERR_RUN_COMMAND_WAITPID_SIGNAL)
+               warning("%s exited due to uncaught signal", argv[0]);
+
+       return ret;
+}
diff --git a/tools/perf/util/run-command.h b/tools/perf/util/run-command.h

new file mode 100644 (file)

index 0000000..328289f
--- /dev/null
+++ b/tools/perf/util/run-command.h
@@ -0,0 +1,93 @@
+#ifndef RUN_COMMAND_H
+#define RUN_COMMAND_H
+
+enum {
+       ERR_RUN_COMMAND_FORK = 10000,
+       ERR_RUN_COMMAND_EXEC,
+       ERR_RUN_COMMAND_PIPE,
+       ERR_RUN_COMMAND_WAITPID,
+       ERR_RUN_COMMAND_WAITPID_WRONG_PID,
+       ERR_RUN_COMMAND_WAITPID_SIGNAL,
+       ERR_RUN_COMMAND_WAITPID_NOEXIT,
+};
+#define IS_RUN_COMMAND_ERR(x) (-(x) >= ERR_RUN_COMMAND_FORK)
+
+struct child_process {
+       const char **argv;
+       pid_t pid;
+       /*
+        * Using .in, .out, .err:
+        * - Specify 0 for no redirections (child inherits stdin, stdout,
+        *   stderr from parent).
+        * - Specify -1 to have a pipe allocated as follows:
+        *     .in: returns the writable pipe end; parent writes to it,
+        *          the readable pipe end becomes child's stdin
+        *     .out, .err: returns the readable pipe end; parent reads from
+        *          it, the writable pipe end becomes child's stdout/stderr
+        *   The caller of start_command() must close the returned FDs
+        *   after it has completed reading from/writing to it!
+        * - Specify > 0 to set a channel to a particular FD as follows:
+        *     .in: a readable FD, becomes child's stdin
+        *     .out: a writable FD, becomes child's stdout/stderr
+        *     .err > 0 not supported
+        *   The specified FD is closed by start_command(), even in case
+        *   of errors!
+        */
+       int in;
+       int out;
+       int err;
+       const char *dir;
+       const char *const *env;
+       unsigned no_stdin:1;
+       unsigned no_stdout:1;
+       unsigned no_stderr:1;
+       unsigned perf_cmd:1; /* if this is to be perf sub-command */
+       unsigned stdout_to_stderr:1;
+       void (*preexec_cb)(void);
+};
+
+int start_command(struct child_process *);
+int finish_command(struct child_process *);
+int run_command(struct child_process *);
+
+extern int run_hook(const char *index_file, const char *name, ...);
+
+#define RUN_COMMAND_NO_STDIN 1
+#define RUN_PERF_CMD        2  /*If this is to be perf sub-command */
+#define RUN_COMMAND_STDOUT_TO_STDERR 4
+int run_command_v_opt(const char **argv, int opt);
+
+/*
+ * env (the environment) is to be formatted like environ: "VAR=VALUE".
+ * To unset an environment variable use just "VAR".
+ */
+int run_command_v_opt_cd_env(const char **argv, int opt, const char *dir, const char *const *env);
+
+/*
+ * The purpose of the following functions is to feed a pipe by running
+ * a function asynchronously and providing output that the caller reads.
+ *
+ * It is expected that no synchronization and mutual exclusion between
+ * the caller and the feed function is necessary so that the function
+ * can run in a thread without interfering with the caller.
+ */
+struct async {
+       /*
+        * proc writes to fd and closes it;
+        * returns 0 on success, non-zero on failure
+        */
+       int (*proc)(int fd, void *data);
+       void *data;
+       int out;        /* caller reads from here and closes it */
+#ifndef __MINGW32__
+       pid_t pid;
+#else
+       HANDLE tid;
+       int fd_for_proc;
+#endif
+};
+
+int start_async(struct async *async);
+int finish_async(struct async *async);
+
+#endif
diff --git a/tools/perf/util/sigchain.c b/tools/perf/util/sigchain.c

new file mode 100644 (file)

index 0000000..1118b99
--- /dev/null
+++ b/tools/perf/util/sigchain.c
@@ -0,0 +1,52 @@
+#include "sigchain.h"
+#include "cache.h"
+
+#define SIGCHAIN_MAX_SIGNALS 32
+
+struct sigchain_signal {
+       sigchain_fun *old;
+       int n;
+       int alloc;
+};
+static struct sigchain_signal signals[SIGCHAIN_MAX_SIGNALS];
+
+static void check_signum(int sig)
+{
+       if (sig < 1 || sig >= SIGCHAIN_MAX_SIGNALS)
+               die("BUG: signal out of range: %d", sig);
+}
+
+int sigchain_push(int sig, sigchain_fun f)
+{
+       struct sigchain_signal *s = signals + sig;
+       check_signum(sig);
+
+       ALLOC_GROW(s->old, s->n + 1, s->alloc);
+       s->old[s->n] = signal(sig, f);
+       if (s->old[s->n] == SIG_ERR)
+               return -1;
+       s->n++;
+       return 0;
+}
+
+int sigchain_pop(int sig)
+{
+       struct sigchain_signal *s = signals + sig;
+       check_signum(sig);
+       if (s->n < 1)
+               return 0;
+
+       if (signal(sig, s->old[s->n - 1]) == SIG_ERR)
+               return -1;
+       s->n--;
+       return 0;
+}
+
+void sigchain_push_common(sigchain_fun f)
+{
+       sigchain_push(SIGINT, f);
+       sigchain_push(SIGHUP, f);
+       sigchain_push(SIGTERM, f);
+       sigchain_push(SIGQUIT, f);
+       sigchain_push(SIGPIPE, f);
+}
diff --git a/tools/perf/util/sigchain.h b/tools/perf/util/sigchain.h

new file mode 100644 (file)

index 0000000..618083b
--- /dev/null
+++ b/tools/perf/util/sigchain.h
@@ -0,0 +1,11 @@
+#ifndef SIGCHAIN_H
+#define SIGCHAIN_H
+
+typedef void (*sigchain_fun)(int);
+
+int sigchain_push(int sig, sigchain_fun f);
+int sigchain_pop(int sig);
+
+void sigchain_push_common(sigchain_fun f);
+
+#endif /* SIGCHAIN_H */
diff --git a/tools/perf/util/strbuf.c b/tools/perf/util/strbuf.c

new file mode 100644 (file)

index 0000000..eaba093
--- /dev/null
+++ b/tools/perf/util/strbuf.c
@@ -0,0 +1,359 @@
+#include "cache.h"
+
+int prefixcmp(const char *str, const char *prefix)
+{
+       for (; ; str++, prefix++)
+               if (!*prefix)
+                       return 0;
+               else if (*str != *prefix)
+                       return (unsigned char)*prefix - (unsigned char)*str;
+}
+
+/*
+ * Used as the default ->buf value, so that people can always assume
+ * buf is non NULL and ->buf is NUL terminated even for a freshly
+ * initialized strbuf.
+ */
+char strbuf_slopbuf[1];
+
+void strbuf_init(struct strbuf *sb, size_t hint)
+{
+       sb->alloc = sb->len = 0;
+       sb->buf = strbuf_slopbuf;
+       if (hint)
+               strbuf_grow(sb, hint);
+}
+
+void strbuf_release(struct strbuf *sb)
+{
+       if (sb->alloc) {
+               free(sb->buf);
+               strbuf_init(sb, 0);
+       }
+}
+
+char *strbuf_detach(struct strbuf *sb, size_t *sz)
+{
+       char *res = sb->alloc ? sb->buf : NULL;
+       if (sz)
+               *sz = sb->len;
+       strbuf_init(sb, 0);
+       return res;
+}
+
+void strbuf_attach(struct strbuf *sb, void *buf, size_t len, size_t alloc)
+{
+       strbuf_release(sb);
+       sb->buf   = buf;
+       sb->len   = len;
+       sb->alloc = alloc;
+       strbuf_grow(sb, 0);
+       sb->buf[sb->len] = '\0';
+}
+
+void strbuf_grow(struct strbuf *sb, size_t extra)
+{
+       if (sb->len + extra + 1 <= sb->len)
+               die("you want to use way too much memory");
+       if (!sb->alloc)
+               sb->buf = NULL;
+       ALLOC_GROW(sb->buf, sb->len + extra + 1, sb->alloc);
+}
+
+void strbuf_trim(struct strbuf *sb)
+{
+       char *b = sb->buf;
+       while (sb->len > 0 && isspace((unsigned char)sb->buf[sb->len - 1]))
+               sb->len--;
+       while (sb->len > 0 && isspace(*b)) {
+               b++;
+               sb->len--;
+       }
+       memmove(sb->buf, b, sb->len);
+       sb->buf[sb->len] = '\0';
+}
+void strbuf_rtrim(struct strbuf *sb)
+{
+       while (sb->len > 0 && isspace((unsigned char)sb->buf[sb->len - 1]))
+               sb->len--;
+       sb->buf[sb->len] = '\0';
+}
+
+void strbuf_ltrim(struct strbuf *sb)
+{
+       char *b = sb->buf;
+       while (sb->len > 0 && isspace(*b)) {
+               b++;
+               sb->len--;
+       }
+       memmove(sb->buf, b, sb->len);
+       sb->buf[sb->len] = '\0';
+}
+
+void strbuf_tolower(struct strbuf *sb)
+{
+       int i;
+       for (i = 0; i < sb->len; i++)
+               sb->buf[i] = tolower(sb->buf[i]);
+}
+
+struct strbuf **strbuf_split(const struct strbuf *sb, int delim)
+{
+       int alloc = 2, pos = 0;
+       char *n, *p;
+       struct strbuf **ret;
+       struct strbuf *t;
+
+       ret = calloc(alloc, sizeof(struct strbuf *));
+       p = n = sb->buf;
+       while (n < sb->buf + sb->len) {
+               int len;
+               n = memchr(n, delim, sb->len - (n - sb->buf));
+               if (pos + 1 >= alloc) {
+                       alloc = alloc * 2;
+                       ret = realloc(ret, sizeof(struct strbuf *) * alloc);
+               }
+               if (!n)
+                       n = sb->buf + sb->len - 1;
+               len = n - p + 1;
+               t = malloc(sizeof(struct strbuf));
+               strbuf_init(t, len);
+               strbuf_add(t, p, len);
+               ret[pos] = t;
+               ret[++pos] = NULL;
+               p = ++n;
+       }
+       return ret;
+}
+
+void strbuf_list_free(struct strbuf **sbs)
+{
+       struct strbuf **s = sbs;
+
+       while (*s) {
+               strbuf_release(*s);
+               free(*s++);
+       }
+       free(sbs);
+}
+
+int strbuf_cmp(const struct strbuf *a, const struct strbuf *b)
+{
+       int len = a->len < b->len ? a->len: b->len;
+       int cmp = memcmp(a->buf, b->buf, len);
+       if (cmp)
+               return cmp;
+       return a->len < b->len ? -1: a->len != b->len;
+}
+
+void strbuf_splice(struct strbuf *sb, size_t pos, size_t len,
+                                  const void *data, size_t dlen)
+{
+       if (pos + len < pos)
+               die("you want to use way too much memory");
+       if (pos > sb->len)
+               die("`pos' is too far after the end of the buffer");
+       if (pos + len > sb->len)
+               die("`pos + len' is too far after the end of the buffer");
+
+       if (dlen >= len)
+               strbuf_grow(sb, dlen - len);
+       memmove(sb->buf + pos + dlen,
+                       sb->buf + pos + len,
+                       sb->len - pos - len);
+       memcpy(sb->buf + pos, data, dlen);
+       strbuf_setlen(sb, sb->len + dlen - len);
+}
+
+void strbuf_insert(struct strbuf *sb, size_t pos, const void *data, size_t len)
+{
+       strbuf_splice(sb, pos, 0, data, len);
+}
+
+void strbuf_remove(struct strbuf *sb, size_t pos, size_t len)
+{
+       strbuf_splice(sb, pos, len, NULL, 0);
+}
+
+void strbuf_add(struct strbuf *sb, const void *data, size_t len)
+{
+       strbuf_grow(sb, len);
+       memcpy(sb->buf + sb->len, data, len);
+       strbuf_setlen(sb, sb->len + len);
+}
+
+void strbuf_adddup(struct strbuf *sb, size_t pos, size_t len)
+{
+       strbuf_grow(sb, len);
+       memcpy(sb->buf + sb->len, sb->buf + pos, len);
+       strbuf_setlen(sb, sb->len + len);
+}
+
+void strbuf_addf(struct strbuf *sb, const char *fmt, ...)
+{
+       int len;
+       va_list ap;
+
+       if (!strbuf_avail(sb))
+               strbuf_grow(sb, 64);
+       va_start(ap, fmt);
+       len = vsnprintf(sb->buf + sb->len, sb->alloc - sb->len, fmt, ap);
+       va_end(ap);
+       if (len < 0)
+               die("your vsnprintf is broken");
+       if (len > strbuf_avail(sb)) {
+               strbuf_grow(sb, len);
+               va_start(ap, fmt);
+               len = vsnprintf(sb->buf + sb->len, sb->alloc - sb->len, fmt, ap);
+               va_end(ap);
+               if (len > strbuf_avail(sb)) {
+                       die("this should not happen, your snprintf is broken");
+               }
+       }
+       strbuf_setlen(sb, sb->len + len);
+}
+
+void strbuf_expand(struct strbuf *sb, const char *format, expand_fn_t fn,
+                  void *context)
+{
+       for (;;) {
+               const char *percent;
+               size_t consumed;
+
+               percent = strchrnul(format, '%');
+               strbuf_add(sb, format, percent - format);
+               if (!*percent)
+                       break;
+               format = percent + 1;
+
+               consumed = fn(sb, format, context);
+               if (consumed)
+                       format += consumed;
+               else
+                       strbuf_addch(sb, '%');
+       }
+}
+
+size_t strbuf_expand_dict_cb(struct strbuf *sb, const char *placeholder,
+               void *context)
+{
+       struct strbuf_expand_dict_entry *e = context;
+       size_t len;
+
+       for (; e->placeholder && (len = strlen(e->placeholder)); e++) {
+               if (!strncmp(placeholder, e->placeholder, len)) {
+                       if (e->value)
+                               strbuf_addstr(sb, e->value);
+                       return len;
+               }
+       }
+       return 0;
+}
+
+size_t strbuf_fread(struct strbuf *sb, size_t size, FILE *f)
+{
+       size_t res;
+       size_t oldalloc = sb->alloc;
+
+       strbuf_grow(sb, size);
+       res = fread(sb->buf + sb->len, 1, size, f);
+       if (res > 0)
+               strbuf_setlen(sb, sb->len + res);
+       else if (res < 0 && oldalloc == 0)
+               strbuf_release(sb);
+       return res;
+}
+
+ssize_t strbuf_read(struct strbuf *sb, int fd, size_t hint)
+{
+       size_t oldlen = sb->len;
+       size_t oldalloc = sb->alloc;
+
+       strbuf_grow(sb, hint ? hint : 8192);
+       for (;;) {
+               ssize_t cnt;
+
+               cnt = read(fd, sb->buf + sb->len, sb->alloc - sb->len - 1);
+               if (cnt < 0) {
+                       if (oldalloc == 0)
+                               strbuf_release(sb);
+                       else
+                               strbuf_setlen(sb, oldlen);
+                       return -1;
+               }
+               if (!cnt)
+                       break;
+               sb->len += cnt;
+               strbuf_grow(sb, 8192);
+       }
+
+       sb->buf[sb->len] = '\0';
+       return sb->len - oldlen;
+}
+
+#define STRBUF_MAXLINK (2*PATH_MAX)
+
+int strbuf_readlink(struct strbuf *sb, const char *path, size_t hint)
+{
+       size_t oldalloc = sb->alloc;
+
+       if (hint < 32)
+               hint = 32;
+
+       while (hint < STRBUF_MAXLINK) {
+               int len;
+
+               strbuf_grow(sb, hint);
+               len = readlink(path, sb->buf, hint);
+               if (len < 0) {
+                       if (errno != ERANGE)
+                               break;
+               } else if (len < hint) {
+                       strbuf_setlen(sb, len);
+                       return 0;
+               }
+
+               /* .. the buffer was too small - try again */
+               hint *= 2;
+       }
+       if (oldalloc == 0)
+               strbuf_release(sb);
+       return -1;
+}
+
+int strbuf_getline(struct strbuf *sb, FILE *fp, int term)
+{
+       int ch;
+
+       strbuf_grow(sb, 0);
+       if (feof(fp))
+               return EOF;
+
+       strbuf_reset(sb);
+       while ((ch = fgetc(fp)) != EOF) {
+               if (ch == term)
+                       break;
+               strbuf_grow(sb, 1);
+               sb->buf[sb->len++] = ch;
+       }
+       if (ch == EOF && sb->len == 0)
+               return EOF;
+
+       sb->buf[sb->len] = '\0';
+       return 0;
+}
+
+int strbuf_read_file(struct strbuf *sb, const char *path, size_t hint)
+{
+       int fd, len;
+
+       fd = open(path, O_RDONLY);
+       if (fd < 0)
+               return -1;
+       len = strbuf_read(sb, fd, hint);
+       close(fd);
+       if (len < 0)
+               return -1;
+
+       return len;
+}
diff --git a/tools/perf/util/strbuf.h b/tools/perf/util/strbuf.h

new file mode 100644 (file)

index 0000000..9ee908a
--- /dev/null
+++ b/tools/perf/util/strbuf.h
@@ -0,0 +1,137 @@
+#ifndef STRBUF_H
+#define STRBUF_H
+
+/*
+ * Strbuf's can be use in many ways: as a byte array, or to store arbitrary
+ * long, overflow safe strings.
+ *
+ * Strbufs has some invariants that are very important to keep in mind:
+ *
+ * 1. the ->buf member is always malloc-ed, hence strbuf's can be used to
+ *    build complex strings/buffers whose final size isn't easily known.
+ *
+ *    It is NOT legal to copy the ->buf pointer away.
+ *    `strbuf_detach' is the operation that detachs a buffer from its shell
+ *    while keeping the shell valid wrt its invariants.
+ *
+ * 2. the ->buf member is a byte array that has at least ->len + 1 bytes
+ *    allocated. The extra byte is used to store a '\0', allowing the ->buf
+ *    member to be a valid C-string. Every strbuf function ensure this
+ *    invariant is preserved.
+ *
+ *    Note that it is OK to "play" with the buffer directly if you work it
+ *    that way:
+ *
+ *    strbuf_grow(sb, SOME_SIZE);
+ *       ... Here, the memory array starting at sb->buf, and of length
+ *       ... strbuf_avail(sb) is all yours, and you are sure that
+ *       ... strbuf_avail(sb) is at least SOME_SIZE.
+ *    strbuf_setlen(sb, sb->len + SOME_OTHER_SIZE);
+ *
+ *    Of course, SOME_OTHER_SIZE must be smaller or equal to strbuf_avail(sb).
+ *
+ *    Doing so is safe, though if it has to be done in many places, adding the
+ *    missing API to the strbuf module is the way to go.
+ *
+ *    XXX: do _not_ assume that the area that is yours is of size ->alloc - 1
+ *         even if it's true in the current implementation. Alloc is somehow a
+ *         "private" member that should not be messed with.
+ */
+
+#include <assert.h>
+
+extern char strbuf_slopbuf[];
+struct strbuf {
+       size_t alloc;
+       size_t len;
+       char *buf;
+};
+
+#define STRBUF_INIT  { 0, 0, strbuf_slopbuf }
+
+/*----- strbuf life cycle -----*/
+extern void strbuf_init(struct strbuf *, size_t);
+extern void strbuf_release(struct strbuf *);
+extern char *strbuf_detach(struct strbuf *, size_t *);
+extern void strbuf_attach(struct strbuf *, void *, size_t, size_t);
+static inline void strbuf_swap(struct strbuf *a, struct strbuf *b) {
+       struct strbuf tmp = *a;
+       *a = *b;
+       *b = tmp;
+}
+
+/*----- strbuf size related -----*/
+static inline size_t strbuf_avail(const struct strbuf *sb) {
+       return sb->alloc ? sb->alloc - sb->len - 1 : 0;
+}
+
+extern void strbuf_grow(struct strbuf *, size_t);
+
+static inline void strbuf_setlen(struct strbuf *sb, size_t len) {
+       if (!sb->alloc)
+               strbuf_grow(sb, 0);
+       assert(len < sb->alloc);
+       sb->len = len;
+       sb->buf[len] = '\0';
+}
+#define strbuf_reset(sb)  strbuf_setlen(sb, 0)
+
+/*----- content related -----*/
+extern void strbuf_trim(struct strbuf *);
+extern void strbuf_rtrim(struct strbuf *);
+extern void strbuf_ltrim(struct strbuf *);
+extern int strbuf_cmp(const struct strbuf *, const struct strbuf *);
+extern void strbuf_tolower(struct strbuf *);
+
+extern struct strbuf **strbuf_split(const struct strbuf *, int delim);
+extern void strbuf_list_free(struct strbuf **);
+
+/*----- add data in your buffer -----*/
+static inline void strbuf_addch(struct strbuf *sb, int c) {
+       strbuf_grow(sb, 1);
+       sb->buf[sb->len++] = c;
+       sb->buf[sb->len] = '\0';
+}
+
+extern void strbuf_insert(struct strbuf *, size_t pos, const void *, size_t);
+extern void strbuf_remove(struct strbuf *, size_t pos, size_t len);
+
+/* splice pos..pos+len with given data */
+extern void strbuf_splice(struct strbuf *, size_t pos, size_t len,
+                          const void *, size_t);
+
+extern void strbuf_add(struct strbuf *, const void *, size_t);
+static inline void strbuf_addstr(struct strbuf *sb, const char *s) {
+       strbuf_add(sb, s, strlen(s));
+}
+static inline void strbuf_addbuf(struct strbuf *sb, const struct strbuf *sb2) {
+       strbuf_add(sb, sb2->buf, sb2->len);
+}
+extern void strbuf_adddup(struct strbuf *sb, size_t pos, size_t len);
+
+typedef size_t (*expand_fn_t) (struct strbuf *sb, const char *placeholder, void *context);
+extern void strbuf_expand(struct strbuf *sb, const char *format, expand_fn_t fn, void *context);
+struct strbuf_expand_dict_entry {
+       const char *placeholder;
+       const char *value;
+};
+extern size_t strbuf_expand_dict_cb(struct strbuf *sb, const char *placeholder, void *context);
+
+__attribute__((format(printf,2,3)))
+extern void strbuf_addf(struct strbuf *sb, const char *fmt, ...);
+
+extern size_t strbuf_fread(struct strbuf *, size_t, FILE *);
+/* XXX: if read fails, any partial read is undone */
+extern ssize_t strbuf_read(struct strbuf *, int fd, size_t hint);
+extern int strbuf_read_file(struct strbuf *sb, const char *path, size_t hint);
+extern int strbuf_readlink(struct strbuf *sb, const char *path, size_t hint);
+
+extern int strbuf_getline(struct strbuf *, FILE *, int);
+
+extern void stripspace(struct strbuf *buf, int skip_comments);
+extern int launch_editor(const char *path, struct strbuf *buffer, const char *const *env);
+
+extern int strbuf_branchname(struct strbuf *sb, const char *name);
+extern int strbuf_check_branch_ref(struct strbuf *sb, const char *name);
+
+#endif /* STRBUF_H */
diff --git a/tools/perf/util/string.c b/tools/perf/util/string.c

new file mode 100644 (file)

index 0000000..ec33c0c
--- /dev/null
+++ b/tools/perf/util/string.c
@@ -0,0 +1,34 @@
+#include "string.h"
+
+static int hex(char ch)
+{
+       if ((ch >= '0') && (ch <= '9'))
+               return ch - '0';
+       if ((ch >= 'a') && (ch <= 'f'))
+               return ch - 'a' + 10;
+       if ((ch >= 'A') && (ch <= 'F'))
+               return ch - 'A' + 10;
+       return -1;
+}
+
+/*
+ * While we find nice hex chars, build a long_val.
+ * Return number of chars processed.
+ */
+int hex2u64(const char *ptr, __u64 *long_val)
+{
+       const char *p = ptr;
+       *long_val = 0;
+
+       while (*p) {
+               const int hex_val = hex(*p);
+
+               if (hex_val < 0)
+                       break;
+
+               *long_val = (*long_val << 4) | hex_val;
+               p++;
+       }
+
+       return p - ptr;
+}
diff --git a/tools/perf/util/string.h b/tools/perf/util/string.h

new file mode 100644 (file)

index 0000000..72812c1
--- /dev/null
+++ b/tools/perf/util/string.h
@@ -0,0 +1,8 @@
+#ifndef _PERF_STRING_H_
+#define _PERF_STRING_H_
+
+#include <linux/types.h>
+
+int hex2u64(const char *ptr, __u64 *val);
+
+#endif
diff --git a/tools/perf/util/symbol.c b/tools/perf/util/symbol.c

new file mode 100644 (file)

index 0000000..49a55f8
--- /dev/null
+++ b/tools/perf/util/symbol.c
@@ -0,0 +1,641 @@
+#include "util.h"
+#include "../perf.h"
+#include "string.h"
+#include "symbol.h"
+
+#include <libelf.h>
+#include <gelf.h>
+#include <elf.h>
+
+const char *sym_hist_filter;
+
+static struct symbol *symbol__new(__u64 start, __u64 len,
+                                 const char *name, unsigned int priv_size,
+                                 __u64 obj_start, int verbose)
+{
+       size_t namelen = strlen(name) + 1;
+       struct symbol *self = calloc(1, priv_size + sizeof(*self) + namelen);
+
+       if (!self)
+               return NULL;
+
+       if (verbose >= 2)
+               printf("new symbol: %016Lx [%08lx]: %s, hist: %p, obj_start: %p\n",
+                       (__u64)start, (unsigned long)len, name, self->hist, (void *)(unsigned long)obj_start);
+
+       self->obj_start= obj_start;
+       self->hist = NULL;
+       self->hist_sum = 0;
+
+       if (sym_hist_filter && !strcmp(name, sym_hist_filter))
+               self->hist = calloc(sizeof(__u64), len);
+
+       if (priv_size) {
+               memset(self, 0, priv_size);
+               self = ((void *)self) + priv_size;
+       }
+       self->start = start;
+       self->end   = start + len - 1;
+       memcpy(self->name, name, namelen);
+
+       return self;
+}
+
+static void symbol__delete(struct symbol *self, unsigned int priv_size)
+{
+       free(((void *)self) - priv_size);
+}
+
+static size_t symbol__fprintf(struct symbol *self, FILE *fp)
+{
+       return fprintf(fp, " %llx-%llx %s\n",
+                      self->start, self->end, self->name);
+}
+
+struct dso *dso__new(const char *name, unsigned int sym_priv_size)
+{
+       struct dso *self = malloc(sizeof(*self) + strlen(name) + 1);
+
+       if (self != NULL) {
+               strcpy(self->name, name);
+               self->syms = RB_ROOT;
+               self->sym_priv_size = sym_priv_size;
+               self->find_symbol = dso__find_symbol;
+       }
+
+       return self;
+}
+
+static void dso__delete_symbols(struct dso *self)
+{
+       struct symbol *pos;
+       struct rb_node *next = rb_first(&self->syms);
+
+       while (next) {
+               pos = rb_entry(next, struct symbol, rb_node);
+               next = rb_next(&pos->rb_node);
+               rb_erase(&pos->rb_node, &self->syms);
+               symbol__delete(pos, self->sym_priv_size);
+       }
+}
+
+void dso__delete(struct dso *self)
+{
+       dso__delete_symbols(self);
+       free(self);
+}
+
+static void dso__insert_symbol(struct dso *self, struct symbol *sym)
+{
+       struct rb_node **p = &self->syms.rb_node;
+       struct rb_node *parent = NULL;
+       const __u64 ip = sym->start;
+       struct symbol *s;
+
+       while (*p != NULL) {
+               parent = *p;
+               s = rb_entry(parent, struct symbol, rb_node);
+               if (ip < s->start)
+                       p = &(*p)->rb_left;
+               else
+                       p = &(*p)->rb_right;
+       }
+       rb_link_node(&sym->rb_node, parent, p);
+       rb_insert_color(&sym->rb_node, &self->syms);
+}
+
+struct symbol *dso__find_symbol(struct dso *self, __u64 ip)
+{
+       struct rb_node *n;
+
+       if (self == NULL)
+               return NULL;
+
+       n = self->syms.rb_node;
+
+       while (n) {
+               struct symbol *s = rb_entry(n, struct symbol, rb_node);
+
+               if (ip < s->start)
+                       n = n->rb_left;
+               else if (ip > s->end)
+                       n = n->rb_right;
+               else
+                       return s;
+       }
+
+       return NULL;
+}
+
+size_t dso__fprintf(struct dso *self, FILE *fp)
+{
+       size_t ret = fprintf(fp, "dso: %s\n", self->name);
+
+       struct rb_node *nd;
+       for (nd = rb_first(&self->syms); nd; nd = rb_next(nd)) {
+               struct symbol *pos = rb_entry(nd, struct symbol, rb_node);
+               ret += symbol__fprintf(pos, fp);
+       }
+
+       return ret;
+}
+
+static int dso__load_kallsyms(struct dso *self, symbol_filter_t filter, int verbose)
+{
+       struct rb_node *nd, *prevnd;
+       char *line = NULL;
+       size_t n;
+       FILE *file = fopen("/proc/kallsyms", "r");
+
+       if (file == NULL)
+               goto out_failure;
+
+       while (!feof(file)) {
+               __u64 start;
+               struct symbol *sym;
+               int line_len, len;
+               char symbol_type;
+
+               line_len = getline(&line, &n, file);
+               if (line_len < 0)
+                       break;
+
+               if (!line)
+                       goto out_failure;
+
+               line[--line_len] = '\0'; /* \n */
+
+               len = hex2u64(line, &start);
+
+               len++;
+               if (len + 2 >= line_len)
+                       continue;
+
+               symbol_type = toupper(line[len]);
+               /*
+                * We're interested only in code ('T'ext)
+                */
+               if (symbol_type != 'T' && symbol_type != 'W')
+                       continue;
+               /*
+                * Well fix up the end later, when we have all sorted.
+                */
+               sym = symbol__new(start, 0xdead, line + len + 2,
+                                 self->sym_priv_size, 0, verbose);
+
+               if (sym == NULL)
+                       goto out_delete_line;
+
+               if (filter && filter(self, sym))
+                       symbol__delete(sym, self->sym_priv_size);
+               else
+                       dso__insert_symbol(self, sym);
+       }
+
+       /*
+        * Now that we have all sorted out, just set the ->end of all
+        * symbols
+        */
+       prevnd = rb_first(&self->syms);
+
+       if (prevnd == NULL)
+               goto out_delete_line;
+
+       for (nd = rb_next(prevnd); nd; nd = rb_next(nd)) {
+               struct symbol *prev = rb_entry(prevnd, struct symbol, rb_node),
+                             *curr = rb_entry(nd, struct symbol, rb_node);
+
+               prev->end = curr->start - 1;
+               prevnd = nd;
+       }
+
+       free(line);
+       fclose(file);
+
+       return 0;
+
+out_delete_line:
+       free(line);
+out_failure:
+       return -1;
+}
+
+static int dso__load_perf_map(struct dso *self, symbol_filter_t filter, int verbose)
+{
+       char *line = NULL;
+       size_t n;
+       FILE *file;
+       int nr_syms = 0;
+
+       file = fopen(self->name, "r");
+       if (file == NULL)
+               goto out_failure;
+
+       while (!feof(file)) {
+               __u64 start, size;
+               struct symbol *sym;
+               int line_len, len;
+
+               line_len = getline(&line, &n, file);
+               if (line_len < 0)
+                       break;
+
+               if (!line)
+                       goto out_failure;
+
+               line[--line_len] = '\0'; /* \n */
+
+               len = hex2u64(line, &start);
+
+               len++;
+               if (len + 2 >= line_len)
+                       continue;
+
+               len += hex2u64(line + len, &size);
+
+               len++;
+               if (len + 2 >= line_len)
+                       continue;
+
+               sym = symbol__new(start, size, line + len,
+                                 self->sym_priv_size, start, verbose);
+
+               if (sym == NULL)
+                       goto out_delete_line;
+
+               if (filter && filter(self, sym))
+                       symbol__delete(sym, self->sym_priv_size);
+               else {
+                       dso__insert_symbol(self, sym);
+                       nr_syms++;
+               }
+       }
+
+       free(line);
+       fclose(file);
+
+       return nr_syms;
+
+out_delete_line:
+       free(line);
+out_failure:
+       return -1;
+}
+
+/**
+ * elf_symtab__for_each_symbol - iterate thru all the symbols
+ *
+ * @self: struct elf_symtab instance to iterate
+ * @index: uint32_t index
+ * @sym: GElf_Sym iterator
+ */
+#define elf_symtab__for_each_symbol(syms, nr_syms, index, sym) \
+       for (index = 0, gelf_getsym(syms, index, &sym);\
+            index < nr_syms; \
+            index++, gelf_getsym(syms, index, &sym))
+
+static inline uint8_t elf_sym__type(const GElf_Sym *sym)
+{
+       return GELF_ST_TYPE(sym->st_info);
+}
+
+static inline int elf_sym__is_function(const GElf_Sym *sym)
+{
+       return elf_sym__type(sym) == STT_FUNC &&
+              sym->st_name != 0 &&
+              sym->st_shndx != SHN_UNDEF &&
+              sym->st_size != 0;
+}
+
+static inline const char *elf_sym__name(const GElf_Sym *sym,
+                                       const Elf_Data *symstrs)
+{
+       return symstrs->d_buf + sym->st_name;
+}
+
+static Elf_Scn *elf_section_by_name(Elf *elf, GElf_Ehdr *ep,
+                                   GElf_Shdr *shp, const char *name,
+                                   size_t *index)
+{
+       Elf_Scn *sec = NULL;
+       size_t cnt = 1;
+
+       while ((sec = elf_nextscn(elf, sec)) != NULL) {
+               char *str;
+
+               gelf_getshdr(sec, shp);
+               str = elf_strptr(elf, ep->e_shstrndx, shp->sh_name);
+               if (!strcmp(name, str)) {
+                       if (index)
+                               *index = cnt;
+                       break;
+               }
+               ++cnt;
+       }
+
+       return sec;
+}
+
+#define elf_section__for_each_rel(reldata, pos, pos_mem, idx, nr_entries) \
+       for (idx = 0, pos = gelf_getrel(reldata, 0, &pos_mem); \
+            idx < nr_entries; \
+            ++idx, pos = gelf_getrel(reldata, idx, &pos_mem))
+
+#define elf_section__for_each_rela(reldata, pos, pos_mem, idx, nr_entries) \
+       for (idx = 0, pos = gelf_getrela(reldata, 0, &pos_mem); \
+            idx < nr_entries; \
+            ++idx, pos = gelf_getrela(reldata, idx, &pos_mem))
+
+static int dso__synthesize_plt_symbols(struct  dso *self, Elf *elf,
+                                      GElf_Ehdr *ehdr, Elf_Scn *scn_dynsym,
+                                      GElf_Shdr *shdr_dynsym,
+                                      size_t dynsym_idx, int verbose)
+{
+       uint32_t nr_rel_entries, idx;
+       GElf_Sym sym;
+       __u64 plt_offset;
+       GElf_Shdr shdr_plt;
+       struct symbol *f;
+       GElf_Shdr shdr_rel_plt;
+       Elf_Data *reldata, *syms, *symstrs;
+       Elf_Scn *scn_plt_rel, *scn_symstrs;
+       char sympltname[1024];
+       int nr = 0, symidx;
+
+       scn_plt_rel = elf_section_by_name(elf, ehdr, &shdr_rel_plt,
+                                         ".rela.plt", NULL);
+       if (scn_plt_rel == NULL) {
+               scn_plt_rel = elf_section_by_name(elf, ehdr, &shdr_rel_plt,
+                                                 ".rel.plt", NULL);
+               if (scn_plt_rel == NULL)
+                       return 0;
+       }
+
+       if (shdr_rel_plt.sh_link != dynsym_idx)
+               return 0;
+
+       if (elf_section_by_name(elf, ehdr, &shdr_plt, ".plt", NULL) == NULL)
+               return 0;
+
+       /*
+        * Fetch the relocation section to find the indexes to the GOT
+        * and the symbols in the .dynsym they refer to.
+        */
+       reldata = elf_getdata(scn_plt_rel, NULL);
+       if (reldata == NULL)
+               return -1;
+
+       syms = elf_getdata(scn_dynsym, NULL);
+       if (syms == NULL)
+               return -1;
+
+       scn_symstrs = elf_getscn(elf, shdr_dynsym->sh_link);
+       if (scn_symstrs == NULL)
+               return -1;
+
+       symstrs = elf_getdata(scn_symstrs, NULL);
+       if (symstrs == NULL)
+               return -1;
+
+       nr_rel_entries = shdr_rel_plt.sh_size / shdr_rel_plt.sh_entsize;
+       plt_offset = shdr_plt.sh_offset;
+
+       if (shdr_rel_plt.sh_type == SHT_RELA) {
+               GElf_Rela pos_mem, *pos;
+
+               elf_section__for_each_rela(reldata, pos, pos_mem, idx,
+                                          nr_rel_entries) {
+                       symidx = GELF_R_SYM(pos->r_info);
+                       plt_offset += shdr_plt.sh_entsize;
+                       gelf_getsym(syms, symidx, &sym);
+                       snprintf(sympltname, sizeof(sympltname),
+                                "%s@plt", elf_sym__name(&sym, symstrs));
+
+                       f = symbol__new(plt_offset, shdr_plt.sh_entsize,
+                                       sympltname, self->sym_priv_size, 0, verbose);
+                       if (!f)
+                               return -1;
+
+                       dso__insert_symbol(self, f);
+                       ++nr;
+               }
+       } else if (shdr_rel_plt.sh_type == SHT_REL) {
+               GElf_Rel pos_mem, *pos;
+               elf_section__for_each_rel(reldata, pos, pos_mem, idx,
+                                         nr_rel_entries) {
+                       symidx = GELF_R_SYM(pos->r_info);
+                       plt_offset += shdr_plt.sh_entsize;
+                       gelf_getsym(syms, symidx, &sym);
+                       snprintf(sympltname, sizeof(sympltname),
+                                "%s@plt", elf_sym__name(&sym, symstrs));
+
+                       f = symbol__new(plt_offset, shdr_plt.sh_entsize,
+                                       sympltname, self->sym_priv_size, 0, verbose);
+                       if (!f)
+                               return -1;
+
+                       dso__insert_symbol(self, f);
+                       ++nr;
+               }
+       } else {
+               /*
+                * TODO: There are still one more shdr_rel_plt.sh_type
+                * I have to investigate, but probably should be ignored.
+                */
+       }
+
+       return nr;
+}
+
+static int dso__load_sym(struct dso *self, int fd, const char *name,
+                        symbol_filter_t filter, int verbose)
+{
+       Elf_Data *symstrs;
+       uint32_t nr_syms;
+       int err = -1;
+       uint32_t index;
+       GElf_Ehdr ehdr;
+       GElf_Shdr shdr;
+       Elf_Data *syms;
+       GElf_Sym sym;
+       Elf_Scn *sec, *sec_dynsym;
+       Elf *elf;
+       size_t dynsym_idx;
+       int nr = 0;
+
+       elf = elf_begin(fd, ELF_C_READ_MMAP, NULL);
+       if (elf == NULL) {
+               if (verbose)
+                       fprintf(stderr, "%s: cannot read %s ELF file.\n",
+                               __func__, name);
+               goto out_close;
+       }
+
+       if (gelf_getehdr(elf, &ehdr) == NULL) {
+               if (verbose)
+                       fprintf(stderr, "%s: cannot get elf header.\n", __func__);
+               goto out_elf_end;
+       }
+
+       /*
+        * We need to check if we have a .dynsym, so that we can handle the
+        * .plt, synthesizing its symbols, that aren't on the symtabs (be it
+        * .dynsym or .symtab)
+        */
+       sec_dynsym = elf_section_by_name(elf, &ehdr, &shdr,
+                                        ".dynsym", &dynsym_idx);
+       if (sec_dynsym != NULL) {
+               nr = dso__synthesize_plt_symbols(self, elf, &ehdr,
+                                                sec_dynsym, &shdr,
+                                                dynsym_idx, verbose);
+               if (nr < 0)
+                       goto out_elf_end;
+       }
+
+       /*
+        * But if we have a full .symtab (that is a superset of .dynsym) we
+        * should add the symbols not in the .dynsyn
+        */
+       sec = elf_section_by_name(elf, &ehdr, &shdr, ".symtab", NULL);
+       if (sec == NULL) {
+               if (sec_dynsym == NULL)
+                       goto out_elf_end;
+
+               sec = sec_dynsym;
+               gelf_getshdr(sec, &shdr);
+       }
+
+       syms = elf_getdata(sec, NULL);
+       if (syms == NULL)
+               goto out_elf_end;
+
+       sec = elf_getscn(elf, shdr.sh_link);
+       if (sec == NULL)
+               goto out_elf_end;
+
+       symstrs = elf_getdata(sec, NULL);
+       if (symstrs == NULL)
+               goto out_elf_end;
+
+       nr_syms = shdr.sh_size / shdr.sh_entsize;
+
+       memset(&sym, 0, sizeof(sym));
+
+       elf_symtab__for_each_symbol(syms, nr_syms, index, sym) {
+               struct symbol *f;
+               __u64 obj_start;
+
+               if (!elf_sym__is_function(&sym))
+                       continue;
+
+               sec = elf_getscn(elf, sym.st_shndx);
+               if (!sec)
+                       goto out_elf_end;
+
+               gelf_getshdr(sec, &shdr);
+               obj_start = sym.st_value;
+
+               sym.st_value -= shdr.sh_addr - shdr.sh_offset;
+
+               f = symbol__new(sym.st_value, sym.st_size,
+                               elf_sym__name(&sym, symstrs),
+                               self->sym_priv_size, obj_start, verbose);
+               if (!f)
+                       goto out_elf_end;
+
+               if (filter && filter(self, f))
+                       symbol__delete(f, self->sym_priv_size);
+               else {
+                       dso__insert_symbol(self, f);
+                       nr++;
+               }
+       }
+
+       err = nr;
+out_elf_end:
+       elf_end(elf);
+out_close:
+       return err;
+}
+
+int dso__load(struct dso *self, symbol_filter_t filter, int verbose)
+{
+       int size = strlen(self->name) + sizeof("/usr/lib/debug%s.debug");
+       char *name = malloc(size);
+       int variant = 0;
+       int ret = -1;
+       int fd;
+
+       if (!name)
+               return -1;
+
+       if (strncmp(self->name, "/tmp/perf-", 10) == 0)
+               return dso__load_perf_map(self, filter, verbose);
+
+more:
+       do {
+               switch (variant) {
+               case 0: /* Fedora */
+                       snprintf(name, size, "/usr/lib/debug%s.debug", self->name);
+                       break;
+               case 1: /* Ubuntu */
+                       snprintf(name, size, "/usr/lib/debug%s", self->name);
+                       break;
+               case 2: /* Sane people */
+                       snprintf(name, size, "%s", self->name);
+                       break;
+
+               default:
+                       goto out;
+               }
+               variant++;
+
+               fd = open(name, O_RDONLY);
+       } while (fd < 0);
+
+       ret = dso__load_sym(self, fd, name, filter, verbose);
+       close(fd);
+
+       /*
+        * Some people seem to have debuginfo files _WITHOUT_ debug info!?!?
+        */
+       if (!ret)
+               goto more;
+
+out:
+       free(name);
+       return ret;
+}
+
+static int dso__load_vmlinux(struct dso *self, const char *vmlinux,
+                            symbol_filter_t filter, int verbose)
+{
+       int err, fd = open(vmlinux, O_RDONLY);
+
+       if (fd < 0)
+               return -1;
+
+       err = dso__load_sym(self, fd, vmlinux, filter, verbose);
+       close(fd);
+
+       return err;
+}
+
+int dso__load_kernel(struct dso *self, const char *vmlinux,
+                    symbol_filter_t filter, int verbose)
+{
+       int err = -1;
+
+       if (vmlinux)
+               err = dso__load_vmlinux(self, vmlinux, filter, verbose);
+
+       if (err)
+               err = dso__load_kallsyms(self, filter, verbose);
+
+       return err;
+}
+
+void symbol__init(void)
+{
+       elf_version(EV_CURRENT);
+}
diff --git a/tools/perf/util/symbol.h b/tools/perf/util/symbol.h

new file mode 100644 (file)

index 0000000..0d1292b
--- /dev/null
+++ b/tools/perf/util/symbol.h
@@ -0,0 +1,47 @@
+#ifndef _PERF_SYMBOL_
+#define _PERF_SYMBOL_ 1
+
+#include <linux/types.h>
+#include "list.h"
+#include "rbtree.h"
+
+struct symbol {
+       struct rb_node  rb_node;
+       __u64           start;
+       __u64           end;
+       __u64           obj_start;
+       __u64           hist_sum;
+       __u64           *hist;
+       char            name[0];
+};
+
+struct dso {
+       struct list_head node;
+       struct rb_root   syms;
+       unsigned int     sym_priv_size;
+       struct symbol    *(*find_symbol)(struct dso *, __u64 ip);
+       char             name[0];
+};
+
+const char *sym_hist_filter;
+
+typedef int (*symbol_filter_t)(struct dso *self, struct symbol *sym);
+
+struct dso *dso__new(const char *name, unsigned int sym_priv_size);
+void dso__delete(struct dso *self);
+
+static inline void *dso__sym_priv(struct dso *self, struct symbol *sym)
+{
+       return ((void *)sym) - self->sym_priv_size;
+}
+
+struct symbol *dso__find_symbol(struct dso *self, __u64 ip);
+
+int dso__load_kernel(struct dso *self, const char *vmlinux,
+                    symbol_filter_t filter, int verbose);
+int dso__load(struct dso *self, symbol_filter_t filter, int verbose);
+
+size_t dso__fprintf(struct dso *self, FILE *fp);
+
+void symbol__init(void);
+#endif /* _PERF_SYMBOL_ */
diff --git a/tools/perf/util/usage.c b/tools/perf/util/usage.c

new file mode 100644 (file)

index 0000000..e16bf9a
--- /dev/null
+++ b/tools/perf/util/usage.c
@@ -0,0 +1,80 @@
+/*
+ * GIT - The information manager from hell
+ *
+ * Copyright (C) Linus Torvalds, 2005
+ */
+#include "util.h"
+
+static void report(const char *prefix, const char *err, va_list params)
+{
+       char msg[1024];
+       vsnprintf(msg, sizeof(msg), err, params);
+       fprintf(stderr, " %s%s\n", prefix, msg);
+}
+
+static NORETURN void usage_builtin(const char *err)
+{
+       fprintf(stderr, "\n Usage: %s\n", err);
+       exit(129);
+}
+
+static NORETURN void die_builtin(const char *err, va_list params)
+{
+       report(" Fatal: ", err, params);
+       exit(128);
+}
+
+static void error_builtin(const char *err, va_list params)
+{
+       report(" Error: ", err, params);
+}
+
+static void warn_builtin(const char *warn, va_list params)
+{
+       report(" Warning: ", warn, params);
+}
+
+/* If we are in a dlopen()ed .so write to a global variable would segfault
+ * (ugh), so keep things static. */
+static void (*usage_routine)(const char *err) NORETURN = usage_builtin;
+static void (*die_routine)(const char *err, va_list params) NORETURN = die_builtin;
+static void (*error_routine)(const char *err, va_list params) = error_builtin;
+static void (*warn_routine)(const char *err, va_list params) = warn_builtin;
+
+void set_die_routine(void (*routine)(const char *err, va_list params) NORETURN)
+{
+       die_routine = routine;
+}
+
+void usage(const char *err)
+{
+       usage_routine(err);
+}
+
+void die(const char *err, ...)
+{
+       va_list params;
+
+       va_start(params, err);
+       die_routine(err, params);
+       va_end(params);
+}
+
+int error(const char *err, ...)
+{
+       va_list params;
+
+       va_start(params, err);
+       error_routine(err, params);
+       va_end(params);
+       return -1;
+}
+
+void warning(const char *warn, ...)
+{
+       va_list params;
+
+       va_start(params, warn);
+       warn_routine(warn, params);
+       va_end(params);
+}
diff --git a/tools/perf/util/util.h b/tools/perf/util/util.h

new file mode 100644 (file)

index 0000000..76590a1
--- /dev/null
+++ b/tools/perf/util/util.h
@@ -0,0 +1,410 @@
+#ifndef GIT_COMPAT_UTIL_H
+#define GIT_COMPAT_UTIL_H
+
+#define _FILE_OFFSET_BITS 64
+
+#ifndef FLEX_ARRAY
+/*
+ * See if our compiler is known to support flexible array members.
+ */
+#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L)
+# define FLEX_ARRAY /* empty */
+#elif defined(__GNUC__)
+# if (__GNUC__ >= 3)
+#  define FLEX_ARRAY /* empty */
+# else
+#  define FLEX_ARRAY 0 /* older GNU extension */
+# endif
+#endif
+
+/*
+ * Otherwise, default to safer but a bit wasteful traditional style
+ */
+#ifndef FLEX_ARRAY
+# define FLEX_ARRAY 1
+#endif
+#endif
+
+#define ARRAY_SIZE(x) (sizeof(x)/sizeof(x[0]))
+
+#ifdef __GNUC__
+#define TYPEOF(x) (__typeof__(x))
+#else
+#define TYPEOF(x)
+#endif
+
+#define MSB(x, bits) ((x) & TYPEOF(x)(~0ULL << (sizeof(x) * 8 - (bits))))
+#define HAS_MULTI_BITS(i)  ((i) & ((i) - 1))  /* checks if an integer has more than 1 bit set */
+
+/* Approximation of the length of the decimal representation of this type. */
+#define decimal_length(x)      ((int)(sizeof(x) * 2.56 + 0.5) + 1)
+
+#if !defined(__APPLE__) && !defined(__FreeBSD__)  && !defined(__USLC__) && !defined(_M_UNIX)
+#define _XOPEN_SOURCE 600 /* glibc2 and AIX 5.3L need 500, OpenBSD needs 600 for S_ISLNK() */
+#define _XOPEN_SOURCE_EXTENDED 1 /* AIX 5.3L needs this */
+#endif
+#define _ALL_SOURCE 1
+#define _GNU_SOURCE 1
+#define _BSD_SOURCE 1
+
+#include <unistd.h>
+#include <stdio.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+#include <stddef.h>
+#include <stdlib.h>
+#include <stdarg.h>
+#include <string.h>
+#include <errno.h>
+#include <limits.h>
+#include <sys/param.h>
+#include <sys/types.h>
+#include <dirent.h>
+#include <sys/time.h>
+#include <time.h>
+#include <signal.h>
+#include <fnmatch.h>
+#include <assert.h>
+#include <regex.h>
+#include <utime.h>
+#ifndef __MINGW32__
+#include <sys/wait.h>
+#include <sys/poll.h>
+#include <sys/socket.h>
+#include <sys/ioctl.h>
+#ifndef NO_SYS_SELECT_H
+#include <sys/select.h>
+#endif
+#include <netinet/in.h>
+#include <netinet/tcp.h>
+#include <arpa/inet.h>
+#include <netdb.h>
+#include <pwd.h>
+#include <inttypes.h>
+#if defined(__CYGWIN__)
+#undef _XOPEN_SOURCE
+#include <grp.h>
+#define _XOPEN_SOURCE 600
+#include "compat/cygwin.h"
+#else
+#undef _ALL_SOURCE /* AIX 5.3L defines a struct list with _ALL_SOURCE. */
+#include <grp.h>
+#define _ALL_SOURCE 1
+#endif
+#else  /* __MINGW32__ */
+/* pull in Windows compatibility stuff */
+#include "compat/mingw.h"
+#endif /* __MINGW32__ */
+
+#ifndef NO_ICONV
+#include <iconv.h>
+#endif
+
+#ifndef NO_OPENSSL
+#include <openssl/ssl.h>
+#include <openssl/err.h>
+#endif
+
+/* On most systems <limits.h> would have given us this, but
+ * not on some systems (e.g. GNU/Hurd).
+ */
+#ifndef PATH_MAX
+#define PATH_MAX 4096
+#endif
+
+#ifndef PRIuMAX
+#define PRIuMAX "llu"
+#endif
+
+#ifndef PRIu32
+#define PRIu32 "u"
+#endif
+
+#ifndef PRIx32
+#define PRIx32 "x"
+#endif
+
+#ifndef PATH_SEP
+#define PATH_SEP ':'
+#endif
+
+#ifndef STRIP_EXTENSION
+#define STRIP_EXTENSION ""
+#endif
+
+#ifndef has_dos_drive_prefix
+#define has_dos_drive_prefix(path) 0
+#endif
+
+#ifndef is_dir_sep
+#define is_dir_sep(c) ((c) == '/')
+#endif
+
+#ifdef __GNUC__
+#define NORETURN __attribute__((__noreturn__))
+#else
+#define NORETURN
+#ifndef __attribute__
+#define __attribute__(x)
+#endif
+#endif
+
+/* General helper functions */
+extern void usage(const char *err) NORETURN;
+extern void die(const char *err, ...) NORETURN __attribute__((format (printf, 1, 2)));
+extern int error(const char *err, ...) __attribute__((format (printf, 1, 2)));
+extern void warning(const char *err, ...) __attribute__((format (printf, 1, 2)));
+
+extern void set_die_routine(void (*routine)(const char *err, va_list params) NORETURN);
+
+extern int prefixcmp(const char *str, const char *prefix);
+extern time_t tm_to_time_t(const struct tm *tm);
+
+static inline const char *skip_prefix(const char *str, const char *prefix)
+{
+       size_t len = strlen(prefix);
+       return strncmp(str, prefix, len) ? NULL : str + len;
+}
+
+#if defined(NO_MMAP) || defined(USE_WIN32_MMAP)
+
+#ifndef PROT_READ
+#define PROT_READ 1
+#define PROT_WRITE 2
+#define MAP_PRIVATE 1
+#define MAP_FAILED ((void*)-1)
+#endif
+
+#define mmap git_mmap
+#define munmap git_munmap
+extern void *git_mmap(void *start, size_t length, int prot, int flags, int fd, off_t offset);
+extern int git_munmap(void *start, size_t length);
+
+#else /* NO_MMAP || USE_WIN32_MMAP */
+
+#include <sys/mman.h>
+
+#endif /* NO_MMAP || USE_WIN32_MMAP */
+
+#ifdef NO_MMAP
+
+/* This value must be multiple of (pagesize * 2) */
+#define DEFAULT_PACKED_GIT_WINDOW_SIZE (1 * 1024 * 1024)
+
+#else /* NO_MMAP */
+
+/* This value must be multiple of (pagesize * 2) */
+#define DEFAULT_PACKED_GIT_WINDOW_SIZE \
+       (sizeof(void*) >= 8 \
+               ?  1 * 1024 * 1024 * 1024 \
+               : 32 * 1024 * 1024)
+
+#endif /* NO_MMAP */
+
+#ifdef NO_ST_BLOCKS_IN_STRUCT_STAT
+#define on_disk_bytes(st) ((st).st_size)
+#else
+#define on_disk_bytes(st) ((st).st_blocks * 512)
+#endif
+
+#define DEFAULT_PACKED_GIT_LIMIT \
+       ((1024L * 1024L) * (sizeof(void*) >= 8 ? 8192 : 256))
+
+#ifdef NO_PREAD
+#define pread git_pread
+extern ssize_t git_pread(int fd, void *buf, size_t count, off_t offset);
+#endif
+/*
+ * Forward decl that will remind us if its twin in cache.h changes.
+ * This function is used in compat/pread.c.  But we can't include
+ * cache.h there.
+ */
+extern ssize_t read_in_full(int fd, void *buf, size_t count);
+
+#ifdef NO_SETENV
+#define setenv gitsetenv
+extern int gitsetenv(const char *, const char *, int);
+#endif
+
+#ifdef NO_MKDTEMP
+#define mkdtemp gitmkdtemp
+extern char *gitmkdtemp(char *);
+#endif
+
+#ifdef NO_UNSETENV
+#define unsetenv gitunsetenv
+extern void gitunsetenv(const char *);
+#endif
+
+#ifdef NO_STRCASESTR
+#define strcasestr gitstrcasestr
+extern char *gitstrcasestr(const char *haystack, const char *needle);
+#endif
+
+#ifdef NO_STRLCPY
+#define strlcpy gitstrlcpy
+extern size_t gitstrlcpy(char *, const char *, size_t);
+#endif
+
+#ifdef NO_STRTOUMAX
+#define strtoumax gitstrtoumax
+extern uintmax_t gitstrtoumax(const char *, char **, int);
+#endif
+
+#ifdef NO_HSTRERROR
+#define hstrerror githstrerror
+extern const char *githstrerror(int herror);
+#endif
+
+#ifdef NO_MEMMEM
+#define memmem gitmemmem
+void *gitmemmem(const void *haystack, size_t haystacklen,
+                const void *needle, size_t needlelen);
+#endif
+
+#ifdef FREAD_READS_DIRECTORIES
+#ifdef fopen
+#undef fopen
+#endif
+#define fopen(a,b) git_fopen(a,b)
+extern FILE *git_fopen(const char*, const char*);
+#endif
+
+#ifdef SNPRINTF_RETURNS_BOGUS
+#define snprintf git_snprintf
+extern int git_snprintf(char *str, size_t maxsize,
+                       const char *format, ...);
+#define vsnprintf git_vsnprintf
+extern int git_vsnprintf(char *str, size_t maxsize,
+                        const char *format, va_list ap);
+#endif
+
+#ifdef __GLIBC_PREREQ
+#if __GLIBC_PREREQ(2, 1)
+#define HAVE_STRCHRNUL
+#endif
+#endif
+
+#ifndef HAVE_STRCHRNUL
+#define strchrnul gitstrchrnul
+static inline char *gitstrchrnul(const char *s, int c)
+{
+       while (*s && *s != c)
+               s++;
+       return (char *)s;
+}
+#endif
+
+/*
+ * Wrappers:
+ */
+extern char *xstrdup(const char *str);
+extern void *xmalloc(size_t size);
+extern void *xmemdupz(const void *data, size_t len);
+extern char *xstrndup(const char *str, size_t len);
+extern void *xrealloc(void *ptr, size_t size);
+extern void *xcalloc(size_t nmemb, size_t size);
+extern void *xmmap(void *start, size_t length, int prot, int flags, int fd, off_t offset);
+extern ssize_t xread(int fd, void *buf, size_t len);
+extern ssize_t xwrite(int fd, const void *buf, size_t len);
+extern int xdup(int fd);
+extern FILE *xfdopen(int fd, const char *mode);
+extern int xmkstemp(char *template);
+
+static inline size_t xsize_t(off_t len)
+{
+       return (size_t)len;
+}
+
+static inline int has_extension(const char *filename, const char *ext)
+{
+       size_t len = strlen(filename);
+       size_t extlen = strlen(ext);
+       return len > extlen && !memcmp(filename + len - extlen, ext, extlen);
+}
+
+/* Sane ctype - no locale, and works with signed chars */
+#undef isascii
+#undef isspace
+#undef isdigit
+#undef isalpha
+#undef isalnum
+#undef tolower
+#undef toupper
+extern unsigned char sane_ctype[256];
+#define GIT_SPACE 0x01
+#define GIT_DIGIT 0x02
+#define GIT_ALPHA 0x04
+#define GIT_GLOB_SPECIAL 0x08
+#define GIT_REGEX_SPECIAL 0x10
+#define sane_istest(x,mask) ((sane_ctype[(unsigned char)(x)] & (mask)) != 0)
+#define isascii(x) (((x) & ~0x7f) == 0)
+#define isspace(x) sane_istest(x,GIT_SPACE)
+#define isdigit(x) sane_istest(x,GIT_DIGIT)
+#define isalpha(x) sane_istest(x,GIT_ALPHA)
+#define isalnum(x) sane_istest(x,GIT_ALPHA | GIT_DIGIT)
+#define is_glob_special(x) sane_istest(x,GIT_GLOB_SPECIAL)
+#define is_regex_special(x) sane_istest(x,GIT_GLOB_SPECIAL | GIT_REGEX_SPECIAL)
+#define tolower(x) sane_case((unsigned char)(x), 0x20)
+#define toupper(x) sane_case((unsigned char)(x), 0)
+
+static inline int sane_case(int x, int high)
+{
+       if (sane_istest(x, GIT_ALPHA))
+               x = (x & ~0x20) | high;
+       return x;
+}
+
+static inline int strtoul_ui(char const *s, int base, unsigned int *result)
+{
+       unsigned long ul;
+       char *p;
+
+       errno = 0;
+       ul = strtoul(s, &p, base);
+       if (errno || *p || p == s || (unsigned int) ul != ul)
+               return -1;
+       *result = ul;
+       return 0;
+}
+
+static inline int strtol_i(char const *s, int base, int *result)
+{
+       long ul;
+       char *p;
+
+       errno = 0;
+       ul = strtol(s, &p, base);
+       if (errno || *p || p == s || (int) ul != ul)
+               return -1;
+       *result = ul;
+       return 0;
+}
+
+#ifdef INTERNAL_QSORT
+void git_qsort(void *base, size_t nmemb, size_t size,
+              int(*compar)(const void *, const void *));
+#define qsort git_qsort
+#endif
+
+#ifndef DIR_HAS_BSD_GROUP_SEMANTICS
+# define FORCE_DIR_SET_GID S_ISGID
+#else
+# define FORCE_DIR_SET_GID 0
+#endif
+
+#ifdef NO_NSEC
+#undef USE_NSEC
+#define ST_CTIME_NSEC(st) 0
+#define ST_MTIME_NSEC(st) 0
+#else
+#ifdef USE_ST_TIMESPEC
+#define ST_CTIME_NSEC(st) ((unsigned int)((st).st_ctimespec.tv_nsec))
+#define ST_MTIME_NSEC(st) ((unsigned int)((st).st_mtimespec.tv_nsec))
+#else
+#define ST_CTIME_NSEC(st) ((unsigned int)((st).st_ctim.tv_nsec))
+#define ST_MTIME_NSEC(st) ((unsigned int)((st).st_mtim.tv_nsec))
+#endif
+#endif
+
+#endif
diff --git a/tools/perf/util/wrapper.c b/tools/perf/util/wrapper.c

new file mode 100644 (file)

index 0000000..6350d65
--- /dev/null
+++ b/tools/perf/util/wrapper.c
@@ -0,0 +1,206 @@
+/*
+ * Various trivial helper wrappers around standard functions
+ */
+#include "cache.h"
+
+/*
+ * There's no pack memory to release - but stay close to the Git
+ * version so wrap this away:
+ */
+static inline void release_pack_memory(size_t size, int flag)
+{
+}
+
+char *xstrdup(const char *str)
+{
+       char *ret = strdup(str);
+       if (!ret) {
+               release_pack_memory(strlen(str) + 1, -1);
+               ret = strdup(str);
+               if (!ret)
+                       die("Out of memory, strdup failed");
+       }
+       return ret;
+}
+
+void *xmalloc(size_t size)
+{
+       void *ret = malloc(size);
+       if (!ret && !size)
+               ret = malloc(1);
+       if (!ret) {
+               release_pack_memory(size, -1);
+               ret = malloc(size);
+               if (!ret && !size)
+                       ret = malloc(1);
+               if (!ret)
+                       die("Out of memory, malloc failed");
+       }
+#ifdef XMALLOC_POISON
+       memset(ret, 0xA5, size);
+#endif
+       return ret;
+}
+
+/*
+ * xmemdupz() allocates (len + 1) bytes of memory, duplicates "len" bytes of
+ * "data" to the allocated memory, zero terminates the allocated memory,
+ * and returns a pointer to the allocated memory. If the allocation fails,
+ * the program dies.
+ */
+void *xmemdupz(const void *data, size_t len)
+{
+       char *p = xmalloc(len + 1);
+       memcpy(p, data, len);
+       p[len] = '\0';
+       return p;
+}
+
+char *xstrndup(const char *str, size_t len)
+{
+       char *p = memchr(str, '\0', len);
+       return xmemdupz(str, p ? p - str : len);
+}
+
+void *xrealloc(void *ptr, size_t size)
+{
+       void *ret = realloc(ptr, size);
+       if (!ret && !size)
+               ret = realloc(ptr, 1);
+       if (!ret) {
+               release_pack_memory(size, -1);
+               ret = realloc(ptr, size);
+               if (!ret && !size)
+                       ret = realloc(ptr, 1);
+               if (!ret)
+                       die("Out of memory, realloc failed");
+       }
+       return ret;
+}
+
+void *xcalloc(size_t nmemb, size_t size)
+{
+       void *ret = calloc(nmemb, size);
+       if (!ret && (!nmemb || !size))
+               ret = calloc(1, 1);
+       if (!ret) {
+               release_pack_memory(nmemb * size, -1);
+               ret = calloc(nmemb, size);
+               if (!ret && (!nmemb || !size))
+                       ret = calloc(1, 1);
+               if (!ret)
+                       die("Out of memory, calloc failed");
+       }
+       return ret;
+}
+
+void *xmmap(void *start, size_t length,
+       int prot, int flags, int fd, off_t offset)
+{
+       void *ret = mmap(start, length, prot, flags, fd, offset);
+       if (ret == MAP_FAILED) {
+               if (!length)
+                       return NULL;
+               release_pack_memory(length, fd);
+               ret = mmap(start, length, prot, flags, fd, offset);
+               if (ret == MAP_FAILED)
+                       die("Out of memory? mmap failed: %s", strerror(errno));
+       }
+       return ret;
+}
+
+/*
+ * xread() is the same a read(), but it automatically restarts read()
+ * operations with a recoverable error (EAGAIN and EINTR). xread()
+ * DOES NOT GUARANTEE that "len" bytes is read even if the data is available.
+ */
+ssize_t xread(int fd, void *buf, size_t len)
+{
+       ssize_t nr;
+       while (1) {
+               nr = read(fd, buf, len);
+               if ((nr < 0) && (errno == EAGAIN || errno == EINTR))
+                       continue;
+               return nr;
+       }
+}
+
+/*
+ * xwrite() is the same a write(), but it automatically restarts write()
+ * operations with a recoverable error (EAGAIN and EINTR). xwrite() DOES NOT
+ * GUARANTEE that "len" bytes is written even if the operation is successful.
+ */
+ssize_t xwrite(int fd, const void *buf, size_t len)
+{
+       ssize_t nr;
+       while (1) {
+               nr = write(fd, buf, len);
+               if ((nr < 0) && (errno == EAGAIN || errno == EINTR))
+                       continue;
+               return nr;
+       }
+}
+
+ssize_t read_in_full(int fd, void *buf, size_t count)
+{
+       char *p = buf;
+       ssize_t total = 0;
+
+       while (count > 0) {
+               ssize_t loaded = xread(fd, p, count);
+               if (loaded <= 0)
+                       return total ? total : loaded;
+               count -= loaded;
+               p += loaded;
+               total += loaded;
+       }
+
+       return total;
+}
+
+ssize_t write_in_full(int fd, const void *buf, size_t count)
+{
+       const char *p = buf;
+       ssize_t total = 0;
+
+       while (count > 0) {
+               ssize_t written = xwrite(fd, p, count);
+               if (written < 0)
+                       return -1;
+               if (!written) {
+                       errno = ENOSPC;
+                       return -1;
+               }
+               count -= written;
+               p += written;
+               total += written;
+       }
+
+       return total;
+}
+
+int xdup(int fd)
+{
+       int ret = dup(fd);
+       if (ret < 0)
+               die("dup failed: %s", strerror(errno));
+       return ret;
+}
+
+FILE *xfdopen(int fd, const char *mode)
+{
+       FILE *stream = fdopen(fd, mode);
+       if (stream == NULL)
+               die("Out of memory? fdopen failed: %s", strerror(errno));
+       return stream;
+}
+
+int xmkstemp(char *template)
+{
+       int fd;
+
+       fd = mkstemp(template);
+       if (fd < 0)
+               die("Unable to create temporary file: %s", strerror(errno));
+       return fd;
+}
author	Robert Richter <robert.richter@amd.com>
	Fri, 12 Jun 2009 15:58:48 +0000 (17:58 +0200)
committer	Robert Richter <robert.richter@amd.com>
	Fri, 12 Jun 2009 15:58:48 +0000 (17:58 +0200)