Merge tag 'v3.10' into next
author    Benjamin Herrenschmidt <benh@kernel.crashing.org>
          Mon, 1 Jul 2013 07:57:25 +0000 (17:57 +1000)
committer Benjamin Herrenschmidt <benh@kernel.crashing.org>
          Mon, 1 Jul 2013 07:57:25 +0000 (17:57 +1000)
Merge 3.10 in order to get some of the last minute powerpc
changes, resolve conflicts and add additional fixes on top
of them.

154 files changed:
Documentation/powerpc/00-INDEX
Documentation/powerpc/pmu-ebb.txt [new file with mode: 0644]
Documentation/vfio.txt
MAINTAINERS
arch/powerpc/Kconfig
arch/powerpc/Kconfig.debug
arch/powerpc/boot/dts/currituck.dts
arch/powerpc/configs/c2k_defconfig
arch/powerpc/configs/g5_defconfig
arch/powerpc/configs/maple_defconfig
arch/powerpc/configs/pmac32_defconfig
arch/powerpc/configs/ppc64_defconfig
arch/powerpc/configs/ppc6xx_defconfig
arch/powerpc/configs/pseries_defconfig
arch/powerpc/include/asm/eeh.h
arch/powerpc/include/asm/eeh_event.h
arch/powerpc/include/asm/exception-64s.h
arch/powerpc/include/asm/hugetlb.h
arch/powerpc/include/asm/iommu.h
arch/powerpc/include/asm/kvm_book3s_64.h
arch/powerpc/include/asm/lppaca.h
arch/powerpc/include/asm/machdep.h
arch/powerpc/include/asm/mmu-hash64.h
arch/powerpc/include/asm/opal.h
arch/powerpc/include/asm/perf_event_server.h
arch/powerpc/include/asm/pgalloc-64.h
arch/powerpc/include/asm/pgtable-ppc64-64k.h
arch/powerpc/include/asm/pgtable-ppc64.h
arch/powerpc/include/asm/pgtable.h
arch/powerpc/include/asm/probes.h
arch/powerpc/include/asm/processor.h
arch/powerpc/include/asm/reg.h
arch/powerpc/include/asm/rtas.h
arch/powerpc/include/asm/switch_to.h
arch/powerpc/include/asm/tlbflush.h
arch/powerpc/include/asm/vdso.h
arch/powerpc/kernel/Makefile
arch/powerpc/kernel/asm-offsets.c
arch/powerpc/kernel/cacheinfo.c
arch/powerpc/kernel/eeh.c [moved from arch/powerpc/platforms/pseries/eeh.c with 86% similarity]
arch/powerpc/kernel/eeh_cache.c [moved from arch/powerpc/platforms/pseries/eeh_cache.c with 99% similarity]
arch/powerpc/kernel/eeh_dev.c [moved from arch/powerpc/platforms/pseries/eeh_dev.c with 100% similarity]
arch/powerpc/kernel/eeh_driver.c [moved from arch/powerpc/platforms/pseries/eeh_driver.c with 81% similarity]
arch/powerpc/kernel/eeh_event.c [moved from arch/powerpc/platforms/pseries/eeh_event.c with 56% similarity]
arch/powerpc/kernel/eeh_pe.c [moved from arch/powerpc/platforms/pseries/eeh_pe.c with 75% similarity]
arch/powerpc/kernel/eeh_sysfs.c [moved from arch/powerpc/platforms/pseries/eeh_sysfs.c with 99% similarity]
arch/powerpc/kernel/entry_64.S
arch/powerpc/kernel/exceptions-64s.S
arch/powerpc/kernel/hw_breakpoint.c
arch/powerpc/kernel/idle.c
arch/powerpc/kernel/io-workarounds.c
arch/powerpc/kernel/iommu.c
arch/powerpc/kernel/irq.c
arch/powerpc/kernel/kprobes.c
arch/powerpc/kernel/nvram_64.c
arch/powerpc/kernel/pci-hotplug.c [new file with mode: 0644]
arch/powerpc/kernel/process.c
arch/powerpc/kernel/prom.c
arch/powerpc/kernel/ptrace.c
arch/powerpc/kernel/reloc_32.S
arch/powerpc/kernel/rtas.c
arch/powerpc/kernel/setup_64.c
arch/powerpc/kernel/signal_32.c
arch/powerpc/kernel/signal_64.c
arch/powerpc/kernel/smp.c
arch/powerpc/kernel/sysfs.c
arch/powerpc/kernel/time.c
arch/powerpc/kernel/tm.S
arch/powerpc/kernel/traps.c
arch/powerpc/kernel/udbg.c
arch/powerpc/kernel/vdso.c
arch/powerpc/kvm/book3s_64_mmu_host.c
arch/powerpc/kvm/book3s_64_mmu_hv.c
arch/powerpc/kvm/book3s_hv_rm_mmu.c
arch/powerpc/lib/sstep.c
arch/powerpc/math-emu/Makefile
arch/powerpc/math-emu/fre.c [new file with mode: 0644]
arch/powerpc/math-emu/frsqrtes.c [new file with mode: 0644]
arch/powerpc/math-emu/math.c
arch/powerpc/mm/44x_mmu.c
arch/powerpc/mm/Makefile
arch/powerpc/mm/gup.c
arch/powerpc/mm/hash_low_64.S
arch/powerpc/mm/hash_native_64.c
arch/powerpc/mm/hash_utils_64.c
arch/powerpc/mm/hugepage-hash64.c [new file with mode: 0644]
arch/powerpc/mm/hugetlbpage-hash64.c
arch/powerpc/mm/hugetlbpage.c
arch/powerpc/mm/init_64.c
arch/powerpc/mm/mem.c
arch/powerpc/mm/mmap.c [moved from arch/powerpc/mm/mmap_64.c with 100% similarity]
arch/powerpc/mm/mmu_context_nohash.c
arch/powerpc/mm/numa.c
arch/powerpc/mm/pgtable.c
arch/powerpc/mm/pgtable_64.c
arch/powerpc/mm/subpage-prot.c
arch/powerpc/mm/tlb_hash64.c
arch/powerpc/mm/tlb_nohash.c
arch/powerpc/perf/core-book3s.c
arch/powerpc/perf/power8-pmu.c
arch/powerpc/platforms/44x/currituck.c
arch/powerpc/platforms/44x/iss4xx.c
arch/powerpc/platforms/85xx/smp.c
arch/powerpc/platforms/8xx/m8xx_setup.c
arch/powerpc/platforms/Kconfig
arch/powerpc/platforms/Kconfig.cputype
arch/powerpc/platforms/cell/beat_htab.c
arch/powerpc/platforms/cell/smp.c
arch/powerpc/platforms/powermac/smp.c
arch/powerpc/platforms/powernv/Makefile
arch/powerpc/platforms/powernv/eeh-ioda.c [new file with mode: 0644]
arch/powerpc/platforms/powernv/eeh-powernv.c [new file with mode: 0644]
arch/powerpc/platforms/powernv/opal-wrappers.S
arch/powerpc/platforms/powernv/opal.c
arch/powerpc/platforms/powernv/pci-ioda.c
arch/powerpc/platforms/powernv/pci-p5ioc2.c
arch/powerpc/platforms/powernv/pci.c
arch/powerpc/platforms/powernv/pci.h
arch/powerpc/platforms/powernv/setup.c
arch/powerpc/platforms/powernv/smp.c
arch/powerpc/platforms/ps3/htab.c
arch/powerpc/platforms/pseries/Kconfig
arch/powerpc/platforms/pseries/Makefile
arch/powerpc/platforms/pseries/io_event_irq.c
arch/powerpc/platforms/pseries/iommu.c
arch/powerpc/platforms/pseries/lpar.c
arch/powerpc/platforms/pseries/nvram.c
arch/powerpc/platforms/pseries/pci_dlpar.c
arch/powerpc/platforms/pseries/ras.c
arch/powerpc/platforms/pseries/smp.c
arch/powerpc/sysdev/cpm1.c
arch/s390/include/asm/pgtable.h
arch/s390/mm/pgtable.c
arch/sparc/include/asm/pgtable_64.h
arch/sparc/mm/tlb.c
drivers/iommu/Kconfig
drivers/macintosh/adb.c
drivers/macintosh/mac_hid.c
drivers/macintosh/via-cuda.c
drivers/macintosh/windfarm_pm121.c
drivers/macintosh/windfarm_pm81.c
drivers/macintosh/windfarm_pm91.c
drivers/macintosh/windfarm_smu_sat.c
drivers/vfio/Kconfig
drivers/vfio/Makefile
drivers/vfio/vfio.c
drivers/vfio/vfio_iommu_spapr_tce.c [new file with mode: 0644]
fs/pstore/inode.c
include/asm-generic/pgtable.h
include/linux/huge_mm.h
include/linux/pstore.h
include/uapi/linux/vfio.h
mm/huge_memory.c
mm/pgtable-generic.c

index dd9e92802ec09eee89b07e8b6b350b71fb44071b..05026ce1875e9969084166b04c60014c04ff863a 100644 (file)
@@ -14,6 +14,8 @@ hvcs.txt
        - IBM "Hypervisor Virtual Console Server" Installation Guide
 mpc52xx.txt
        - Linux 2.6.x on MPC52xx family
+pmu-ebb.txt
+       - Description of the API for using the PMU with Event Based Branches.
 qe_firmware.txt
        - describes the layout of firmware binaries for the Freescale QUICC
          Engine and the code that parses and uploads the microcode therein.
diff --git a/Documentation/powerpc/pmu-ebb.txt b/Documentation/powerpc/pmu-ebb.txt
new file mode 100644 (file)
index 0000000..73cd163
--- /dev/null
@@ -0,0 +1,137 @@
+PMU Event Based Branches
+========================
+
+Event Based Branches (EBBs) are a feature which allows the hardware to
+branch directly to a specified user space address when certain events occur.
+
+The full specification is available in Power ISA v2.07:
+
+  https://www.power.org/documentation/power-isa-version-2-07/
+
+One type of event for which EBBs can be configured is PMU exceptions. This
+document describes the API for configuring the Power PMU to generate EBBs,
+using the Linux perf_events API.
+
+
+Terminology
+-----------
+
+Throughout this document we will refer to an "EBB event" or "EBB events". This
+just refers to a struct perf_event which has set the "EBB" flag in its
+attr.config. All events which can be configured on the hardware PMU are
+possible "EBB events".
+
+
+Background
+----------
+
+When a PMU EBB occurs, it is delivered to the currently running process. As such,
+EBBs can only sensibly be used by programs for self-monitoring.
+
+It is a feature of the perf_events API that events can be created on other
+processes, subject to standard permission checks. This is also true of EBB
+events; however, unless the target process enables EBBs (via mtspr(BESCR)) no
+EBBs will ever be delivered.
+
+This makes it possible for a process to enable EBBs for itself, but not
+actually configure any events. At a later time another process can come along
+and attach an EBB event to the process, which will then cause EBBs to be
+delivered to the first process. It's not clear if this is actually useful.
+
+
+When the PMU is configured for EBBs, all PMU interrupts are delivered to the
+user process. This means once an EBB event is scheduled on the PMU, no non-EBB
+events can be configured. As a result, EBB events cannot be run
+concurrently with regular 'perf' commands, or any other perf events.
+
+It is however safe to run 'perf' commands on a process which is using EBBs. The
+kernel will in general schedule the EBB event, and perf will be notified that
+its events could not run.
+
+The exclusion between EBB events and regular events is implemented using the
+existing "pinned" and "exclusive" attributes of perf_events. This means EBB
+events will be given priority over other events, unless they are also pinned.
+If an EBB event and a regular event are both pinned, then whichever is enabled
+first will be scheduled and the other will be put in error state. See the
+section below titled "Enabling an EBB event" for more information.
+
+
+Creating an EBB event
+---------------------
+
+To request that an event is counted using EBB, the event code should have bit
+63 set.
+
+EBB events must be created with a particular, and restrictive, set of
+attributes - this is so that they interoperate correctly with the rest of the
+perf_events subsystem.
+
+An EBB event must be created with the "pinned" and "exclusive" attributes set.
+Note that if you are creating a group of EBB events, only the leader can have
+these attributes set.
+
+An EBB event must NOT set any of the "inherit", "sample_period", "freq" or
+"enable_on_exec" attributes.
+
+An EBB event must be attached to a task. This is specified to perf_event_open()
+by passing a pid value, typically 0 indicating the current task.
+
+All events in a group must agree on whether they want EBB. That is, all events
+must request EBB, or none may request EBB.
+
+EBB events must specify the PMC they are to be counted on. This ensures
+userspace is able to reliably determine which PMC the event is scheduled on.
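
As a rough illustration of the rules above - an editor's sketch, not part of
this patch - opening a pinned, exclusive EBB event on the current task with
perf_event_open() might look like this. RAW_EBB_EVENT_CODE is a placeholder
for a real raw PMU event encoding that also selects the PMC; bit 63 of the
config is the EBB request flag:

    /*
     * Sketch only: everything apart from the attribute rules described
     * above (pinned, exclusive, config bit 63) is an illustrative assumption.
     */
    #define _GNU_SOURCE
    #include <stdint.h>
    #include <string.h>
    #include <unistd.h>
    #include <sys/syscall.h>
    #include <linux/perf_event.h>

    static int open_ebb_event(uint64_t raw_ebb_event_code)
    {
            struct perf_event_attr attr;

            memset(&attr, 0, sizeof(attr));
            attr.type      = PERF_TYPE_RAW;
            attr.size      = sizeof(attr);
            attr.config    = raw_ebb_event_code | (1ULL << 63); /* request EBB */
            attr.pinned    = 1;  /* required, group leader only */
            attr.exclusive = 1;  /* required, group leader only */
            /* inherit, sample_period, freq and enable_on_exec must stay 0 */

            /* pid = 0: current task, cpu = -1: any CPU, no group fd, no flags */
            return syscall(__NR_perf_event_open, &attr, 0, -1, -1, 0);
    }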
+
+
+Enabling an EBB event
+---------------------
+
+Once an EBB event has been successfully opened, it must be enabled with the
+perf_events API. This can be achieved either via the ioctl() interface, or the
+prctl() interface.
+
+However, due to the design of the perf_events API, enabling an event does not
+guarantee that it has been scheduled on the PMU. To ensure that the EBB event
+has been scheduled on the PMU, you must perform a read() on the event. If the
+read() returns EOF, then the event has not been scheduled and EBBs are not
+enabled.
+
+This behaviour occurs because the EBB event is pinned and exclusive. When the
+EBB event is enabled it will force all other non-pinned events off the PMU. In
+this case the enable will be successful. However if there is already an event
+pinned on the PMU then the enable will not be successful.
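
A similarly hedged sketch (again, an illustration rather than part of the
patch) of enabling the event and confirming via read() that it actually made
it onto the PMU:

    #include <stdint.h>
    #include <unistd.h>
    #include <sys/ioctl.h>
    #include <linux/perf_event.h>

    /* Returns 0 if the EBB event is enabled and scheduled, -1 otherwise */
    static int enable_and_check_ebb(int ebb_fd)
    {
            uint64_t count;

            if (ioctl(ebb_fd, PERF_EVENT_IOC_ENABLE, 0) != 0)
                    return -1;

            /*
             * A short read (EOF) means another pinned event kept us off the
             * PMU; the value itself is meaningless (see "Reading an EBB
             * event" below).
             */
            if (read(ebb_fd, &count, sizeof(count)) != sizeof(count))
                    return -1;

            return 0;
    }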
+
+
+Reading an EBB event
+--------------------
+
+It is possible to read() from an EBB event. However, the results are
+meaningless. Because interrupts are being delivered to the user process, the
+kernel is not able to count the event, and so will return a junk value.
+
+
+Closing an EBB event
+--------------------
+
+When an EBB event is finished with, you can close it using close() as for any
+regular event. If this is the last EBB event the PMU will be deconfigured and
+no further PMU EBBs will be delivered.
+
+
+EBB Handler
+-----------
+
+The EBB handler is just regular userspace code; however, it must be written in
+the style of an interrupt handler. When the handler is entered, all registers
+are potentially live and so must be saved somehow before the handler can invoke
+other code.
+
+It's up to the program how to handle this. For C programs a relatively simple
+option is to create an interrupt frame on the stack and save registers there.
+
+Fork
+----
+
+EBB events are not inherited across fork. If the child process wishes to use
+EBBs, it should open a new event for itself. Similarly, the EBB state in
+BESCR/EBBHR/EBBRR is cleared across fork().
index 8eda3635a17da0e8d610a01e88d928a68f833f93..c55533c0adb39b5d3a243ba02a7d9a6c3c99489e 100644 (file)
@@ -283,6 +283,69 @@ a direct pass through for VFIO_DEVICE_* ioctls.  The read/write/mmap
 interfaces implement the device region access defined by the device's
 own VFIO_DEVICE_GET_REGION_INFO ioctl.
 
+
+PPC64 sPAPR implementation note
+-------------------------------------------------------------------------------
+
+This implementation has some specifics:
+
+1) Only one IOMMU group per container is supported, as an IOMMU group
+represents the minimal entity for which isolation can be guaranteed, and
+groups are allocated statically, one per Partitionable Endpoint (PE)
+(a PE is often a PCI domain, but not always).
+
+2) The hardware supports so-called DMA windows - the PCI address range
+within which DMA transfer is allowed; any attempt to access address space
+outside the window leads to isolation of the whole PE.
+
+3) PPC64 guests are paravirtualized but not fully emulated. There is an API
+to map/unmap pages for DMA; it normally maps 1..32 pages per call and
+currently there is no way to reduce the number of calls. To make things
+faster, the map/unmap handling has been implemented in real mode, which gives
+excellent performance but has limitations such as the inability to do
+locked pages accounting in real time.
+
+So 3 additional ioctls have been added:
+
+       VFIO_IOMMU_SPAPR_TCE_GET_INFO - returns the size and the start
+               of the DMA window on the PCI bus.
+
+       VFIO_IOMMU_ENABLE - enables the container. The locked pages accounting
+               is done at this point. This lets the user first find out what
+               the DMA window is and adjust the rlimit before doing any real work.
+
+       VFIO_IOMMU_DISABLE - disables the container.
+
+
+The code flow from the example above should be slightly changed:
+
+       .....
+       /* Add the group to the container */
+       ioctl(group, VFIO_GROUP_SET_CONTAINER, &container);
+
+       /* Enable the IOMMU model we want */
+       ioctl(container, VFIO_SET_IOMMU, VFIO_SPAPR_TCE_IOMMU)
+
+       /* Get additional sPAPR IOMMU info */
+       vfio_iommu_spapr_tce_info spapr_iommu_info;
+       ioctl(container, VFIO_IOMMU_SPAPR_TCE_GET_INFO, &spapr_iommu_info);
+
+       if (ioctl(container, VFIO_IOMMU_ENABLE))
+               /* Cannot enable container, may be low rlimit */
+
+       /* Allocate some space and setup a DMA mapping */
+       dma_map.vaddr = mmap(0, 1024 * 1024, PROT_READ | PROT_WRITE,
+                            MAP_PRIVATE | MAP_ANONYMOUS, 0, 0);
+
+       dma_map.size = 1024 * 1024;
+       dma_map.iova = 0; /* 1MB starting at 0x0 from device view */
+       dma_map.flags = VFIO_DMA_MAP_FLAG_READ | VFIO_DMA_MAP_FLAG_WRITE;
+
+       /* Check here if .iova/.size are within the DMA window from spapr_iommu_info */
+
+       ioctl(container, VFIO_IOMMU_MAP_DMA, &dma_map);
+       .....
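
As a hedged expansion of the window check mentioned in the comment above
(assuming the dma32_window_start and dma32_window_size fields of
struct vfio_iommu_spapr_tce_info introduced by this series), the check could
be spelled out roughly as:

        /* Editor's sketch - reject mappings that fall outside the DMA window */
        if (dma_map.iova < spapr_iommu_info.dma32_window_start ||
            dma_map.iova + dma_map.size >
                spapr_iommu_info.dma32_window_start +
                spapr_iommu_info.dma32_window_size)
                /* .iova/.size are outside the DMA window - do not map */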
+
 -------------------------------------------------------------------------------
 
 [1] VFIO was originally an acronym for "Virtual Function I/O" in its
index ad7e322ad17b8bb985c73617407792fdc92eb33d..b888b1a4d247c5564e2cbb132a692078ca5e6f4f 100644 (file)
@@ -3103,6 +3103,13 @@ M:       Maxim Levitsky <maximlevitsky@gmail.com>
 S:     Maintained
 F:     drivers/media/rc/ene_ir.*
 
+ENHANCED ERROR HANDLING (EEH)
+M:     Gavin Shan <shangw@linux.vnet.ibm.com>
+L:     linuxppc-dev@lists.ozlabs.org
+S:     Supported
+F:     Documentation/powerpc/eeh-pci-error-recovery.txt
+F:     arch/powerpc/kernel/eeh*.c
+
 EPSON S1D13XXX FRAMEBUFFER DRIVER
 M:     Kristoffer Ericson <kristoffer.ericson@gmail.com>
 S:     Maintained
@@ -6149,7 +6156,6 @@ M:        Linas Vepstas <linasvepstas@gmail.com>
 L:     linux-pci@vger.kernel.org
 S:     Supported
 F:     Documentation/PCI/pci-error-recovery.txt
-F:     Documentation/powerpc/eeh-pci-error-recovery.txt
 
 PCI SUBSYSTEM
 M:     Bjorn Helgaas <bhelgaas@google.com>
index c33e3ad2c8fd52c9e0c31dfc272faf3d34902f37..5374776b4c7c12a18a847abec17023b9df7dc438 100644 (file)
@@ -298,7 +298,7 @@ config HUGETLB_PAGE_SIZE_VARIABLE
 
 config MATH_EMULATION
        bool "Math emulation"
-       depends on 4xx || 8xx || E200 || PPC_MPC832x || E500
+       depends on 4xx || 8xx || PPC_MPC832x || BOOKE
        ---help---
          Some PowerPC chips designed for embedded applications do not have
          a floating-point unit and therefore do not implement the
@@ -307,6 +307,10 @@ config MATH_EMULATION
          unit, which will allow programs that use floating-point
          instructions to run.
 
+         This is also useful to emulate missing (optional) instructions
+         such as fsqrt on cores that do have an FPU but do not implement
+         them (such as Freescale BookE).
+
 config PPC_TRANSACTIONAL_MEM
        bool "Transactional Memory support for POWERPC"
        depends on PPC_BOOK3S_64
@@ -315,17 +319,6 @@ config PPC_TRANSACTIONAL_MEM
        ---help---
          Support user-mode Transactional Memory on POWERPC.
 
-config 8XX_MINIMAL_FPEMU
-       bool "Minimal math emulation for 8xx"
-       depends on 8xx && !MATH_EMULATION
-       help
-         Older arch/ppc kernels still emulated a few floating point
-         instructions such as load and store, even when full math
-         emulation is disabled.  Say "Y" here if you want to preserve
-         this behavior.
-
-         It is recommended that you build a soft-float userspace instead.
-
 config IOMMU_HELPER
        def_bool PPC64
 
index 863d877e0b5f7444f31239a2412bdec68165abea..d86875f3e17e2a66a39a3ca3983ccae5b5ffa987 100644 (file)
@@ -147,6 +147,13 @@ choice
          enable debugging for the wrong type of machine your kernel
          _will not boot_.
 
+config PPC_EARLY_DEBUG_BOOTX
+       bool "BootX or OpenFirmware"
+       depends on BOOTX_TEXT
+       help
+         Select this to enable early debugging for a machine using BootX
+         or OpenFirmware.
+
 config PPC_EARLY_DEBUG_LPAR
        bool "LPAR HV Console"
        depends on PPC_PSERIES
index b801dd06e5733206a51469845a9b21a8a9f423de..d2c8a872308e64d29665b1f5aada5d533fd36c5c 100644 (file)
                                interrupts = <34 2>;
                        };
 
+                       FPGA0: fpga@50000000 {
+                               compatible = "ibm,currituck-fpga";
+                               reg = <0x50000000 0x4>;
+                       };
+
                        IIC0: i2c@00000000 {
                                compatible = "ibm,iic-currituck", "ibm,iic";
                                reg = <0x0 0x00000014>;
index 2a84fd7f631cf973fd0ce195312e5e1fb458c07a..671a8f960afa02b8d183a2c0eb069df9a791efc4 100644 (file)
@@ -423,6 +423,8 @@ CONFIG_SYSCTL_SYSCALL_CHECK=y
 CONFIG_DEBUG_STACKOVERFLOW=y
 CONFIG_DEBUG_STACK_USAGE=y
 CONFIG_BOOTX_TEXT=y
+CONFIG_PPC_EARLY_DEBUG=y
+CONFIG_PPC_EARLY_DEBUG_BOOTX=y
 CONFIG_KEYS=y
 CONFIG_KEYS_DEBUG_PROC_KEYS=y
 CONFIG_SECURITY=y
index 07b7f2af2dca7ed9e06f7baea8e46f904fcb8aba..1ea22fc24ea8be8dc9bfa5492f3c80f42a61f886 100644 (file)
@@ -284,6 +284,8 @@ CONFIG_DEBUG_MUTEXES=y
 CONFIG_LATENCYTOP=y
 CONFIG_SYSCTL_SYSCALL_CHECK=y
 CONFIG_BOOTX_TEXT=y
+CONFIG_PPC_EARLY_DEBUG=y
+CONFIG_PPC_EARLY_DEBUG_BOOTX=y
 CONFIG_CRYPTO_NULL=m
 CONFIG_CRYPTO_TEST=m
 CONFIG_CRYPTO_ECB=m
index 02ac96b679b8585a943b60faaf338aeed434c95b..2a5afac2986100115d2a84f889685ee6ed299045 100644 (file)
@@ -138,6 +138,8 @@ CONFIG_DEBUG_STACK_USAGE=y
 CONFIG_XMON=y
 CONFIG_XMON_DEFAULT=y
 CONFIG_BOOTX_TEXT=y
+CONFIG_PPC_EARLY_DEBUG=y
+CONFIG_PPC_EARLY_DEBUG_BOOTX=y
 CONFIG_CRYPTO_ECB=m
 CONFIG_CRYPTO_PCBC=m
 # CONFIG_CRYPTO_ANSI_CPRNG is not set
index 29767a8dfea5173a5cf852f005d20fa60b7998bd..a73626b0905173507abe8712a4916ab90295a17b 100644 (file)
@@ -350,6 +350,8 @@ CONFIG_SYSCTL_SYSCALL_CHECK=y
 CONFIG_XMON=y
 CONFIG_XMON_DEFAULT=y
 CONFIG_BOOTX_TEXT=y
+CONFIG_PPC_EARLY_DEBUG=y
+CONFIG_PPC_EARLY_DEBUG_BOOTX=y
 CONFIG_CRYPTO_NULL=m
 CONFIG_CRYPTO_PCBC=m
 CONFIG_CRYPTO_MD4=m
index aef3f71de5adf55a97e207fc7f201a93322c6187..c86fcb92358e27bc93bd723375d04c9e1c247071 100644 (file)
@@ -398,6 +398,8 @@ CONFIG_FTR_FIXUP_SELFTEST=y
 CONFIG_MSI_BITMAP_SELFTEST=y
 CONFIG_XMON=y
 CONFIG_BOOTX_TEXT=y
+CONFIG_PPC_EARLY_DEBUG=y
+CONFIG_PPC_EARLY_DEBUG_BOOTX=y
 CONFIG_CRYPTO_NULL=m
 CONFIG_CRYPTO_TEST=m
 CONFIG_CRYPTO_PCBC=m
index be1cb6ea3a36191e6e34e7b9d0bd046028e32060..20ebfaf7234b1976c3889cb5d32a6015c2e39dcf 100644 (file)
@@ -1264,6 +1264,8 @@ CONFIG_DEBUG_STACKOVERFLOW=y
 CONFIG_DEBUG_STACK_USAGE=y
 CONFIG_XMON=y
 CONFIG_BOOTX_TEXT=y
+CONFIG_PPC_EARLY_DEBUG=y
+CONFIG_PPC_EARLY_DEBUG_BOOTX=y
 CONFIG_KEYS=y
 CONFIG_KEYS_DEBUG_PROC_KEYS=y
 CONFIG_SECURITY=y
index c4dfbaf8b19243987c7647ef0aef7147166e2bf4..bea8587c3af509b1540fe6138614f73dcbf97c9c 100644 (file)
@@ -296,6 +296,7 @@ CONFIG_SQUASHFS=m
 CONFIG_SQUASHFS_XATTR=y
 CONFIG_SQUASHFS_LZO=y
 CONFIG_SQUASHFS_XZ=y
+CONFIG_PSTORE=y
 CONFIG_NFS_FS=y
 CONFIG_NFS_V3_ACL=y
 CONFIG_NFS_V4=y
index a80e32b46c118ee4166b590be84a6a2b898c0d0c..09a8743143f37a046e8d57e3d65f58f9a1bd3cb4 100644 (file)
@@ -24,6 +24,7 @@
 #include <linux/init.h>
 #include <linux/list.h>
 #include <linux/string.h>
+#include <linux/time.h>
 
 struct pci_dev;
 struct pci_bus;
@@ -52,6 +53,7 @@ struct device_node;
 
 #define EEH_PE_ISOLATED                (1 << 0)        /* Isolated PE          */
 #define EEH_PE_RECOVERING      (1 << 1)        /* Recovering PE        */
+#define EEH_PE_PHB_DEAD                (1 << 2)        /* Dead PHB             */
 
 struct eeh_pe {
        int type;                       /* PE type: PHB/Bus/Device      */
@@ -59,8 +61,10 @@ struct eeh_pe {
        int config_addr;                /* Traditional PCI address      */
        int addr;                       /* PE configuration address     */
        struct pci_controller *phb;     /* Associated PHB               */
+       struct pci_bus *bus;            /* Top PCI bus for bus PE       */
        int check_count;                /* Times of ignored error       */
        int freeze_count;               /* Times of froze up            */
+       struct timeval tstamp;          /* Time on first-time freeze    */
        int false_positives;            /* Times of reported #ff's      */
        struct eeh_pe *parent;          /* Parent PE                    */
        struct list_head child_list;    /* Link PE to the child list    */
@@ -95,12 +99,12 @@ struct eeh_dev {
 
 static inline struct device_node *eeh_dev_to_of_node(struct eeh_dev *edev)
 {
-       return edev->dn;
+       return edev ? edev->dn : NULL;
 }
 
 static inline struct pci_dev *eeh_dev_to_pci_dev(struct eeh_dev *edev)
 {
-       return edev->pdev;
+       return edev ? edev->pdev : NULL;
 }
 
 /*
@@ -130,8 +134,9 @@ static inline struct pci_dev *eeh_dev_to_pci_dev(struct eeh_dev *edev)
 struct eeh_ops {
        char *name;
        int (*init)(void);
+       int (*post_init)(void);
        void* (*of_probe)(struct device_node *dn, void *flag);
-       void* (*dev_probe)(struct pci_dev *dev, void *flag);
+       int (*dev_probe)(struct pci_dev *dev, void *flag);
        int (*set_option)(struct eeh_pe *pe, int option);
        int (*get_pe_addr)(struct eeh_pe *pe);
        int (*get_state)(struct eeh_pe *pe, int *state);
@@ -141,11 +146,12 @@ struct eeh_ops {
        int (*configure_bridge)(struct eeh_pe *pe);
        int (*read_config)(struct device_node *dn, int where, int size, u32 *val);
        int (*write_config)(struct device_node *dn, int where, int size, u32 val);
+       int (*next_error)(struct eeh_pe **pe);
 };
 
 extern struct eeh_ops *eeh_ops;
 extern int eeh_subsystem_enabled;
-extern struct mutex eeh_mutex;
+extern raw_spinlock_t confirm_error_lock;
 extern int eeh_probe_mode;
 
 #define EEH_PROBE_MODE_DEV     (1<<0)  /* From PCI device      */
@@ -166,14 +172,14 @@ static inline int eeh_probe_mode_dev(void)
        return (eeh_probe_mode == EEH_PROBE_MODE_DEV);
 }
 
-static inline void eeh_lock(void)
+static inline void eeh_serialize_lock(unsigned long *flags)
 {
-       mutex_lock(&eeh_mutex);
+       raw_spin_lock_irqsave(&confirm_error_lock, *flags);
 }
 
-static inline void eeh_unlock(void)
+static inline void eeh_serialize_unlock(unsigned long flags)
 {
-       mutex_unlock(&eeh_mutex);
+       raw_spin_unlock_irqrestore(&confirm_error_lock, flags);
 }
 
 /*
@@ -184,8 +190,11 @@ static inline void eeh_unlock(void)
 
 typedef void *(*eeh_traverse_func)(void *data, void *flag);
 int eeh_phb_pe_create(struct pci_controller *phb);
+struct eeh_pe *eeh_phb_pe_get(struct pci_controller *phb);
+struct eeh_pe *eeh_pe_get(struct eeh_dev *edev);
 int eeh_add_to_parent_pe(struct eeh_dev *edev);
 int eeh_rmv_from_parent_pe(struct eeh_dev *edev, int purge_pe);
+void eeh_pe_update_time_stamp(struct eeh_pe *pe);
 void *eeh_pe_dev_traverse(struct eeh_pe *root,
                eeh_traverse_func fn, void *flag);
 void eeh_pe_restore_bars(struct eeh_pe *pe);
@@ -193,12 +202,13 @@ struct pci_bus *eeh_pe_bus_get(struct eeh_pe *pe);
 
 void *eeh_dev_init(struct device_node *dn, void *data);
 void eeh_dev_phb_init_dynamic(struct pci_controller *phb);
+int eeh_init(void);
 int __init eeh_ops_register(struct eeh_ops *ops);
 int __exit eeh_ops_unregister(const char *name);
 unsigned long eeh_check_failure(const volatile void __iomem *token,
                                unsigned long val);
 int eeh_dev_check_failure(struct eeh_dev *edev);
-void __init eeh_addr_cache_build(void);
+void eeh_addr_cache_build(void);
 void eeh_add_device_tree_early(struct device_node *);
 void eeh_add_device_tree_late(struct pci_bus *);
 void eeh_add_sysfs_files(struct pci_bus *);
@@ -221,6 +231,11 @@ void eeh_remove_bus_device(struct pci_dev *, int);
 
 #else /* !CONFIG_EEH */
 
+static inline int eeh_init(void)
+{
+       return 0;
+}
+
 static inline void *eeh_dev_init(struct device_node *dn, void *data)
 {
        return NULL;
@@ -245,9 +260,6 @@ static inline void eeh_add_sysfs_files(struct pci_bus *bus) { }
 
 static inline void eeh_remove_bus_device(struct pci_dev *dev, int purge_pe) { }
 
-static inline void eeh_lock(void) { }
-static inline void eeh_unlock(void) { }
-
 #define EEH_POSSIBLE_ERROR(val, type) (0)
 #define EEH_IO_ERROR_VALUE(size) (-1UL)
 #endif /* CONFIG_EEH */
index de67d830151be7f9cf3067a2a5768eeb48fed6af..89d5670b2eeb400ec659793fc3e960cc6d5894a3 100644 (file)
@@ -31,7 +31,9 @@ struct eeh_event {
        struct eeh_pe           *pe;    /* EEH PE               */
 };
 
+int eeh_event_init(void);
 int eeh_send_failure_event(struct eeh_pe *pe);
+void eeh_remove_event(struct eeh_pe *pe);
 void eeh_handle_event(struct eeh_pe *pe);
 
 #endif /* __KERNEL__ */
index 46793b58a761d549d7bf69814530c128352f26a5..07ca627e52c0b25ed45fb61128b3dc2210a4f541 100644 (file)
@@ -358,12 +358,12 @@ label##_relon_pSeries:                                    \
        /* No guest interrupts come through here */     \
        SET_SCRATCH0(r13);              /* save r13 */  \
        EXCEPTION_RELON_PROLOG_PSERIES(PACA_EXGEN, label##_common, \
-                                      EXC_STD, KVMTEST_PR, vec)
+                                      EXC_STD, NOTEST, vec)
 
 #define STD_RELON_EXCEPTION_PSERIES_OOL(vec, label)            \
        .globl label##_relon_pSeries;                           \
 label##_relon_pSeries:                                         \
-       EXCEPTION_PROLOG_1(PACA_EXGEN, KVMTEST_PR, vec);        \
+       EXCEPTION_PROLOG_1(PACA_EXGEN, NOTEST, vec);            \
        EXCEPTION_RELON_PROLOG_PSERIES_1(label##_common, EXC_STD)
 
 #define STD_RELON_EXCEPTION_HV(loc, vec, label)                \
@@ -374,12 +374,12 @@ label##_relon_hv:                                 \
        /* No guest interrupts come through here */     \
        SET_SCRATCH0(r13);      /* save r13 */          \
        EXCEPTION_RELON_PROLOG_PSERIES(PACA_EXGEN, label##_common, \
-                                      EXC_HV, KVMTEST, vec)
+                                      EXC_HV, NOTEST, vec)
 
 #define STD_RELON_EXCEPTION_HV_OOL(vec, label)                 \
        .globl label##_relon_hv;                                \
 label##_relon_hv:                                              \
-       EXCEPTION_PROLOG_1(PACA_EXGEN, KVMTEST, vec);           \
+       EXCEPTION_PROLOG_1(PACA_EXGEN, NOTEST, vec);            \
        EXCEPTION_RELON_PROLOG_PSERIES_1(label##_common, EXC_HV)
 
 /* This associate vector numbers with bits in paca->irq_happened */
index f2498c8e595d0b5c936e0a2981643d2323cc462c..d750336b171db4cf2c75f7e1fb95cbf6bcba8d74 100644 (file)
@@ -191,8 +191,14 @@ static inline void flush_hugetlb_page(struct vm_area_struct *vma,
                                      unsigned long vmaddr)
 {
 }
-#endif /* CONFIG_HUGETLB_PAGE */
 
+#define hugepd_shift(x) 0
+static inline pte_t *hugepte_offset(hugepd_t *hpdp, unsigned long addr,
+                                   unsigned pdshift)
+{
+       return 0;
+}
+#endif /* CONFIG_HUGETLB_PAGE */
 
 /*
  * FSL Book3E platforms require special gpage handling - the gpages
index cbfe678e3dbea1ac8b33a087d736e3d102a741a4..c34656a8925e2f8c3d64d42f7f4efd79dbe36872 100644 (file)
@@ -76,6 +76,9 @@ struct iommu_table {
        struct iommu_pool large_pool;
        struct iommu_pool pools[IOMMU_NR_POOLS];
        unsigned long *it_map;       /* A simple allocation bitmap for now */
+#ifdef CONFIG_IOMMU_API
+       struct iommu_group *it_group;
+#endif
 };
 
 struct scatterlist;
@@ -98,6 +101,8 @@ extern void iommu_free_table(struct iommu_table *tbl, const char *node_name);
  */
 extern struct iommu_table *iommu_init_table(struct iommu_table * tbl,
                                            int nid);
+extern void iommu_register_group(struct iommu_table *tbl,
+                                int pci_domain_number, unsigned long pe_num);
 
 extern int iommu_map_sg(struct device *dev, struct iommu_table *tbl,
                        struct scatterlist *sglist, int nelems,
@@ -125,13 +130,6 @@ extern void iommu_init_early_pSeries(void);
 extern void iommu_init_early_dart(void);
 extern void iommu_init_early_pasemi(void);
 
-#ifdef CONFIG_PCI
-extern void pci_iommu_init(void);
-extern void pci_direct_iommu_init(void);
-#else
-static inline void pci_iommu_init(void) { }
-#endif
-
 extern void alloc_dart_table(void);
 #if defined(CONFIG_PPC64) && defined(CONFIG_PM)
 static inline void iommu_save(void)
@@ -147,5 +145,26 @@ static inline void iommu_restore(void)
 }
 #endif
 
+/* The API to support IOMMU operations for VFIO */
+extern int iommu_tce_clear_param_check(struct iommu_table *tbl,
+               unsigned long ioba, unsigned long tce_value,
+               unsigned long npages);
+extern int iommu_tce_put_param_check(struct iommu_table *tbl,
+               unsigned long ioba, unsigned long tce);
+extern int iommu_tce_build(struct iommu_table *tbl, unsigned long entry,
+               unsigned long hwaddr, enum dma_data_direction direction);
+extern unsigned long iommu_clear_tce(struct iommu_table *tbl,
+               unsigned long entry);
+extern int iommu_clear_tces_and_put_pages(struct iommu_table *tbl,
+               unsigned long entry, unsigned long pages);
+extern int iommu_put_tce_user_mode(struct iommu_table *tbl,
+               unsigned long entry, unsigned long tce);
+
+extern void iommu_flush_tce(struct iommu_table *tbl);
+extern int iommu_take_ownership(struct iommu_table *tbl);
+extern void iommu_release_ownership(struct iommu_table *tbl);
+
+extern enum dma_data_direction iommu_tce_direction(unsigned long tce);
+
 #endif /* __KERNEL__ */
 #endif /* _ASM_IOMMU_H */
index 9c1ff330c8053563545b9a0e7cfc72e62798365e..a1ecb14e4442d564fac6208e4bc445825beb0ff6 100644 (file)
@@ -159,36 +159,46 @@ static inline int hpte_cache_flags_ok(unsigned long ptel, unsigned long io_type)
 }
 
 /*
- * Lock and read a linux PTE.  If it's present and writable, atomically
- * set dirty and referenced bits and return the PTE, otherwise return 0.
+ * If it's present and writable, atomically set dirty and referenced bits and
+ * return the PTE, otherwise return 0. If we find a transparent hugepage
+ * and if it is marked splitting we return 0;
  */
-static inline pte_t kvmppc_read_update_linux_pte(pte_t *p, int writing)
+static inline pte_t kvmppc_read_update_linux_pte(pte_t *ptep, int writing,
+                                                unsigned int hugepage)
 {
-       pte_t pte, tmp;
-
-       /* wait until _PAGE_BUSY is clear then set it atomically */
-       __asm__ __volatile__ (
-               "1:     ldarx   %0,0,%3\n"
-               "       andi.   %1,%0,%4\n"
-               "       bne-    1b\n"
-               "       ori     %1,%0,%4\n"
-               "       stdcx.  %1,0,%3\n"
-               "       bne-    1b"
-               : "=&r" (pte), "=&r" (tmp), "=m" (*p)
-               : "r" (p), "i" (_PAGE_BUSY)
-               : "cc");
-
-       if (pte_present(pte)) {
-               pte = pte_mkyoung(pte);
-               if (writing && pte_write(pte))
-                       pte = pte_mkdirty(pte);
-       }
+       pte_t old_pte, new_pte = __pte(0);
+
+       while (1) {
+               old_pte = pte_val(*ptep);
+               /*
+                * wait until _PAGE_BUSY is clear then set it atomically
+                */
+               if (unlikely(old_pte & _PAGE_BUSY)) {
+                       cpu_relax();
+                       continue;
+               }
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+               /* If hugepage and is trans splitting return None */
+               if (unlikely(hugepage &&
+                            pmd_trans_splitting(pte_pmd(old_pte))))
+                       return __pte(0);
+#endif
+               /* If pte is not present return None */
+               if (unlikely(!(old_pte & _PAGE_PRESENT)))
+                       return __pte(0);
 
-       *p = pte;       /* clears _PAGE_BUSY */
+               new_pte = pte_mkyoung(old_pte);
+               if (writing && pte_write(old_pte))
+                       new_pte = pte_mkdirty(new_pte);
 
-       return pte;
+               if (old_pte == __cmpxchg_u64((unsigned long *)ptep, old_pte,
+                                            new_pte))
+                       break;
+       }
+       return new_pte;
 }
 
+
 /* Return HPTE cache control bits corresponding to Linux pte bits */
 static inline unsigned long hpte_cache_bits(unsigned long pte_val)
 {
index b1e7f2af1016c82e6584731a0d0112b80a5eb277..9b12f88d4adb4f0977c46bedf2f555b055f71955 100644 (file)
@@ -66,7 +66,8 @@ struct lppaca {
 
        u8      reserved6[48];
        u8      cede_latency_hint;
-       u8      reserved7[7];
+       u8      ebb_regs_in_use;
+       u8      reserved7[6];
        u8      dtl_enable_mask;        /* Dispatch Trace Log mask */
        u8      donate_dedicated_cpu;   /* Donate dedicated CPU cycles */
        u8      fpregs_in_use;
index 92386fc4e82a76ac04c2c4f6bfa053304bf2cf1d..8b480901165a5222634c60c81408651e3da76a38 100644 (file)
@@ -36,13 +36,13 @@ struct machdep_calls {
 #ifdef CONFIG_PPC64
        void            (*hpte_invalidate)(unsigned long slot,
                                           unsigned long vpn,
-                                          int psize, int ssize,
-                                          int local);
+                                          int bpsize, int apsize,
+                                          int ssize, int local);
        long            (*hpte_updatepp)(unsigned long slot, 
                                         unsigned long newpp, 
                                         unsigned long vpn,
-                                        int psize, int ssize,
-                                        int local);
+                                        int bpsize, int apsize,
+                                        int ssize, int local);
        void            (*hpte_updateboltedpp)(unsigned long newpp, 
                                               unsigned long ea,
                                               int psize, int ssize);
@@ -57,6 +57,9 @@ struct machdep_calls {
        void            (*hpte_removebolted)(unsigned long ea,
                                             int psize, int ssize);
        void            (*flush_hash_range)(unsigned long number, int local);
+       void            (*hugepage_invalidate)(struct mm_struct *mm,
+                                              unsigned char *hpte_slot_array,
+                                              unsigned long addr, int psize);
 
        /* special for kexec, to be called in real mode, linear mapping is
         * destroyed as well */
index 2accc9611248ff9a36357b9bcc9078f00743af4c..c4cf01197273f8a88203d35c34b1516e356040a4 100644 (file)
@@ -340,6 +340,20 @@ extern int hash_page(unsigned long ea, unsigned long access, unsigned long trap)
 int __hash_page_huge(unsigned long ea, unsigned long access, unsigned long vsid,
                     pte_t *ptep, unsigned long trap, int local, int ssize,
                     unsigned int shift, unsigned int mmu_psize);
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+extern int __hash_page_thp(unsigned long ea, unsigned long access,
+                          unsigned long vsid, pmd_t *pmdp, unsigned long trap,
+                          int local, int ssize, unsigned int psize);
+#else
+static inline int __hash_page_thp(unsigned long ea, unsigned long access,
+                                 unsigned long vsid, pmd_t *pmdp,
+                                 unsigned long trap, int local,
+                                 int ssize, unsigned int psize)
+{
+       BUG();
+       return -1;
+}
+#endif
 extern void hash_failure_debug(unsigned long ea, unsigned long access,
                               unsigned long vsid, unsigned long trap,
                               int ssize, int psize, int lpsize,
index cbb9305ab15affb2035f723be351e3a7431e2955..029fe85722aaac41a6c5ff0e0ed4c3ae4dd5cfc2 100644 (file)
@@ -117,7 +117,13 @@ extern int opal_enter_rtas(struct rtas_args *args,
 #define OPAL_SET_SLOT_LED_STATUS               55
 #define OPAL_GET_EPOW_STATUS                   56
 #define OPAL_SET_SYSTEM_ATTENTION_LED          57
+#define OPAL_RESERVED1                         58
+#define OPAL_RESERVED2                         59
+#define OPAL_PCI_NEXT_ERROR                    60
+#define OPAL_PCI_EEH_FREEZE_STATUS2            61
+#define OPAL_PCI_POLL                          62
 #define OPAL_PCI_MSI_EOI                       63
+#define OPAL_PCI_GET_PHB_DIAG_DATA2            64
 
 #ifndef __ASSEMBLY__
 
@@ -125,6 +131,7 @@ extern int opal_enter_rtas(struct rtas_args *args,
 enum OpalVendorApiTokens {
        OPAL_START_VENDOR_API_RANGE = 1000, OPAL_END_VENDOR_API_RANGE = 1999
 };
+
 enum OpalFreezeState {
        OPAL_EEH_STOPPED_NOT_FROZEN = 0,
        OPAL_EEH_STOPPED_MMIO_FREEZE = 1,
@@ -134,55 +141,69 @@ enum OpalFreezeState {
        OPAL_EEH_STOPPED_TEMP_UNAVAIL = 5,
        OPAL_EEH_STOPPED_PERM_UNAVAIL = 6
 };
+
 enum OpalEehFreezeActionToken {
        OPAL_EEH_ACTION_CLEAR_FREEZE_MMIO = 1,
        OPAL_EEH_ACTION_CLEAR_FREEZE_DMA = 2,
        OPAL_EEH_ACTION_CLEAR_FREEZE_ALL = 3
 };
+
 enum OpalPciStatusToken {
-       OPAL_EEH_PHB_NO_ERROR = 0,
-       OPAL_EEH_PHB_FATAL = 1,
-       OPAL_EEH_PHB_RECOVERABLE = 2,
-       OPAL_EEH_PHB_BUS_ERROR = 3,
-       OPAL_EEH_PCI_NO_DEVSEL = 4,
-       OPAL_EEH_PCI_TA = 5,
-       OPAL_EEH_PCIEX_UR = 6,
-       OPAL_EEH_PCIEX_CA = 7,
-       OPAL_EEH_PCI_MMIO_ERROR = 8,
-       OPAL_EEH_PCI_DMA_ERROR = 9
+       OPAL_EEH_NO_ERROR       = 0,
+       OPAL_EEH_IOC_ERROR      = 1,
+       OPAL_EEH_PHB_ERROR      = 2,
+       OPAL_EEH_PE_ERROR       = 3,
+       OPAL_EEH_PE_MMIO_ERROR  = 4,
+       OPAL_EEH_PE_DMA_ERROR   = 5
 };
+
+enum OpalPciErrorSeverity {
+       OPAL_EEH_SEV_NO_ERROR   = 0,
+       OPAL_EEH_SEV_IOC_DEAD   = 1,
+       OPAL_EEH_SEV_PHB_DEAD   = 2,
+       OPAL_EEH_SEV_PHB_FENCED = 3,
+       OPAL_EEH_SEV_PE_ER      = 4,
+       OPAL_EEH_SEV_INF        = 5
+};
+
 enum OpalShpcAction {
        OPAL_SHPC_GET_LINK_STATE = 0,
        OPAL_SHPC_GET_SLOT_STATE = 1
 };
+
 enum OpalShpcLinkState {
        OPAL_SHPC_LINK_DOWN = 0,
        OPAL_SHPC_LINK_UP = 1
 };
+
 enum OpalMmioWindowType {
        OPAL_M32_WINDOW_TYPE = 1,
        OPAL_M64_WINDOW_TYPE = 2,
        OPAL_IO_WINDOW_TYPE = 3
 };
+
 enum OpalShpcSlotState {
        OPAL_SHPC_DEV_NOT_PRESENT = 0,
        OPAL_SHPC_DEV_PRESENT = 1
 };
+
 enum OpalExceptionHandler {
        OPAL_MACHINE_CHECK_HANDLER = 1,
        OPAL_HYPERVISOR_MAINTENANCE_HANDLER = 2,
        OPAL_SOFTPATCH_HANDLER = 3
 };
+
 enum OpalPendingState {
-       OPAL_EVENT_OPAL_INTERNAL = 0x1,
-       OPAL_EVENT_NVRAM = 0x2,
-       OPAL_EVENT_RTC = 0x4,
-       OPAL_EVENT_CONSOLE_OUTPUT = 0x8,
-       OPAL_EVENT_CONSOLE_INPUT = 0x10,
-       OPAL_EVENT_ERROR_LOG_AVAIL = 0x20,
-       OPAL_EVENT_ERROR_LOG = 0x40,
-       OPAL_EVENT_EPOW = 0x80,
-       OPAL_EVENT_LED_STATUS = 0x100
+       OPAL_EVENT_OPAL_INTERNAL        = 0x1,
+       OPAL_EVENT_NVRAM                = 0x2,
+       OPAL_EVENT_RTC                  = 0x4,
+       OPAL_EVENT_CONSOLE_OUTPUT       = 0x8,
+       OPAL_EVENT_CONSOLE_INPUT        = 0x10,
+       OPAL_EVENT_ERROR_LOG_AVAIL      = 0x20,
+       OPAL_EVENT_ERROR_LOG            = 0x40,
+       OPAL_EVENT_EPOW                 = 0x80,
+       OPAL_EVENT_LED_STATUS           = 0x100,
+       OPAL_EVENT_PCI_ERROR            = 0x200
 };
 
 /* Machine check related definitions */
@@ -364,15 +385,80 @@ struct opal_machine_check_event {
        } u;
 };
 
+enum {
+       OPAL_P7IOC_DIAG_TYPE_NONE       = 0,
+       OPAL_P7IOC_DIAG_TYPE_RGC        = 1,
+       OPAL_P7IOC_DIAG_TYPE_BI         = 2,
+       OPAL_P7IOC_DIAG_TYPE_CI         = 3,
+       OPAL_P7IOC_DIAG_TYPE_MISC       = 4,
+       OPAL_P7IOC_DIAG_TYPE_I2C        = 5,
+       OPAL_P7IOC_DIAG_TYPE_LAST       = 6
+};
+
+struct OpalIoP7IOCErrorData {
+       uint16_t type;
+
+       /* GEM */
+       uint64_t gemXfir;
+       uint64_t gemRfir;
+       uint64_t gemRirqfir;
+       uint64_t gemMask;
+       uint64_t gemRwof;
+
+       /* LEM */
+       uint64_t lemFir;
+       uint64_t lemErrMask;
+       uint64_t lemAction0;
+       uint64_t lemAction1;
+       uint64_t lemWof;
+
+       union {
+               struct OpalIoP7IOCRgcErrorData {
+                       uint64_t rgcStatus;             /* 3E1C10 */
+                       uint64_t rgcLdcp;               /* 3E1C18 */
+               }rgc;
+               struct OpalIoP7IOCBiErrorData {
+                       uint64_t biLdcp0;               /* 3C0100, 3C0118 */
+                       uint64_t biLdcp1;               /* 3C0108, 3C0120 */
+                       uint64_t biLdcp2;               /* 3C0110, 3C0128 */
+                       uint64_t biFenceStatus;         /* 3C0130, 3C0130 */
+
+                       uint8_t  biDownbound;           /* BI Downbound or Upbound */
+               }bi;
+               struct OpalIoP7IOCCiErrorData {
+                       uint64_t ciPortStatus;          /* 3Dn008 */
+                       uint64_t ciPortLdcp;            /* 3Dn010 */
+
+                       uint8_t  ciPort;                /* Index of CI port: 0/1 */
+               }ci;
+       };
+};
+
 /**
  * This structure defines the overlay which will be used to store PHB error
  * data upon request.
  */
+enum {
+       OPAL_PHB_ERROR_DATA_VERSION_1 = 1,
+};
+
+enum {
+       OPAL_PHB_ERROR_DATA_TYPE_P7IOC = 1,
+};
+
 enum {
        OPAL_P7IOC_NUM_PEST_REGS = 128,
 };
 
+struct OpalIoPhbErrorCommon {
+       uint32_t version;
+       uint32_t ioType;
+       uint32_t len;
+};
+
 struct OpalIoP7IOCPhbErrorData {
+       struct OpalIoPhbErrorCommon common;
+
        uint32_t brdgCtl;
 
        // P7IOC utl regs
@@ -530,14 +616,21 @@ int64_t opal_pci_map_pe_dma_window_real(uint64_t phb_id, uint16_t pe_number,
                                        uint64_t pci_mem_size);
 int64_t opal_pci_reset(uint64_t phb_id, uint8_t reset_scope, uint8_t assert_state);
 
-int64_t opal_pci_get_hub_diag_data(uint64_t hub_id, void *diag_buffer, uint64_t diag_buffer_len);
-int64_t opal_pci_get_phb_diag_data(uint64_t phb_id, void *diag_buffer, uint64_t diag_buffer_len);
+int64_t opal_pci_get_hub_diag_data(uint64_t hub_id, void *diag_buffer,
+                                  uint64_t diag_buffer_len);
+int64_t opal_pci_get_phb_diag_data(uint64_t phb_id, void *diag_buffer,
+                                  uint64_t diag_buffer_len);
+int64_t opal_pci_get_phb_diag_data2(uint64_t phb_id, void *diag_buffer,
+                                   uint64_t diag_buffer_len);
 int64_t opal_pci_fence_phb(uint64_t phb_id);
 int64_t opal_pci_reinit(uint64_t phb_id, uint8_t reinit_scope);
 int64_t opal_pci_mask_pe_error(uint64_t phb_id, uint16_t pe_number, uint8_t error_type, uint8_t mask_action);
 int64_t opal_set_slot_led_status(uint64_t phb_id, uint64_t slot_id, uint8_t led_type, uint8_t led_action);
 int64_t opal_get_epow_status(uint64_t *status);
 int64_t opal_set_system_attention_led(uint8_t led_action);
+int64_t opal_pci_next_error(uint64_t phb_id, uint64_t *first_frozen_pe,
+                           uint16_t *pci_error_type, uint16_t *severity);
+int64_t opal_pci_poll(uint64_t phb_id);
 
 /* Internal functions */
 extern int early_init_dt_scan_opal(unsigned long node, const char *uname, int depth, void *data);
@@ -551,6 +644,11 @@ extern void hvc_opal_init_early(void);
 extern int early_init_dt_scan_opal(unsigned long node, const char *uname,
                                   int depth, void *data);
 
+extern int opal_notifier_register(struct notifier_block *nb);
+extern void opal_notifier_enable(void);
+extern void opal_notifier_disable(void);
+extern void opal_notifier_update_evt(uint64_t evt_mask, uint64_t evt_val);
+
 extern int opal_get_chars(uint32_t vtermno, char *buf, int count);
 extern int opal_put_chars(uint32_t vtermno, const char *buf, int total_len);
 
index f265049dd7d6d464282e6e18cdd3fa66cc1234cf..2dd7bfc459bed73eba9f1be705acf44d7fb36c3d 100644 (file)
@@ -60,6 +60,7 @@ struct power_pmu {
 #define PPMU_HAS_SSLOT         0x00000020 /* Has sampled slot in MMCRA */
 #define PPMU_HAS_SIER          0x00000040 /* Has SIER */
 #define PPMU_BHRB              0x00000080 /* has BHRB feature enabled */
+#define PPMU_EBB               0x00000100 /* supports event based branch */
 
 /*
  * Values for flags to get_alternatives()
@@ -68,6 +69,11 @@ struct power_pmu {
 #define PPMU_LIMITED_PMC_REQD  2       /* have to put this on a limited PMC */
 #define PPMU_ONLY_COUNT_RUN    4       /* only counting in run state */
 
+/*
+ * We use the event config bit 63 as a flag to request EBB.
+ */
+#define EVENT_CONFIG_EBB_SHIFT 63
+
 extern int register_power_pmu(struct power_pmu *);
 
 struct pt_regs;
index b66ae722a8e9c2d9aacb9ef7eb49e553d9ad9dde..f65e27b09bd38010218b270ebcb27adb967c277b 100644 (file)
@@ -221,17 +221,17 @@ static inline void __pte_free_tlb(struct mmu_gather *tlb, pgtable_t table,
 
 static inline pmd_t *pmd_alloc_one(struct mm_struct *mm, unsigned long addr)
 {
-       return kmem_cache_alloc(PGT_CACHE(PMD_INDEX_SIZE),
+       return kmem_cache_alloc(PGT_CACHE(PMD_CACHE_INDEX),
                                GFP_KERNEL|__GFP_REPEAT);
 }
 
 static inline void pmd_free(struct mm_struct *mm, pmd_t *pmd)
 {
-       kmem_cache_free(PGT_CACHE(PMD_INDEX_SIZE), pmd);
+       kmem_cache_free(PGT_CACHE(PMD_CACHE_INDEX), pmd);
 }
 
 #define __pmd_free_tlb(tlb, pmd, addr)               \
-       pgtable_free_tlb(tlb, pmd, PMD_INDEX_SIZE)
+       pgtable_free_tlb(tlb, pmd, PMD_CACHE_INDEX)
 #ifndef CONFIG_PPC_64K_PAGES
 #define __pud_free_tlb(tlb, pud, addr)               \
        pgtable_free_tlb(tlb, pud, PUD_INDEX_SIZE)
index 45142d640720697452db397e05a922f96083939d..a56b82fb0609b86102606c1232c42c2791595992 100644 (file)
@@ -33,7 +33,8 @@
 #define PGDIR_MASK     (~(PGDIR_SIZE-1))
 
 /* Bits to mask out from a PMD to get to the PTE page */
-#define PMD_MASKED_BITS                0x1ff
+/* PMDs point to PTE table fragments which are 4K aligned.  */
+#define PMD_MASKED_BITS                0xfff
 /* Bits to mask out from a PGD/PUD to get to the PMD page */
 #define PUD_MASKED_BITS                0x1ff
 
index e3d55f6f24fe1828ae3ce7603770fcbd6c7470f1..46db09414a1063415fccc9ef74b9f4b664de02f3 100644 (file)
@@ -10,6 +10,7 @@
 #else
 #include <asm/pgtable-ppc64-4k.h>
 #endif
+#include <asm/barrier.h>
 
 #define FIRST_USER_ADDRESS     0
 
                            PUD_INDEX_SIZE + PGD_INDEX_SIZE + PAGE_SHIFT)
 #define PGTABLE_RANGE (ASM_CONST(1) << PGTABLE_EADDR_SIZE)
 
-
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+#define PMD_CACHE_INDEX        (PMD_INDEX_SIZE + 1)
+#else
+#define PMD_CACHE_INDEX        PMD_INDEX_SIZE
+#endif
 /*
  * Define the address range of the kernel non-linear virtual area
  */
 #define        pmd_present(pmd)        (pmd_val(pmd) != 0)
 #define        pmd_clear(pmdp)         (pmd_val(*(pmdp)) = 0)
 #define pmd_page_vaddr(pmd)    (pmd_val(pmd) & ~PMD_MASKED_BITS)
-#define pmd_page(pmd)          virt_to_page(pmd_page_vaddr(pmd))
+extern struct page *pmd_page(pmd_t pmd);
 
 #define pud_set(pudp, pudval)  (pud_val(*(pudp)) = (pudval))
 #define pud_none(pud)          (!pud_val(pud))
@@ -339,43 +344,217 @@ static inline void __ptep_set_access_flags(pte_t *ptep, pte_t entry)
 
 void pgtable_cache_add(unsigned shift, void (*ctor)(void *));
 void pgtable_cache_init(void);
+#endif /* __ASSEMBLY__ */
+
+/*
+ * THP pages can't be special. So use the _PAGE_SPECIAL
+ */
+#define _PAGE_SPLITTING _PAGE_SPECIAL
+
+/*
+ * We need to differentiate between explicit huge page and THP huge
+ * page, since THP huge page also need to track real subpage details
+ */
+#define _PAGE_THP_HUGE  _PAGE_4K_PFN
 
 /*
- * find_linux_pte returns the address of a linux pte for a given
- * effective address and directory.  If not found, it returns zero.
+ * set of bits not changed in pmd_modify.
  */
-static inline pte_t *find_linux_pte(pgd_t *pgdir, unsigned long ea)
+#define _HPAGE_CHG_MASK (PTE_RPN_MASK | _PAGE_HPTEFLAGS |              \
+                        _PAGE_DIRTY | _PAGE_ACCESSED | _PAGE_SPLITTING | \
+                        _PAGE_THP_HUGE)
+
+#ifndef __ASSEMBLY__
+/*
+ * The linux hugepage PMD now include the pmd entries followed by the address
+ * to the stashed pgtable_t. The stashed pgtable_t contains the hpte bits.
+ * [ 1 bit secondary | 3 bit hidx | 1 bit valid | 000]. We use one byte per
+ * each HPTE entry. With 16MB hugepage and 64K HPTE we need 256 entries and
+ * with 4K HPTE we need 4096 entries. Both will fit in a 4K pgtable_t.
+ *
+ * The last three bits are intentionally left to zero. This memory location
+ * are also used as normal page PTE pointers. So if we have any pointers
+ * left around while we collapse a hugepage, we need to make sure
+ * _PAGE_PRESENT and _PAGE_FILE bits of that are zero when we look at them
+ */
+static inline unsigned int hpte_valid(unsigned char *hpte_slot_array, int index)
 {
-       pgd_t *pg;
-       pud_t *pu;
-       pmd_t *pm;
-       pte_t *pt = NULL;
-
-       pg = pgdir + pgd_index(ea);
-       if (!pgd_none(*pg)) {
-               pu = pud_offset(pg, ea);
-               if (!pud_none(*pu)) {
-                       pm = pmd_offset(pu, ea);
-                       if (pmd_present(*pm))
-                               pt = pte_offset_kernel(pm, ea);
-               }
-       }
-       return pt;
+       return (hpte_slot_array[index] >> 3) & 0x1;
 }
 
-#ifdef CONFIG_HUGETLB_PAGE
-pte_t *find_linux_pte_or_hugepte(pgd_t *pgdir, unsigned long ea,
-                                unsigned *shift);
-#else
-static inline pte_t *find_linux_pte_or_hugepte(pgd_t *pgdir, unsigned long ea,
-                                              unsigned *shift)
+static inline unsigned int hpte_hash_index(unsigned char *hpte_slot_array,
+                                          int index)
 {
-       if (shift)
-               *shift = 0;
-       return find_linux_pte(pgdir, ea);
+       return hpte_slot_array[index] >> 4;
 }
-#endif /* !CONFIG_HUGETLB_PAGE */
 
-#endif /* __ASSEMBLY__ */
+static inline void mark_hpte_slot_valid(unsigned char *hpte_slot_array,
+                                       unsigned int index, unsigned int hidx)
+{
+       hpte_slot_array[index] = hidx << 4 | 0x1 << 3;
+}
 
+static inline char *get_hpte_slot_array(pmd_t *pmdp)
+{
+       /*
+        * The hpte hindex is stored in the pgtable whose address is in the
+        * second half of the PMD
+        *
+        * Order this load with the test for pmd_trans_huge in the caller
+        */
+       smp_rmb();
+       return *(char **)(pmdp + PTRS_PER_PMD);
+
+
+}
+
+extern void hpte_do_hugepage_flush(struct mm_struct *mm, unsigned long addr,
+                                  pmd_t *pmdp);
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+extern pmd_t pfn_pmd(unsigned long pfn, pgprot_t pgprot);
+extern pmd_t mk_pmd(struct page *page, pgprot_t pgprot);
+extern pmd_t pmd_modify(pmd_t pmd, pgprot_t newprot);
+extern void set_pmd_at(struct mm_struct *mm, unsigned long addr,
+                      pmd_t *pmdp, pmd_t pmd);
+extern void update_mmu_cache_pmd(struct vm_area_struct *vma, unsigned long addr,
+                                pmd_t *pmd);
+
+static inline int pmd_trans_huge(pmd_t pmd)
+{
+       /*
+        * leaf pte for huge page, bottom two bits != 00
+        */
+       return (pmd_val(pmd) & 0x3) && (pmd_val(pmd) & _PAGE_THP_HUGE);
+}
+
+static inline int pmd_large(pmd_t pmd)
+{
+       /*
+        * leaf pte for huge page, bottom two bits != 00
+        */
+       if (pmd_trans_huge(pmd))
+               return pmd_val(pmd) & _PAGE_PRESENT;
+       return 0;
+}
+
+static inline int pmd_trans_splitting(pmd_t pmd)
+{
+       if (pmd_trans_huge(pmd))
+               return pmd_val(pmd) & _PAGE_SPLITTING;
+       return 0;
+}
+
+extern int has_transparent_hugepage(void);
+#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
+
+static inline pte_t pmd_pte(pmd_t pmd)
+{
+       return __pte(pmd_val(pmd));
+}
+
+static inline pmd_t pte_pmd(pte_t pte)
+{
+       return __pmd(pte_val(pte));
+}
+
+static inline pte_t *pmdp_ptep(pmd_t *pmd)
+{
+       return (pte_t *)pmd;
+}
+
+#define pmd_pfn(pmd)           pte_pfn(pmd_pte(pmd))
+#define pmd_young(pmd)         pte_young(pmd_pte(pmd))
+#define pmd_mkold(pmd)         pte_pmd(pte_mkold(pmd_pte(pmd)))
+#define pmd_wrprotect(pmd)     pte_pmd(pte_wrprotect(pmd_pte(pmd)))
+#define pmd_mkdirty(pmd)       pte_pmd(pte_mkdirty(pmd_pte(pmd)))
+#define pmd_mkyoung(pmd)       pte_pmd(pte_mkyoung(pmd_pte(pmd)))
+#define pmd_mkwrite(pmd)       pte_pmd(pte_mkwrite(pmd_pte(pmd)))
+
+#define __HAVE_ARCH_PMD_WRITE
+#define pmd_write(pmd)         pte_write(pmd_pte(pmd))
+
+static inline pmd_t pmd_mkhuge(pmd_t pmd)
+{
+       /* Do nothing, mk_pmd() does this part.  */
+       return pmd;
+}
+
+static inline pmd_t pmd_mknotpresent(pmd_t pmd)
+{
+       pmd_val(pmd) &= ~_PAGE_PRESENT;
+       return pmd;
+}
+
+static inline pmd_t pmd_mksplitting(pmd_t pmd)
+{
+       pmd_val(pmd) |= _PAGE_SPLITTING;
+       return pmd;
+}
+
+#define __HAVE_ARCH_PMD_SAME
+static inline int pmd_same(pmd_t pmd_a, pmd_t pmd_b)
+{
+       return (((pmd_val(pmd_a) ^ pmd_val(pmd_b)) & ~_PAGE_HPTEFLAGS) == 0);
+}
+
+#define __HAVE_ARCH_PMDP_SET_ACCESS_FLAGS
+extern int pmdp_set_access_flags(struct vm_area_struct *vma,
+                                unsigned long address, pmd_t *pmdp,
+                                pmd_t entry, int dirty);
+
+extern unsigned long pmd_hugepage_update(struct mm_struct *mm,
+                                        unsigned long addr,
+                                        pmd_t *pmdp, unsigned long clr);
+
+static inline int __pmdp_test_and_clear_young(struct mm_struct *mm,
+                                             unsigned long addr, pmd_t *pmdp)
+{
+       unsigned long old;
+
+       if ((pmd_val(*pmdp) & (_PAGE_ACCESSED | _PAGE_HASHPTE)) == 0)
+               return 0;
+       old = pmd_hugepage_update(mm, addr, pmdp, _PAGE_ACCESSED);
+       return ((old & _PAGE_ACCESSED) != 0);
+}
+
+#define __HAVE_ARCH_PMDP_TEST_AND_CLEAR_YOUNG
+extern int pmdp_test_and_clear_young(struct vm_area_struct *vma,
+                                    unsigned long address, pmd_t *pmdp);
+#define __HAVE_ARCH_PMDP_CLEAR_YOUNG_FLUSH
+extern int pmdp_clear_flush_young(struct vm_area_struct *vma,
+                                 unsigned long address, pmd_t *pmdp);
+
+#define __HAVE_ARCH_PMDP_GET_AND_CLEAR
+extern pmd_t pmdp_get_and_clear(struct mm_struct *mm,
+                               unsigned long addr, pmd_t *pmdp);
+
+#define __HAVE_ARCH_PMDP_CLEAR_FLUSH
+extern pmd_t pmdp_clear_flush(struct vm_area_struct *vma, unsigned long address,
+                             pmd_t *pmdp);
+
+#define __HAVE_ARCH_PMDP_SET_WRPROTECT
+static inline void pmdp_set_wrprotect(struct mm_struct *mm, unsigned long addr,
+                                     pmd_t *pmdp)
+{
+
+       if ((pmd_val(*pmdp) & _PAGE_RW) == 0)
+               return;
+
+       pmd_hugepage_update(mm, addr, pmdp, _PAGE_RW);
+}
+
+#define __HAVE_ARCH_PMDP_SPLITTING_FLUSH
+extern void pmdp_splitting_flush(struct vm_area_struct *vma,
+                                unsigned long address, pmd_t *pmdp);
+
+#define __HAVE_ARCH_PGTABLE_DEPOSIT
+extern void pgtable_trans_huge_deposit(struct mm_struct *mm, pmd_t *pmdp,
+                                      pgtable_t pgtable);
+#define __HAVE_ARCH_PGTABLE_WITHDRAW
+extern pgtable_t pgtable_trans_huge_withdraw(struct mm_struct *mm, pmd_t *pmdp);
+
+#define __HAVE_ARCH_PMDP_INVALIDATE
+extern void pmdp_invalidate(struct vm_area_struct *vma, unsigned long address,
+                           pmd_t *pmdp);
+#endif /* __ASSEMBLY__ */
 #endif /* _ASM_POWERPC_PGTABLE_PPC64_H_ */
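Editorial note: the transparent-hugepage helpers above encode the huge-page state in the low PMD bits (_PAGE_THP_HUGE, _PAGE_SPLITTING, _PAGE_PRESENT). As an illustrative sketch only, not part of the patch, this is roughly how a page-table walker could consult them; the walker function and its return convention are invented for the example.

/* Editorial sketch (not from the patch): inspect a PMD with the helpers
 * defined above.
 */
static int example_inspect_pmd(pmd_t *pmdp)
{
	pmd_t pmd = *pmdp;

	if (pmd_trans_huge(pmd)) {
		if (pmd_trans_splitting(pmd))
			return 0;	/* huge page is being split; treat as busy */
		return pmd_young(pmd) ? 1 : 0;	/* referenced huge page? */
	}

	return -1;	/* not a transparent huge page; descend to the PTE level */
}
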
index 7aeb9555f6eac40a69cfb11bddf924b221add6ee..959d575c37dd9bde980c702f2d111036dc6a2513 100644 (file)
@@ -220,6 +220,12 @@ extern int gup_hugepd(hugepd_t *hugepd, unsigned pdshift, unsigned long addr,
 
 extern int gup_hugepte(pte_t *ptep, unsigned long sz, unsigned long addr,
                       unsigned long end, int write, struct page **pages, int *nr);
+#ifndef CONFIG_TRANSPARENT_HUGEPAGE
+#define pmd_large(pmd)         0
+#define has_transparent_hugepage() 0
+#endif
+pte_t *find_linux_pte_or_hugepte(pgd_t *pgdir, unsigned long ea,
+                                unsigned *shift);
 #endif /* __ASSEMBLY__ */
 
 #endif /* __KERNEL__ */
index 5f1e15b68704e6b001e7395b57e6014992888abb..3421637cfd7b19f049a4fc7f2a41353a94c7d548 100644 (file)
@@ -38,5 +38,30 @@ typedef u32 ppc_opcode_t;
 #define is_trap(instr)         (IS_TW(instr) || IS_TWI(instr))
 #endif /* CONFIG_PPC64 */
 
+#ifdef CONFIG_PPC_ADV_DEBUG_REGS
+#define MSR_SINGLESTEP (MSR_DE)
+#else
+#define MSR_SINGLESTEP (MSR_SE)
+#endif
+
+/* Enable single stepping for the current task */
+static inline void enable_single_step(struct pt_regs *regs)
+{
+       regs->msr |= MSR_SINGLESTEP;
+#ifdef CONFIG_PPC_ADV_DEBUG_REGS
+       /*
+        * We turn off Critical Input Exception (CE) to ensure that the single
+        * step will be for the instruction we have the probe on; if we don't,
+        * it is possible we'd get the single step reported for CE.
+        */
+       regs->msr &= ~MSR_CE;
+       mtspr(SPRN_DBCR0, mfspr(SPRN_DBCR0) | DBCR0_IC | DBCR0_IDM);
+#ifdef CONFIG_PPC_47x
+       isync();
+#endif
+#endif
+}
+
+
 #endif /* __KERNEL__ */
 #endif /* _ASM_POWERPC_PROBES_H */
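Editorial note: enable_single_step() arms either MSR_SE or, on CONFIG_PPC_ADV_DEBUG_REGS cores, MSR_DE plus DBCR0_IC, so that exactly one instruction runs before trapping back. A hedged usage sketch follows; the handler function and the next_insn parameter are assumptions for the example, not taken from this patch.

/* Editorial sketch (not from the patch): a probe handler redirects
 * execution to a copied-out instruction and requests a single step.
 */
static void example_arm_single_step(struct pt_regs *regs,
				    unsigned long next_insn)
{
	regs->nip = next_insn;		/* resume at the instruction copy */
	enable_single_step(regs);	/* trap after exactly one instruction */
}
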
index 14a658363698ee1d58e0b400b33b3bbc13475dda..47a35b08b9635ce375e266a6c12fd3b02869302b 100644 (file)
@@ -168,10 +168,10 @@ struct thread_struct {
         * The following help to manage the use of Debug Control Registers
         * om the BookE platforms.
         */
-       unsigned long   dbcr0;
-       unsigned long   dbcr1;
+       uint32_t        dbcr0;
+       uint32_t        dbcr1;
 #ifdef CONFIG_BOOKE
-       unsigned long   dbcr2;
+       uint32_t        dbcr2;
 #endif
        /*
         * The stored value of the DBSR register will be the value at the
@@ -179,7 +179,7 @@ struct thread_struct {
         * user (will never be written to) and has value while helping to
         * describe the reason for the last debug trap.  Torez
         */
-       unsigned long   dbsr;
+       uint32_t        dbsr;
        /*
         * The following will contain addresses used by debug applications
         * to help trace and trap on particular address locations.
@@ -200,7 +200,7 @@ struct thread_struct {
 #endif
 #endif
        /* FP and VSX 0-31 register set */
-       double          fpr[32][TS_FPRWIDTH];
+       double          fpr[32][TS_FPRWIDTH] __attribute__((aligned(16)));
        struct {
 
                unsigned int pad;
@@ -287,9 +287,9 @@ struct thread_struct {
        unsigned long   siar;
        unsigned long   sdar;
        unsigned long   sier;
-       unsigned long   mmcr0;
        unsigned long   mmcr2;
-       unsigned long   mmcra;
+       unsigned        mmcr0;
+       unsigned        used_ebb;
 #endif
 };
 
@@ -404,9 +404,7 @@ static inline void prefetchw(const void *x)
 
 #define spin_lock_prefetch(x)  prefetchw(x)
 
-#ifdef CONFIG_PPC64
 #define HAVE_ARCH_PICK_MMAP_LAYOUT
-#endif
 
 #ifdef CONFIG_PPC64
 static inline unsigned long get_clean_sp(unsigned long sp, int is_32)
index 4a9e408644fe6ae0403d49821d48516092e51640..5d7d9c2a547373dbb692ebded8b519df476bb2e4 100644 (file)
 #define   MMCR0_PMXE   0x04000000UL /* performance monitor exception enable */
 #define   MMCR0_FCECE  0x02000000UL /* freeze ctrs on enabled cond or event */
 #define   MMCR0_TBEE   0x00400000UL /* time base exception enable */
+#define   MMCR0_EBE    0x00100000UL /* Event based branch enable */
+#define   MMCR0_PMCC   0x000c0000UL /* PMC control */
+#define   MMCR0_PMCC_U6        0x00080000UL /* PMC1-6 are R/W by user (PR) */
 #define   MMCR0_PMC1CE 0x00008000UL /* PMC1 count enable*/
 #define   MMCR0_PMCjCE 0x00004000UL /* PMCj count enable*/
 #define   MMCR0_TRIGGER        0x00002000UL /* TRIGGER enable */
 #define   MMCR0_PMAO   0x00000080UL /* performance monitor alert has occurred, set to 0 after handling exception */
 #define   MMCR0_SHRFC  0x00000040UL /* SHRre freeze conditions between threads */
+#define   MMCR0_FC56   0x00000010UL /* freeze counters 5 and 6 */
 #define   MMCR0_FCTI   0x00000008UL /* freeze counters in tags inactive mode */
 #define   MMCR0_FCTA   0x00000004UL /* freeze counters in tags active mode */
 #define   MMCR0_FCWAIT 0x00000002UL /* freeze counter in WAIT state */
 #define   SIER_SIAR_VALID      0x0400000       /* SIAR contents valid */
 #define   SIER_SDAR_VALID      0x0200000       /* SDAR contents valid */
 
+/* When EBB is enabled, some of MMCR0/MMCR2/SIER are user accessible */
+#define MMCR0_USER_MASK        (MMCR0_FC | MMCR0_PMXE | MMCR0_PMAO)
+#define MMCR2_USER_MASK        0x4020100804020000UL /* (FC1P|FC2P|FC3P|FC4P|FC5P|FC6P) */
+#define SIER_USER_MASK 0x7fffffUL
+
 #define SPRN_PA6T_MMCR0 795
 #define   PA6T_MMCR0_EN0       0x0000000000000001UL
 #define   PA6T_MMCR0_EN1       0x0000000000000002UL
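Editorial note: the new *_USER_MASK constants describe which MMCR0/MMCR2/SIER bits userspace may touch once event-based branches (EBB) are enabled. A hedged sketch of how such a mask is typically applied, merging a task's saved value over the privileged bits; the helper is invented for illustration and is not part of the patch.

/* Editorial sketch (not from the patch): keep the privileged MMCR0 bits
 * from the current hardware value and take only the user-controllable
 * bits from the task's saved copy.
 */
static inline unsigned long example_merge_user_mmcr0(unsigned long hw_val,
						     unsigned long task_val)
{
	return (hw_val & ~MMCR0_USER_MASK) | (task_val & MMCR0_USER_MASK);
}
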
index 34fd70488d83f09ebb947680763ddbcf4ce62fad..c7a8bfc9f6f5e4c3551fef8dc5e3e69d045153e6 100644 (file)
@@ -350,8 +350,8 @@ static inline u32 rtas_config_addr(int busno, int devfn, int reg)
                        (devfn << 8) | (reg & 0xff);
 }
 
-extern void __cpuinit rtas_give_timebase(void);
-extern void __cpuinit rtas_take_timebase(void);
+extern void rtas_give_timebase(void);
+extern void rtas_take_timebase(void);
 
 #ifdef CONFIG_PPC_RTAS
 static inline int page_is_rtas_user_buf(unsigned long pfn)
index 200d763a0a6708b16674eaa92df29a3fcf57fce9..49a13e0ef2344fb83c6dd89ed4ab8fd179d34c01 100644 (file)
@@ -67,4 +67,18 @@ static inline void flush_spe_to_thread(struct task_struct *t)
 }
 #endif
 
+static inline void clear_task_ebb(struct task_struct *t)
+{
+#ifdef CONFIG_PPC_BOOK3S_64
+    /* EBB perf events are not inherited, so clear all EBB state. */
+    t->thread.bescr = 0;
+    t->thread.mmcr2 = 0;
+    t->thread.mmcr0 = 0;
+    t->thread.siar = 0;
+    t->thread.sdar = 0;
+    t->thread.sier = 0;
+    t->thread.used_ebb = 0;
+#endif
+}
+
 #endif /* _ASM_POWERPC_SWITCH_TO_H */
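Editorial note: clear_task_ebb() zeroes the per-task EBB and PMU registers because EBB perf events are not inherited across fork. A hedged sketch of the intended call site; the surrounding function is an assumption, not shown in this hunk.

/* Editorial sketch (not from the patch): a child task starts with a
 * clean EBB state right after its thread struct has been copied.
 */
static void example_init_child_ebb(struct task_struct *child)
{
	clear_task_ebb(child);
}
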
index 61a59271665b4b7e1b9d9a794eb67680c2ed1107..2def01ed0cb296ad48296366b4eb8a8dfb9e0da0 100644 (file)
@@ -165,7 +165,8 @@ static inline void flush_tlb_kernel_range(unsigned long start,
 /* Private function for use by PCI IO mapping code */
 extern void __flush_hash_table_range(struct mm_struct *mm, unsigned long start,
                                     unsigned long end);
-
+extern void flush_tlb_pmd_range(struct mm_struct *mm, pmd_t *pmd,
+                               unsigned long addr);
 #else
 #error Unsupported MMU type
 #endif
index 50f261bc3e9595a1ad972a63af1f14073710c105..0d9cecddf8a4f5ae133415f5e0bd49d780318e01 100644 (file)
@@ -22,7 +22,7 @@ extern unsigned long vdso64_rt_sigtramp;
 extern unsigned long vdso32_sigtramp;
 extern unsigned long vdso32_rt_sigtramp;
 
-int __cpuinit vdso_getcpu_init(void);
+int vdso_getcpu_init(void);
 
 #else /* __ASSEMBLY__ */
 
index f960a7944553a2ca702eb7713cf13639564ffcfe..a8619bfe879e0168e5dbaca852ccfae3c176b907 100644 (file)
@@ -58,6 +58,8 @@ obj-$(CONFIG_RTAS_PROC)               += rtas-proc.o
 obj-$(CONFIG_LPARCFG)          += lparcfg.o
 obj-$(CONFIG_IBMVIO)           += vio.o
 obj-$(CONFIG_IBMEBUS)           += ibmebus.o
+obj-$(CONFIG_EEH)              += eeh.o eeh_pe.o eeh_dev.o eeh_cache.o \
+                                 eeh_driver.o eeh_event.o eeh_sysfs.o
 obj-$(CONFIG_GENERIC_TBSYNC)   += smp-tbsync.o
 obj-$(CONFIG_CRASH_DUMP)       += crash_dump.o
 obj-$(CONFIG_FA_DUMP)          += fadump.o
@@ -100,7 +102,7 @@ obj-$(CONFIG_PPC_UDBG_16550)        += legacy_serial.o udbg_16550.o
 obj-$(CONFIG_STACKTRACE)       += stacktrace.o
 obj-$(CONFIG_SWIOTLB)          += dma-swiotlb.o
 
-pci64-$(CONFIG_PPC64)          += pci_dn.o isa-bridge.o
+pci64-$(CONFIG_PPC64)          += pci_dn.o pci-hotplug.o isa-bridge.o
 obj-$(CONFIG_PCI)              += pci_$(CONFIG_WORD_SIZE).o $(pci64-y) \
                                   pci-common.o pci_of_scan.o
 obj-$(CONFIG_PCI_MSI)          += msi.o
index 6f16ffafa6f01542d54ccc684c3f8d5d7d6add5c..c7e8afc2ead0cde5d75b68244990092a0dea5bc3 100644 (file)
@@ -105,9 +105,6 @@ int main(void)
        DEFINE(KSP_VSID, offsetof(struct thread_struct, ksp_vsid));
 #else /* CONFIG_PPC64 */
        DEFINE(PGDIR, offsetof(struct thread_struct, pgdir));
-#if defined(CONFIG_4xx) || defined(CONFIG_BOOKE)
-       DEFINE(THREAD_DBCR0, offsetof(struct thread_struct, dbcr0));
-#endif
 #ifdef CONFIG_SPE
        DEFINE(THREAD_EVR0, offsetof(struct thread_struct, evr[0]));
        DEFINE(THREAD_ACC, offsetof(struct thread_struct, acc));
@@ -115,6 +112,9 @@ int main(void)
        DEFINE(THREAD_USED_SPE, offsetof(struct thread_struct, used_spe));
 #endif /* CONFIG_SPE */
 #endif /* CONFIG_PPC64 */
+#if defined(CONFIG_4xx) || defined(CONFIG_BOOKE)
+       DEFINE(THREAD_DBCR0, offsetof(struct thread_struct, dbcr0));
+#endif
 #ifdef CONFIG_KVM_BOOK3S_32_HANDLER
        DEFINE(THREAD_KVM_SVCPU, offsetof(struct thread_struct, kvm_shadow_vcpu));
 #endif
@@ -132,7 +132,6 @@ int main(void)
        DEFINE(THREAD_SIER, offsetof(struct thread_struct, sier));
        DEFINE(THREAD_MMCR0, offsetof(struct thread_struct, mmcr0));
        DEFINE(THREAD_MMCR2, offsetof(struct thread_struct, mmcr2));
-       DEFINE(THREAD_MMCRA, offsetof(struct thread_struct, mmcra));
 #endif
 #ifdef CONFIG_PPC_TRANSACTIONAL_MEM
        DEFINE(PACATMSCRATCH, offsetof(struct paca_struct, tm_scratch));
index 92c6b008dd2b6ea28514318b5d51edf8423c96db..9262cf2bec4bd6e1e2edd7d91d476e4b900daeb3 100644 (file)
@@ -131,7 +131,8 @@ static const char *cache_type_string(const struct cache *cache)
        return cache_type_info[cache->type].name;
 }
 
-static void __cpuinit cache_init(struct cache *cache, int type, int level, struct device_node *ofnode)
+static void cache_init(struct cache *cache, int type, int level,
+                      struct device_node *ofnode)
 {
        cache->type = type;
        cache->level = level;
@@ -140,7 +141,7 @@ static void __cpuinit cache_init(struct cache *cache, int type, int level, struc
        list_add(&cache->list, &cache_list);
 }
 
-static struct cache *__cpuinit new_cache(int type, int level, struct device_node *ofnode)
+static struct cache *new_cache(int type, int level, struct device_node *ofnode)
 {
        struct cache *cache;
 
@@ -324,7 +325,8 @@ static bool cache_node_is_unified(const struct device_node *np)
        return of_get_property(np, "cache-unified", NULL);
 }
 
-static struct cache *__cpuinit cache_do_one_devnode_unified(struct device_node *node, int level)
+static struct cache *cache_do_one_devnode_unified(struct device_node *node,
+                                                 int level)
 {
        struct cache *cache;
 
@@ -335,7 +337,8 @@ static struct cache *__cpuinit cache_do_one_devnode_unified(struct device_node *
        return cache;
 }
 
-static struct cache *__cpuinit cache_do_one_devnode_split(struct device_node *node, int level)
+static struct cache *cache_do_one_devnode_split(struct device_node *node,
+                                               int level)
 {
        struct cache *dcache, *icache;
 
@@ -357,7 +360,7 @@ err:
        return NULL;
 }
 
-static struct cache *__cpuinit cache_do_one_devnode(struct device_node *node, int level)
+static struct cache *cache_do_one_devnode(struct device_node *node, int level)
 {
        struct cache *cache;
 
@@ -369,7 +372,8 @@ static struct cache *__cpuinit cache_do_one_devnode(struct device_node *node, in
        return cache;
 }
 
-static struct cache *__cpuinit cache_lookup_or_instantiate(struct device_node *node, int level)
+static struct cache *cache_lookup_or_instantiate(struct device_node *node,
+                                                int level)
 {
        struct cache *cache;
 
@@ -385,7 +389,7 @@ static struct cache *__cpuinit cache_lookup_or_instantiate(struct device_node *n
        return cache;
 }
 
-static void __cpuinit link_cache_lists(struct cache *smaller, struct cache *bigger)
+static void link_cache_lists(struct cache *smaller, struct cache *bigger)
 {
        while (smaller->next_local) {
                if (smaller->next_local == bigger)
@@ -396,13 +400,13 @@ static void __cpuinit link_cache_lists(struct cache *smaller, struct cache *bigg
        smaller->next_local = bigger;
 }
 
-static void __cpuinit do_subsidiary_caches_debugcheck(struct cache *cache)
+static void do_subsidiary_caches_debugcheck(struct cache *cache)
 {
        WARN_ON_ONCE(cache->level != 1);
        WARN_ON_ONCE(strcmp(cache->ofnode->type, "cpu"));
 }
 
-static void __cpuinit do_subsidiary_caches(struct cache *cache)
+static void do_subsidiary_caches(struct cache *cache)
 {
        struct device_node *subcache_node;
        int level = cache->level;
@@ -423,7 +427,7 @@ static void __cpuinit do_subsidiary_caches(struct cache *cache)
        }
 }
 
-static struct cache *__cpuinit cache_chain_instantiate(unsigned int cpu_id)
+static struct cache *cache_chain_instantiate(unsigned int cpu_id)
 {
        struct device_node *cpu_node;
        struct cache *cpu_cache = NULL;
@@ -448,7 +452,7 @@ out:
        return cpu_cache;
 }
 
-static struct cache_dir *__cpuinit cacheinfo_create_cache_dir(unsigned int cpu_id)
+static struct cache_dir *cacheinfo_create_cache_dir(unsigned int cpu_id)
 {
        struct cache_dir *cache_dir;
        struct device *dev;
@@ -653,7 +657,7 @@ static struct kobj_type cache_index_type = {
        .default_attrs = cache_index_default_attrs,
 };
 
-static void __cpuinit cacheinfo_create_index_opt_attrs(struct cache_index_dir *dir)
+static void cacheinfo_create_index_opt_attrs(struct cache_index_dir *dir)
 {
        const char *cache_name;
        const char *cache_type;
@@ -696,7 +700,8 @@ static void __cpuinit cacheinfo_create_index_opt_attrs(struct cache_index_dir *d
        kfree(buf);
 }
 
-static void __cpuinit cacheinfo_create_index_dir(struct cache *cache, int index, struct cache_dir *cache_dir)
+static void cacheinfo_create_index_dir(struct cache *cache, int index,
+                                      struct cache_dir *cache_dir)
 {
        struct cache_index_dir *index_dir;
        int rc;
@@ -722,7 +727,8 @@ err:
        kfree(index_dir);
 }
 
-static void __cpuinit cacheinfo_sysfs_populate(unsigned int cpu_id, struct cache *cache_list)
+static void cacheinfo_sysfs_populate(unsigned int cpu_id,
+                                    struct cache *cache_list)
 {
        struct cache_dir *cache_dir;
        struct cache *cache;
@@ -740,7 +746,7 @@ static void __cpuinit cacheinfo_sysfs_populate(unsigned int cpu_id, struct cache
        }
 }
 
-void __cpuinit cacheinfo_cpu_online(unsigned int cpu_id)
+void cacheinfo_cpu_online(unsigned int cpu_id)
 {
        struct cache *cache;
 
similarity index 86%
rename from arch/powerpc/platforms/pseries/eeh.c
rename to arch/powerpc/kernel/eeh.c
index 6b73d6c44f51dfb5125862e9b48852ea92af05a2..39954fe941b87e45a95951f9c39bb2beae6724db 100644 (file)
@@ -103,11 +103,8 @@ EXPORT_SYMBOL(eeh_subsystem_enabled);
  */
 int eeh_probe_mode;
 
-/* Global EEH mutex */
-DEFINE_MUTEX(eeh_mutex);
-
 /* Lock to avoid races due to multiple reports of an error */
-static DEFINE_RAW_SPINLOCK(confirm_error_lock);
+DEFINE_RAW_SPINLOCK(confirm_error_lock);
 
 /* Buffer for reporting pci register dumps. Its here in BSS, and
  * not dynamically alloced, so that it ends up in RMO where RTAS
@@ -235,16 +232,30 @@ void eeh_slot_error_detail(struct eeh_pe *pe, int severity)
 {
        size_t loglen = 0;
        struct eeh_dev *edev;
+       bool valid_cfg_log = true;
 
-       eeh_pci_enable(pe, EEH_OPT_THAW_MMIO);
-       eeh_ops->configure_bridge(pe);
-       eeh_pe_restore_bars(pe);
-
-       pci_regs_buf[0] = 0;
-       eeh_pe_for_each_dev(pe, edev) {
-               loglen += eeh_gather_pci_data(edev, pci_regs_buf,
-                               EEH_PCI_REGS_LOG_LEN);
-        }
+       /*
+        * When the PHB is fenced or dead, it's pointless to collect
+        * the data from PCI config space because it should return
+        * 0xFF's. For ER, we still retrieve the data from the PCI
+        * config space.
+        */
+       if (eeh_probe_mode_dev() &&
+           (pe->type & EEH_PE_PHB) &&
+           (pe->state & (EEH_PE_ISOLATED | EEH_PE_PHB_DEAD)))
+               valid_cfg_log = false;
+
+       if (valid_cfg_log) {
+               eeh_pci_enable(pe, EEH_OPT_THAW_MMIO);
+               eeh_ops->configure_bridge(pe);
+               eeh_pe_restore_bars(pe);
+
+               pci_regs_buf[0] = 0;
+               eeh_pe_for_each_dev(pe, edev) {
+                       loglen += eeh_gather_pci_data(edev, pci_regs_buf + loglen,
+                                                     EEH_PCI_REGS_LOG_LEN - loglen);
+               }
+       }
 
        eeh_ops->get_log(pe, severity, pci_regs_buf, loglen);
 }
@@ -260,15 +271,74 @@ static inline unsigned long eeh_token_to_phys(unsigned long token)
 {
        pte_t *ptep;
        unsigned long pa;
+       int hugepage_shift;
 
-       ptep = find_linux_pte(init_mm.pgd, token);
+       /*
+        * We won't find hugepages here; this is iomem.
+        */
+       ptep = find_linux_pte_or_hugepte(init_mm.pgd, token, &hugepage_shift);
        if (!ptep)
                return token;
+       WARN_ON(hugepage_shift);
        pa = pte_pfn(*ptep) << PAGE_SHIFT;
 
        return pa | (token & (PAGE_SIZE-1));
 }
 
+/*
+ * On the PowerNV platform, we might already have a fenced PHB there.
+ * In that case, it's meaningless to recover the frozen PE. Instead,
+ * we have to handle the fenced PHB first.
+ */
+static int eeh_phb_check_failure(struct eeh_pe *pe)
+{
+       struct eeh_pe *phb_pe;
+       unsigned long flags;
+       int ret;
+
+       if (!eeh_probe_mode_dev())
+               return -EPERM;
+
+       /* Find the PHB PE */
+       phb_pe = eeh_phb_pe_get(pe->phb);
+       if (!phb_pe) {
+               pr_warning("%s Can't find PE for PHB#%d\n",
+                          __func__, pe->phb->global_number);
+               return -EEXIST;
+       }
+
+       /* If the PHB has been in problematic state */
+       eeh_serialize_lock(&flags);
+       if (phb_pe->state & (EEH_PE_ISOLATED | EEH_PE_PHB_DEAD)) {
+               ret = 0;
+               goto out;
+       }
+
+       /* Check PHB state */
+       ret = eeh_ops->get_state(phb_pe, NULL);
+       if ((ret < 0) ||
+           (ret == EEH_STATE_NOT_SUPPORT) ||
+           (ret & (EEH_STATE_MMIO_ACTIVE | EEH_STATE_DMA_ACTIVE)) ==
+           (EEH_STATE_MMIO_ACTIVE | EEH_STATE_DMA_ACTIVE)) {
+               ret = 0;
+               goto out;
+       }
+
+       /* Isolate the PHB and send event */
+       eeh_pe_state_mark(phb_pe, EEH_PE_ISOLATED);
+       eeh_serialize_unlock(flags);
+       eeh_send_failure_event(phb_pe);
+
+       pr_err("EEH: PHB#%x failure detected\n",
+               phb_pe->phb->global_number);
+       dump_stack();
+
+       return 1;
+out:
+       eeh_serialize_unlock(flags);
+       return ret;
+}
+
 /**
  * eeh_dev_check_failure - Check if all 1's data is due to EEH slot freeze
  * @edev: eeh device
@@ -319,13 +389,21 @@ int eeh_dev_check_failure(struct eeh_dev *edev)
                return 0;
        }
 
+       /*
+        * On the PowerNV platform, we might already have a fenced PHB
+        * there, and we need to take care of that first.
+        */
+       ret = eeh_phb_check_failure(pe);
+       if (ret > 0)
+               return ret;
+
        /* If we already have a pending isolation event for this
         * slot, we know it's bad already, we don't need to check.
         * Do this checking under a lock; as multiple PCI devices
         * in one slot might report errors simultaneously, and we
         * only want one error recovery routine running.
         */
-       raw_spin_lock_irqsave(&confirm_error_lock, flags);
+       eeh_serialize_lock(&flags);
        rc = 1;
        if (pe->state & EEH_PE_ISOLATED) {
                pe->check_count++;
@@ -368,13 +446,13 @@ int eeh_dev_check_failure(struct eeh_dev *edev)
        }
 
        eeh_stats.slot_resets++;
+
        /* Avoid repeated reports of this failure, including problems
         * with other functions on this device, and functions under
         * bridges.
         */
        eeh_pe_state_mark(pe, EEH_PE_ISOLATED);
-       raw_spin_unlock_irqrestore(&confirm_error_lock, flags);
+       eeh_serialize_unlock(flags);
 
        eeh_send_failure_event(pe);
 
@@ -382,11 +460,14 @@ int eeh_dev_check_failure(struct eeh_dev *edev)
         * a stack trace will help the device-driver authors figure
         * out what happened.  So print that out.
         */
-       WARN(1, "EEH: failure detected\n");
+       pr_err("EEH: Frozen PE#%x detected on PHB#%x\n",
+               pe->addr, pe->phb->global_number);
+       dump_stack();
+
        return 1;
 
 dn_unlock:
-       raw_spin_unlock_irqrestore(&confirm_error_lock, flags);
+       eeh_serialize_unlock(flags);
        return rc;
 }
 
@@ -525,7 +606,7 @@ static void eeh_reset_pe_once(struct eeh_pe *pe)
         * or a fundamental reset (3).
         * A fundamental reset required by any device under
         * Partitionable Endpoint trumps hot-reset.
-        */
+        */
        eeh_pe_dev_traverse(pe, eeh_set_dev_freset, &freset);
 
        if (freset)
@@ -538,8 +619,8 @@ static void eeh_reset_pe_once(struct eeh_pe *pe)
         */
 #define PCI_BUS_RST_HOLD_TIME_MSEC 250
        msleep(PCI_BUS_RST_HOLD_TIME_MSEC);
-       
-       /* We might get hit with another EEH freeze as soon as the 
+
+       /* We might get hit with another EEH freeze as soon as the
         * pci slot reset line is dropped. Make sure we don't miss
         * these, and clear the flag now.
         */
@@ -565,6 +646,7 @@ static void eeh_reset_pe_once(struct eeh_pe *pe)
  */
 int eeh_reset_pe(struct eeh_pe *pe)
 {
+       int flags = (EEH_STATE_MMIO_ACTIVE | EEH_STATE_DMA_ACTIVE);
        int i, rc;
 
        /* Take three shots at resetting the bus */
@@ -572,7 +654,7 @@ int eeh_reset_pe(struct eeh_pe *pe)
                eeh_reset_pe_once(pe);
 
                rc = eeh_ops->wait_state(pe, PCI_BUS_RESET_WAIT_MSEC);
-               if (rc == (EEH_STATE_MMIO_ACTIVE | EEH_STATE_DMA_ACTIVE))
+               if ((rc & flags) == flags)
                        return 0;
 
                if (rc < 0) {
@@ -604,7 +686,7 @@ void eeh_save_bars(struct eeh_dev *edev)
        if (!edev)
                return;
        dn = eeh_dev_to_of_node(edev);
-       
+
        for (i = 0; i < 16; i++)
                eeh_ops->read_config(dn, i * 4, 4, &edev->config_space[i]);
 }
@@ -674,11 +756,21 @@ int __exit eeh_ops_unregister(const char *name)
  * Even if force-off is set, the EEH hardware is still enabled, so that
  * newer systems can boot.
  */
-static int __init eeh_init(void)
+int eeh_init(void)
 {
        struct pci_controller *hose, *tmp;
        struct device_node *phb;
-       int ret;
+       static int cnt = 0;
+       int ret = 0;
+
+       /*
+        * We have to delay the initialization on PowerNV until after
+        * the PCI hierarchy tree has been built, because the PEs
+        * are figured out from PCI devices instead of device
+        * tree nodes.
+        */
+       if (machine_is(powernv) && cnt++ <= 0)
+               return ret;
 
        /* call platform initialization function */
        if (!eeh_ops) {
@@ -691,7 +783,10 @@ static int __init eeh_init(void)
                return ret;
        }
 
-       raw_spin_lock_init(&confirm_error_lock);
+       /* Initialize EEH event */
+       ret = eeh_event_init();
+       if (ret)
+               return ret;
 
        /* Enable EEH for all adapters */
        if (eeh_probe_mode_devtree()) {
@@ -700,6 +795,25 @@ static int __init eeh_init(void)
                        phb = hose->dn;
                        traverse_pci_devices(phb, eeh_ops->of_probe, NULL);
                }
+       } else if (eeh_probe_mode_dev()) {
+               list_for_each_entry_safe(hose, tmp,
+                       &hose_list, list_node)
+                       pci_walk_bus(hose->bus, eeh_ops->dev_probe, NULL);
+       } else {
+               pr_warning("%s: Invalid probe mode %d\n",
+                          __func__, eeh_probe_mode);
+               return -EINVAL;
+       }
+
+       /*
+        * Call the platform post-initialization hook. This is a good
+        * chance to inform the platform that EEH is ready to provide
+        * service once the I/O cache has been built up.
+        */
+       if (eeh_ops->post_init) {
+               ret = eeh_ops->post_init();
+               if (ret)
+                       return ret;
        }
 
        if (eeh_subsystem_enabled)
@@ -728,6 +842,14 @@ static void eeh_add_device_early(struct device_node *dn)
 {
        struct pci_controller *phb;
 
+       /*
+        * If we're doing the EEH probe based on the PCI device, we
+        * delay the probe until the late stage because the PCI
+        * device isn't available at this moment.
+        */
+       if (!eeh_probe_mode_devtree())
+               return;
+
        if (!of_node_to_eeh_dev(dn))
                return;
        phb = of_node_to_eeh_dev(dn)->phb;
@@ -736,7 +858,6 @@ static void eeh_add_device_early(struct device_node *dn)
        if (NULL == phb || 0 == phb->buid)
                return;
 
-       /* FIXME: hotplug support on POWERNV */
        eeh_ops->of_probe(dn, NULL);
 }
 
@@ -787,6 +908,13 @@ static void eeh_add_device_late(struct pci_dev *dev)
        edev->pdev = dev;
        dev->dev.archdata.edev = edev;
 
+       /*
+        * We have to do the EEH probe here because the PCI device
+        * hasn't been created yet in the early stage.
+        */
+       if (eeh_probe_mode_dev())
+               eeh_ops->dev_probe(dev, NULL);
+
        eeh_addr_cache_insert_dev(dev);
 }
 
@@ -803,12 +931,12 @@ void eeh_add_device_tree_late(struct pci_bus *bus)
        struct pci_dev *dev;
 
        list_for_each_entry(dev, &bus->devices, bus_list) {
-               eeh_add_device_late(dev);
-               if (dev->hdr_type == PCI_HEADER_TYPE_BRIDGE) {
-                       struct pci_bus *subbus = dev->subordinate;
-                       if (subbus)
-                               eeh_add_device_tree_late(subbus);
-               }
+               eeh_add_device_late(dev);
+               if (dev->hdr_type == PCI_HEADER_TYPE_BRIDGE) {
+                       struct pci_bus *subbus = dev->subordinate;
+                       if (subbus)
+                               eeh_add_device_tree_late(subbus);
+               }
        }
 }
 EXPORT_SYMBOL_GPL(eeh_add_device_tree_late);
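Editorial note: throughout eeh.c the raw confirm_error_lock usage is replaced by eeh_serialize_lock()/eeh_serialize_unlock() (the wrappers are defined in the EEH header, not shown in this hunk). A hedged sketch of the detection pattern they serialize, condensed from eeh_phb_check_failure() and eeh_dev_check_failure() above; the function name is invented for illustration.

/* Editorial sketch (not from the patch): mark a PE isolated exactly once
 * and queue a failure event for the recovery thread.
 */
static int example_isolate_and_report(struct eeh_pe *pe)
{
	unsigned long flags;

	eeh_serialize_lock(&flags);
	if (pe->state & EEH_PE_ISOLATED) {
		eeh_serialize_unlock(flags);
		return 0;		/* already being recovered */
	}
	eeh_pe_state_mark(pe, EEH_PE_ISOLATED);
	eeh_serialize_unlock(flags);

	eeh_send_failure_event(pe);
	return 1;
}
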
similarity index 99%
rename from arch/powerpc/platforms/pseries/eeh_cache.c
rename to arch/powerpc/kernel/eeh_cache.c
index 5ce3ba7ad1372a6de232692e23b3c411bc1ed33f..f9ac1232a746c595d696ef7b66469a093070c877 100644 (file)
@@ -194,7 +194,7 @@ static void __eeh_addr_cache_insert_dev(struct pci_dev *dev)
        }
 
        /* Skip any devices for which EEH is not enabled. */
-       if (!edev->pe) {
+       if (!eeh_probe_mode_dev() && !edev->pe) {
 #ifdef DEBUG
                pr_info("PCI: skip building address cache for=%s - %s\n",
                        pci_name(dev), dn->full_name);
@@ -285,7 +285,7 @@ void eeh_addr_cache_rmv_dev(struct pci_dev *dev)
  * Must be run late in boot process, after the pci controllers
  * have been scanned for devices (after all device resources are known).
  */
-void __init eeh_addr_cache_build(void)
+void eeh_addr_cache_build(void)
 {
        struct device_node *dn;
        struct eeh_dev *edev;
@@ -316,4 +316,3 @@ void __init eeh_addr_cache_build(void)
        eeh_addr_cache_print(&pci_io_addr_cache_root);
 #endif
 }
-
similarity index 81%
rename from arch/powerpc/platforms/pseries/eeh_driver.c
rename to arch/powerpc/kernel/eeh_driver.c
index a3fefb61097c76f513c1bf48378120e1c4c74e59..2b1ce17cae504d95a5be5e04e16828a5e3f7516e 100644 (file)
@@ -154,9 +154,9 @@ static void eeh_enable_irq(struct pci_dev *dev)
  * eeh_report_error - Report pci error to each device driver
  * @data: eeh device
  * @userdata: return value
- * 
- * Report an EEH error to each device driver, collect up and 
- * merge the device driver responses. Cumulative response 
+ *
+ * Report an EEH error to each device driver, collect up and
+ * merge the device driver responses. Cumulative response
  * passed back in "userdata".
  */
 static void *eeh_report_error(void *data, void *userdata)
@@ -349,10 +349,12 @@ static void *eeh_report_failure(void *data, void *userdata)
  */
 static int eeh_reset_device(struct eeh_pe *pe, struct pci_bus *bus)
 {
+       struct timeval tstamp;
        int cnt, rc;
 
        /* pcibios will clear the counter; save the value */
        cnt = pe->freeze_count;
+       tstamp = pe->tstamp;
 
        /*
         * We don't remove the corresponding PE instances because
@@ -376,15 +378,17 @@ static int eeh_reset_device(struct eeh_pe *pe, struct pci_bus *bus)
        eeh_pe_restore_bars(pe);
 
        /* Give the system 5 seconds to finish running the user-space
-        * hotplug shutdown scripts, e.g. ifdown for ethernet.  Yes, 
-        * this is a hack, but if we don't do this, and try to bring 
-        * the device up before the scripts have taken it down, 
+        * hotplug shutdown scripts, e.g. ifdown for ethernet.  Yes,
+        * this is a hack, but if we don't do this, and try to bring
+        * the device up before the scripts have taken it down,
         * potentially weird things happen.
         */
        if (bus) {
                ssleep(5);
                pcibios_add_pci_devices(bus);
        }
+
+       pe->tstamp = tstamp;
        pe->freeze_count = cnt;
 
        return 0;
@@ -395,24 +399,7 @@ static int eeh_reset_device(struct eeh_pe *pe, struct pci_bus *bus)
  */
 #define MAX_WAIT_FOR_RECOVERY 150
 
-/**
- * eeh_handle_event - Reset a PCI device after hard lockup.
- * @pe: EEH PE
- *
- * While PHB detects address or data parity errors on particular PCI
- * slot, the associated PE will be frozen. Besides, DMA's occurring
- * to wild addresses (which usually happen due to bugs in device
- * drivers or in PCI adapter firmware) can cause EEH error. #SERR,
- * #PERR or other misc PCI-related errors also can trigger EEH errors.
- *
- * Recovery process consists of unplugging the device driver (which
- * generated hotplug events to userspace), then issuing a PCI #RST to
- * the device, then reconfiguring the PCI config space for all bridges
- * & devices under this slot, and then finally restarting the device
- * drivers (which cause a second set of hotplug events to go out to
- * userspace).
- */
-void eeh_handle_event(struct eeh_pe *pe)
+static void eeh_handle_normal_event(struct eeh_pe *pe)
 {
        struct pci_bus *frozen_bus;
        int rc = 0;
@@ -425,6 +412,7 @@ void eeh_handle_event(struct eeh_pe *pe)
                return;
        }
 
+       eeh_pe_update_time_stamp(pe);
        pe->freeze_count++;
        if (pe->freeze_count > EEH_MAX_ALLOWED_FREEZES)
                goto excess_failures;
@@ -437,6 +425,7 @@ void eeh_handle_event(struct eeh_pe *pe)
         * status ... if any child can't handle the reset, then the entire
         * slot is dlpar removed and added.
         */
+       pr_info("EEH: Notify device drivers to shutdown\n");
        eeh_pe_dev_traverse(pe, eeh_report_error, &result);
 
        /* Get the current PCI slot state. This can take a long time,
@@ -444,7 +433,7 @@ void eeh_handle_event(struct eeh_pe *pe)
         */
        rc = eeh_ops->wait_state(pe, MAX_WAIT_FOR_RECOVERY*1000);
        if (rc < 0 || rc == EEH_STATE_NOT_SUPPORT) {
-               printk(KERN_WARNING "EEH: Permanent failure\n");
+               pr_warning("EEH: Permanent failure\n");
                goto hard_fail;
        }
 
@@ -452,6 +441,7 @@ void eeh_handle_event(struct eeh_pe *pe)
         * don't post the error log until after all dev drivers
         * have been informed.
         */
+       pr_info("EEH: Collect temporary log\n");
        eeh_slot_error_detail(pe, EEH_LOG_TEMP);
 
        /* If all device drivers were EEH-unaware, then shut
@@ -459,15 +449,18 @@ void eeh_handle_event(struct eeh_pe *pe)
         * go down willingly, without panicing the system.
         */
        if (result == PCI_ERS_RESULT_NONE) {
+               pr_info("EEH: Reset with hotplug activity\n");
                rc = eeh_reset_device(pe, frozen_bus);
                if (rc) {
-                       printk(KERN_WARNING "EEH: Unable to reset, rc=%d\n", rc);
+                       pr_warning("%s: Unable to reset, err=%d\n",
+                                  __func__, rc);
                        goto hard_fail;
                }
        }
 
        /* If all devices reported they can proceed, then re-enable MMIO */
        if (result == PCI_ERS_RESULT_CAN_RECOVER) {
+               pr_info("EEH: Enable I/O for affected devices\n");
                rc = eeh_pci_enable(pe, EEH_OPT_THAW_MMIO);
 
                if (rc < 0)
@@ -475,6 +468,7 @@ void eeh_handle_event(struct eeh_pe *pe)
                if (rc) {
                        result = PCI_ERS_RESULT_NEED_RESET;
                } else {
+                       pr_info("EEH: Notify device drivers to resume I/O\n");
                        result = PCI_ERS_RESULT_NONE;
                        eeh_pe_dev_traverse(pe, eeh_report_mmio_enabled, &result);
                }
@@ -482,6 +476,7 @@ void eeh_handle_event(struct eeh_pe *pe)
 
        /* If all devices reported they can proceed, then re-enable DMA */
        if (result == PCI_ERS_RESULT_CAN_RECOVER) {
+               pr_info("EEH: Enabled DMA for affected devices\n");
                rc = eeh_pci_enable(pe, EEH_OPT_THAW_DMA);
 
                if (rc < 0)
@@ -494,17 +489,22 @@ void eeh_handle_event(struct eeh_pe *pe)
 
        /* If any device has a hard failure, then shut off everything. */
        if (result == PCI_ERS_RESULT_DISCONNECT) {
-               printk(KERN_WARNING "EEH: Device driver gave up\n");
+               pr_warning("EEH: Device driver gave up\n");
                goto hard_fail;
        }
 
        /* If any device called out for a reset, then reset the slot */
        if (result == PCI_ERS_RESULT_NEED_RESET) {
+               pr_info("EEH: Reset without hotplug activity\n");
                rc = eeh_reset_device(pe, NULL);
                if (rc) {
-                       printk(KERN_WARNING "EEH: Cannot reset, rc=%d\n", rc);
+                       pr_warning("%s: Cannot reset, err=%d\n",
+                                  __func__, rc);
                        goto hard_fail;
                }
+
+               pr_info("EEH: Notify device drivers "
+                       "the completion of reset\n");
                result = PCI_ERS_RESULT_NONE;
                eeh_pe_dev_traverse(pe, eeh_report_reset, &result);
        }
@@ -512,15 +512,16 @@ void eeh_handle_event(struct eeh_pe *pe)
        /* All devices should claim they have recovered by now. */
        if ((result != PCI_ERS_RESULT_RECOVERED) &&
            (result != PCI_ERS_RESULT_NONE)) {
-               printk(KERN_WARNING "EEH: Not recovered\n");
+               pr_warning("EEH: Not recovered\n");
                goto hard_fail;
        }
 
        /* Tell all device drivers that they can resume operations */
+       pr_info("EEH: Notify device driver to resume\n");
        eeh_pe_dev_traverse(pe, eeh_report_resume, NULL);
 
        return;
-       
+
 excess_failures:
        /*
         * About 90% of all real-life EEH failures in the field
@@ -550,3 +551,111 @@ perm_error:
                pcibios_remove_pci_devices(frozen_bus);
 }
 
+static void eeh_handle_special_event(void)
+{
+       struct eeh_pe *pe, *phb_pe;
+       struct pci_bus *bus;
+       struct pci_controller *hose, *tmp;
+       unsigned long flags;
+       int rc = 0;
+
+       /*
+        * The return value from next_error() is classified as follows.
+        * It might be good to enumerate them; however, next_error() is
+        * only supported on the PowerNV platform for now, so using the
+        * integers directly is fine:
+        *
+        * 4 - Dead IOC           3 - Dead PHB
+        * 2 - Fenced PHB         1 - Frozen PE
+        * 0 - No error found
+        *
+        */
+       rc = eeh_ops->next_error(&pe);
+       if (rc <= 0)
+               return;
+
+       switch (rc) {
+       case 4:
+               /* Mark all PHBs in dead state */
+               eeh_serialize_lock(&flags);
+               list_for_each_entry_safe(hose, tmp,
+                               &hose_list, list_node) {
+                       phb_pe = eeh_phb_pe_get(hose);
+                       if (!phb_pe) continue;
+
+                       eeh_pe_state_mark(phb_pe,
+                               EEH_PE_ISOLATED | EEH_PE_PHB_DEAD);
+               }
+               eeh_serialize_unlock(flags);
+
+               /* Purge all events */
+               eeh_remove_event(NULL);
+               break;
+       case 3:
+       case 2:
+       case 1:
+               /* Mark the PE in fenced state */
+               eeh_serialize_lock(&flags);
+               if (rc == 3)
+                       eeh_pe_state_mark(pe,
+                               EEH_PE_ISOLATED | EEH_PE_PHB_DEAD);
+               else
+                       eeh_pe_state_mark(pe,
+                               EEH_PE_ISOLATED | EEH_PE_RECOVERING);
+               eeh_serialize_unlock(flags);
+
+               /* Purge all events of the PHB */
+               eeh_remove_event(pe);
+               break;
+       default:
+               pr_err("%s: Invalid value %d from next_error()\n",
+                      __func__, rc);
+               return;
+       }
+
+       /*
+        * A fenced PHB or frozen PE is handled as a normal
+        * event. For a dead PHB or IOC we have to remove the
+        * affected PHBs.
+        */
+       if (rc == 2 || rc == 1)
+               eeh_handle_normal_event(pe);
+       else {
+               list_for_each_entry_safe(hose, tmp,
+                       &hose_list, list_node) {
+                       phb_pe = eeh_phb_pe_get(hose);
+                       if (!phb_pe || !(phb_pe->state & EEH_PE_PHB_DEAD))
+                               continue;
+
+                       bus = eeh_pe_bus_get(phb_pe);
+                       /* Notify all devices that they're about to go down. */
+                       eeh_pe_dev_traverse(pe, eeh_report_failure, NULL);
+                       pcibios_remove_pci_devices(bus);
+               }
+       }
+}
+
+/**
+ * eeh_handle_event - Reset a PCI device after hard lockup.
+ * @pe: EEH PE
+ *
+ * While PHB detects address or data parity errors on particular PCI
+ * slot, the associated PE will be frozen. Besides, DMA's occurring
+ * to wild addresses (which usually happen due to bugs in device
+ * drivers or in PCI adapter firmware) can cause EEH error. #SERR,
+ * #PERR or other misc PCI-related errors also can trigger EEH errors.
+ *
+ * Recovery process consists of unplugging the device driver (which
+ * generated hotplug events to userspace), then issuing a PCI #RST to
+ * the device, then reconfiguring the PCI config space for all bridges
+ * & devices under this slot, and then finally restarting the device
+ * drivers (which cause a second set of hotplug events to go out to
+ * userspace).
+ */
+void eeh_handle_event(struct eeh_pe *pe)
+{
+       if (pe)
+               eeh_handle_normal_event(pe);
+       else
+               eeh_handle_special_event();
+}
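Editorial note: eeh_handle_special_event() dispatches on the raw integer codes returned by eeh_ops->next_error(); the comment itself notes that enumerating them would be clearer. A hedged sketch of what such an enumeration could look like (the names are invented and are not part of the patch).

/* Editorial sketch (not from the patch): symbolic names for the
 * next_error() return codes listed in the comment above.
 */
enum example_eeh_next_error {
	EXAMPLE_EEH_ERR_NONE		= 0,	/* No error found */
	EXAMPLE_EEH_ERR_FROZEN_PE	= 1,	/* Frozen PE */
	EXAMPLE_EEH_ERR_FENCED_PHB	= 2,	/* Fenced PHB */
	EXAMPLE_EEH_ERR_DEAD_PHB	= 3,	/* Dead PHB */
	EXAMPLE_EEH_ERR_DEAD_IOC	= 4,	/* Dead IOC */
};
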
similarity index 56%
rename from arch/powerpc/platforms/pseries/eeh_event.c
rename to arch/powerpc/kernel/eeh_event.c
index 185bedd926df7258bef7070d97c68f577bf91298..d27c5afc90aecfbe41506814d3a0c3891bdacc4b 100644 (file)
 
 #include <linux/delay.h>
 #include <linux/list.h>
-#include <linux/mutex.h>
 #include <linux/sched.h>
+#include <linux/semaphore.h>
 #include <linux/pci.h>
 #include <linux/slab.h>
-#include <linux/workqueue.h>
 #include <linux/kthread.h>
 #include <asm/eeh_event.h>
 #include <asm/ppc-pci.h>
  *  work-queue, where a worker thread can drive recovery.
  */
 
-/* EEH event workqueue setup. */
 static DEFINE_SPINLOCK(eeh_eventlist_lock);
+static struct semaphore eeh_eventlist_sem;
 LIST_HEAD(eeh_eventlist);
-static void eeh_thread_launcher(struct work_struct *);
-DECLARE_WORK(eeh_event_wq, eeh_thread_launcher);
-
-/* Serialize reset sequences for a given pci device */
-DEFINE_MUTEX(eeh_event_mutex);
 
 /**
  * eeh_event_handler - Dispatch EEH events.
@@ -60,55 +54,63 @@ static int eeh_event_handler(void * dummy)
        struct eeh_event *event;
        struct eeh_pe *pe;
 
-       spin_lock_irqsave(&eeh_eventlist_lock, flags);
-       event = NULL;
-
-       /* Unqueue the event, get ready to process. */
-       if (!list_empty(&eeh_eventlist)) {
-               event = list_entry(eeh_eventlist.next, struct eeh_event, list);
-               list_del(&event->list);
-       }
-       spin_unlock_irqrestore(&eeh_eventlist_lock, flags);
-
-       if (event == NULL)
-               return 0;
-
-       /* Serialize processing of EEH events */
-       mutex_lock(&eeh_event_mutex);
-       pe = event->pe;
-       eeh_pe_state_mark(pe, EEH_PE_RECOVERING);
-       pr_info("EEH: Detected PCI bus error on PHB#%d-PE#%x\n",
-               pe->phb->global_number, pe->addr);
-
-       set_current_state(TASK_INTERRUPTIBLE);  /* Don't add to load average */
-       eeh_handle_event(pe);
-       eeh_pe_state_clear(pe, EEH_PE_RECOVERING);
-
-       kfree(event);
-       mutex_unlock(&eeh_event_mutex);
-
-       /* If there are no new errors after an hour, clear the counter. */
-       if (pe && pe->freeze_count > 0) {
-               msleep_interruptible(3600*1000);
-               if (pe->freeze_count > 0)
-                       pe->freeze_count--;
-
+       while (!kthread_should_stop()) {
+               if (down_interruptible(&eeh_eventlist_sem))
+                       break;
+
+               /* Fetch EEH event from the queue */
+               spin_lock_irqsave(&eeh_eventlist_lock, flags);
+               event = NULL;
+               if (!list_empty(&eeh_eventlist)) {
+                       event = list_entry(eeh_eventlist.next,
+                                          struct eeh_event, list);
+                       list_del(&event->list);
+               }
+               spin_unlock_irqrestore(&eeh_eventlist_lock, flags);
+               if (!event)
+                       continue;
+
+               /* We might have an event without a bound PE */
+               pe = event->pe;
+               if (pe) {
+                       eeh_pe_state_mark(pe, EEH_PE_RECOVERING);
+                       pr_info("EEH: Detected PCI bus error on PHB#%d-PE#%x\n",
+                                pe->phb->global_number, pe->addr);
+                       eeh_handle_event(pe);
+                       eeh_pe_state_clear(pe, EEH_PE_RECOVERING);
+               } else {
+                       eeh_handle_event(NULL);
+               }
+
+               kfree(event);
        }
 
        return 0;
 }
 
 /**
- * eeh_thread_launcher - Start kernel thread to handle EEH events
- * @dummy - unused
+ * eeh_event_init - Start kernel thread to handle EEH events
  *
  * This routine is called to start the kernel thread for processing
  * EEH event.
  */
-static void eeh_thread_launcher(struct work_struct *dummy)
+int eeh_event_init(void)
 {
-       if (IS_ERR(kthread_run(eeh_event_handler, NULL, "eehd")))
-               printk(KERN_ERR "Failed to start EEH daemon\n");
+       struct task_struct *t;
+       int ret = 0;
+
+       /* Initialize semaphore */
+       sema_init(&eeh_eventlist_sem, 0);
+
+       t = kthread_run(eeh_event_handler, NULL, "eehd");
+       if (IS_ERR(t)) {
+               ret = PTR_ERR(t);
+               pr_err("%s: Failed to start EEH daemon (%d)\n",
+                       __func__, ret);
+               return ret;
+       }
+
+       return 0;
 }
 
 /**
@@ -136,7 +138,45 @@ int eeh_send_failure_event(struct eeh_pe *pe)
        list_add(&event->list, &eeh_eventlist);
        spin_unlock_irqrestore(&eeh_eventlist_lock, flags);
 
-       schedule_work(&eeh_event_wq);
+       /* Kick the EEH daemon to handle the new event */
+       up(&eeh_eventlist_sem);
 
        return 0;
 }
+
+/**
+ * eeh_remove_event - Remove EEH event from the queue
+ * @pe: Event binding to the PE
+ *
+ * On PowerNV platform, we might have subsequent coming events
+ * is part of the former one. For that case, those subsequent
+ * coming events are totally duplicated and unnecessary, thus
+ * they should be removed.
+ */
+void eeh_remove_event(struct eeh_pe *pe)
+{
+       unsigned long flags;
+       struct eeh_event *event, *tmp;
+
+       spin_lock_irqsave(&eeh_eventlist_lock, flags);
+       list_for_each_entry_safe(event, tmp, &eeh_eventlist, list) {
+               /*
+                * If no valid PE is passed in, we already have an
+                * event corresponding to a dead IOC, and all events
+                * should be purged.
+                */
+               if (!pe) {
+                       list_del(&event->list);
+                       kfree(event);
+               } else if (pe->type & EEH_PE_PHB) {
+                       if (event->pe && event->pe->phb == pe->phb) {
+                               list_del(&event->list);
+                               kfree(event);
+                       }
+               } else if (event->pe == pe) {
+                       list_del(&event->list);
+                       kfree(event);
+               }
+       }
+       spin_unlock_irqrestore(&eeh_eventlist_lock, flags);
+}
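Editorial note: the reworked event path is a plain producer/consumer. eeh_send_failure_event() queues the event under eeh_eventlist_lock and ups eeh_eventlist_sem, while the long-lived eehd thread downs the semaphore and dequeues. A condensed, hedged sketch of that shape, using only symbols that appear above; the function names are invented for illustration.

/* Editorial sketch (not from the patch): the semaphore-driven
 * producer/consumer pattern used by eeh_send_failure_event() and
 * eeh_event_handler().
 */
static void example_queue_event(struct eeh_event *event)
{
	unsigned long flags;

	spin_lock_irqsave(&eeh_eventlist_lock, flags);
	list_add(&event->list, &eeh_eventlist);
	spin_unlock_irqrestore(&eeh_eventlist_lock, flags);

	up(&eeh_eventlist_sem);			/* wake the eehd thread */
}

static struct eeh_event *example_dequeue_event(void)
{
	struct eeh_event *event = NULL;
	unsigned long flags;

	if (down_interruptible(&eeh_eventlist_sem))
		return NULL;			/* interrupted while waiting */

	spin_lock_irqsave(&eeh_eventlist_lock, flags);
	if (!list_empty(&eeh_eventlist)) {
		event = list_entry(eeh_eventlist.next, struct eeh_event, list);
		list_del(&event->list);
	}
	spin_unlock_irqrestore(&eeh_eventlist_lock, flags);

	return event;
}
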
similarity index 75%
rename from arch/powerpc/platforms/pseries/eeh_pe.c
rename to arch/powerpc/kernel/eeh_pe.c
index 9d4a9e8562b2229a7791bb8b069dc13c9c2c9a73..016588a6f5ede19c9c45b0ab6a3bd44cfce962ae 100644 (file)
@@ -22,6 +22,7 @@
  * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307 USA
  */
 
+#include <linux/delay.h>
 #include <linux/export.h>
 #include <linux/gfp.h>
 #include <linux/init.h>
@@ -78,9 +79,7 @@ int eeh_phb_pe_create(struct pci_controller *phb)
        }
 
        /* Put it into the list */
-       eeh_lock();
        list_add_tail(&pe->child, &eeh_phb_pe);
-       eeh_unlock();
 
        pr_debug("EEH: Add PE for PHB#%d\n", phb->global_number);
 
@@ -95,7 +94,7 @@ int eeh_phb_pe_create(struct pci_controller *phb)
  * hierarchy tree is composed of PHB PEs. The function is used
  * to retrieve the corresponding PHB PE according to the given PHB.
  */
-static struct eeh_pe *eeh_phb_pe_get(struct pci_controller *phb)
+struct eeh_pe *eeh_phb_pe_get(struct pci_controller *phb)
 {
        struct eeh_pe *pe;
 
@@ -185,21 +184,15 @@ void *eeh_pe_dev_traverse(struct eeh_pe *root,
                return NULL;
        }
 
-       eeh_lock();
-
        /* Traverse root PE */
        for (pe = root; pe; pe = eeh_pe_next(pe, root)) {
                eeh_pe_for_each_dev(pe, edev) {
                        ret = fn(edev, flag);
-                       if (ret) {
-                               eeh_unlock();
+                       if (ret)
                                return ret;
-                       }
                }
        }
 
-       eeh_unlock();
-
        return NULL;
 }
 
@@ -228,7 +221,7 @@ static void *__eeh_pe_get(void *data, void *flag)
                return pe;
 
        /* Try BDF address */
-       if (edev->pe_config_addr &&
+       if (edev->config_addr &&
           (edev->config_addr == pe->config_addr))
                return pe;
 
@@ -246,7 +239,7 @@ static void *__eeh_pe_get(void *data, void *flag)
  * which is composed of PCI bus/device/function number, or unified
  * PE address.
  */
-static struct eeh_pe *eeh_pe_get(struct eeh_dev *edev)
+struct eeh_pe *eeh_pe_get(struct eeh_dev *edev)
 {
        struct eeh_pe *root = eeh_phb_pe_get(edev->phb);
        struct eeh_pe *pe;
@@ -305,8 +298,6 @@ int eeh_add_to_parent_pe(struct eeh_dev *edev)
 {
        struct eeh_pe *pe, *parent;
 
-       eeh_lock();
-
        /*
         * Search the PE has been existing or not according
         * to the PE address. If that has been existing, the
@@ -316,7 +307,6 @@ int eeh_add_to_parent_pe(struct eeh_dev *edev)
        pe = eeh_pe_get(edev);
        if (pe && !(pe->type & EEH_PE_INVALID)) {
                if (!edev->pe_config_addr) {
-                       eeh_unlock();
                        pr_err("%s: PE with addr 0x%x already exists\n",
                                __func__, edev->config_addr);
                        return -EEXIST;
@@ -328,7 +318,6 @@ int eeh_add_to_parent_pe(struct eeh_dev *edev)
 
                /* Put the edev to PE */
                list_add_tail(&edev->list, &pe->edevs);
-               eeh_unlock();
                pr_debug("EEH: Add %s to Bus PE#%x\n",
                        edev->dn->full_name, pe->addr);
 
@@ -347,7 +336,6 @@ int eeh_add_to_parent_pe(struct eeh_dev *edev)
                        parent->type &= ~EEH_PE_INVALID;
                        parent = parent->parent;
                }
-               eeh_unlock();
                pr_debug("EEH: Add %s to Device PE#%x, Parent PE#%x\n",
                        edev->dn->full_name, pe->addr, pe->parent->addr);
 
@@ -357,13 +345,23 @@ int eeh_add_to_parent_pe(struct eeh_dev *edev)
        /* Create a new EEH PE */
        pe = eeh_pe_alloc(edev->phb, EEH_PE_DEVICE);
        if (!pe) {
-               eeh_unlock();
                pr_err("%s: out of memory!\n", __func__);
                return -ENOMEM;
        }
        pe->addr        = edev->pe_config_addr;
        pe->config_addr = edev->config_addr;
 
+       /*
+        * While doing a PE reset, we probably hot-reset the
+        * upstream bridge. However, the PCI devices, including the
+        * associated EEH devices, might be removed while the EEH
+        * core is doing recovery, so it isn't safe to retrieve the
+        * bridge through a downstream EEH device. We have to trace
+        * the parent PCI bus, and then the upstream bridge.
+        */
+       if (eeh_probe_mode_dev())
+               pe->bus = eeh_dev_to_pci_dev(edev)->bus;
+
        /*
         * Put the new EEH PE into hierarchy tree. If the parent
         * can't be found, the newly created PE will be attached
@@ -374,7 +372,6 @@ int eeh_add_to_parent_pe(struct eeh_dev *edev)
        if (!parent) {
                parent = eeh_phb_pe_get(edev->phb);
                if (!parent) {
-                       eeh_unlock();
                        pr_err("%s: No PHB PE is found (PHB Domain=%d)\n",
                                __func__, edev->phb->global_number);
                        edev->pe = NULL;
@@ -391,7 +388,6 @@ int eeh_add_to_parent_pe(struct eeh_dev *edev)
        list_add_tail(&pe->child, &parent->child_list);
        list_add_tail(&edev->list, &pe->edevs);
        edev->pe = pe;
-       eeh_unlock();
        pr_debug("EEH: Add %s to Device PE#%x, Parent PE#%x\n",
                edev->dn->full_name, pe->addr, pe->parent->addr);
 
@@ -419,8 +415,6 @@ int eeh_rmv_from_parent_pe(struct eeh_dev *edev, int purge_pe)
                return -EEXIST;
        }
 
-       eeh_lock();
-
        /* Remove the EEH device */
        pe = edev->pe;
        edev->pe = NULL;
@@ -465,11 +459,36 @@ int eeh_rmv_from_parent_pe(struct eeh_dev *edev, int purge_pe)
                pe = parent;
        }
 
-       eeh_unlock();
-
        return 0;
 }
 
+/**
+ * eeh_pe_update_time_stamp - Update PE's frozen time stamp
+ * @pe: EEH PE
+ *
+ * We keep a time stamp for each PE to trace how often it has been
+ * frozen within the last hour. The function should be called to
+ * update the time stamp on the first error of the specific PE;
+ * errors older than an hour are no longer accounted for.
+ */
+void eeh_pe_update_time_stamp(struct eeh_pe *pe)
+{
+       struct timeval tstamp;
+
+       if (!pe) return;
+
+       if (pe->freeze_count <= 0) {
+               pe->freeze_count = 0;
+               do_gettimeofday(&pe->tstamp);
+       } else {
+               do_gettimeofday(&tstamp);
+               if (tstamp.tv_sec - pe->tstamp.tv_sec > 3600) {
+                       pe->tstamp = tstamp;
+                       pe->freeze_count = 0;
+               }
+       }
+}
+
 /**
  * __eeh_pe_state_mark - Mark the state for the PE
  * @data: EEH PE
@@ -512,9 +531,7 @@ static void *__eeh_pe_state_mark(void *data, void *flag)
  */
 void eeh_pe_state_mark(struct eeh_pe *pe, int state)
 {
-       eeh_lock();
        eeh_pe_traverse(pe, __eeh_pe_state_mark, &state);
-       eeh_unlock();
 }
 
 /**
@@ -548,35 +565,135 @@ static void *__eeh_pe_state_clear(void *data, void *flag)
  */
 void eeh_pe_state_clear(struct eeh_pe *pe, int state)
 {
-       eeh_lock();
        eeh_pe_traverse(pe, __eeh_pe_state_clear, &state);
-       eeh_unlock();
 }
 
-/**
- * eeh_restore_one_device_bars - Restore the Base Address Registers for one device
- * @data: EEH device
- * @flag: Unused
+/*
+ * Some PCI bridges (e.g. PLX bridges) have primary/secondary
+ * buses assigned explicitly by firmware, and we probably have
+ * lost that after reset. So we have to delay the check until
+ * the PCI-CFG registers have been restored for the parent
+ * bridge.
  *
- * Loads the PCI configuration space base address registers,
- * the expansion ROM base address, the latency timer, and etc.
- * from the saved values in the device node.
+ * Don't use the normal PCI-CFG accessors, which have probably been
+ * blocked on the normal path during this stage; instead use the EEH
+ * operations, which are always permitted.
  */
-static void *eeh_restore_one_device_bars(void *data, void *flag)
+static void eeh_bridge_check_link(struct pci_dev *pdev,
+                                 struct device_node *dn)
+{
+       int cap;
+       uint32_t val;
+       int timeout = 0;
+
+       /*
+        * We only check root port and downstream ports of
+        * PCIe switches
+        */
+       if (!pci_is_pcie(pdev) ||
+           (pci_pcie_type(pdev) != PCI_EXP_TYPE_ROOT_PORT &&
+            pci_pcie_type(pdev) != PCI_EXP_TYPE_DOWNSTREAM))
+               return;
+
+       pr_debug("%s: Check PCIe link for %s ...\n",
+                __func__, pci_name(pdev));
+
+       /* Check slot status */
+       cap = pdev->pcie_cap;
+       eeh_ops->read_config(dn, cap + PCI_EXP_SLTSTA, 2, &val);
+       if (!(val & PCI_EXP_SLTSTA_PDS)) {
+               pr_debug("  No card in the slot (0x%04x) !\n", val);
+               return;
+       }
+
+       /* Check power status if we have the capability */
+       eeh_ops->read_config(dn, cap + PCI_EXP_SLTCAP, 2, &val);
+       if (val & PCI_EXP_SLTCAP_PCP) {
+               eeh_ops->read_config(dn, cap + PCI_EXP_SLTCTL, 2, &val);
+               if (val & PCI_EXP_SLTCTL_PCC) {
+                       pr_debug("  In power-off state, power it on ...\n");
+                       val &= ~(PCI_EXP_SLTCTL_PCC | PCI_EXP_SLTCTL_PIC);
+                       val |= (0x0100 & PCI_EXP_SLTCTL_PIC);
+                       eeh_ops->write_config(dn, cap + PCI_EXP_SLTCTL, 2, val);
+                       msleep(2 * 1000);
+               }
+       }
+
+       /* Enable link */
+       eeh_ops->read_config(dn, cap + PCI_EXP_LNKCTL, 2, &val);
+       val &= ~PCI_EXP_LNKCTL_LD;
+       eeh_ops->write_config(dn, cap + PCI_EXP_LNKCTL, 2, val);
+
+       /* Check link */
+       eeh_ops->read_config(dn, cap + PCI_EXP_LNKCAP, 4, &val);
+       if (!(val & PCI_EXP_LNKCAP_DLLLARC)) {
+               pr_debug("  No link reporting capability (0x%08x) \n", val);
+               msleep(1000);
+               return;
+       }
+
+       /* Wait for the link to come up, with a 5s timeout */
+       timeout = 0;
+       while (timeout < 5000) {
+               msleep(20);
+               timeout += 20;
+
+               eeh_ops->read_config(dn, cap + PCI_EXP_LNKSTA, 2, &val);
+               if (val & PCI_EXP_LNKSTA_DLLLA)
+                       break;
+       }
+
+       if (val & PCI_EXP_LNKSTA_DLLLA)
+               pr_debug("  Link up (%s)\n",
+                        (val & PCI_EXP_LNKSTA_CLS_2_5GB) ? "2.5GB" : "5GB");
+       else
+               pr_debug("  Link not ready (0x%04x)\n", val);
+}
+
+#define BYTE_SWAP(OFF) (8*((OFF)/4)+3-(OFF))
+#define SAVED_BYTE(OFF)        (((u8 *)(edev->config_space))[BYTE_SWAP(OFF)])
+
+static void eeh_restore_bridge_bars(struct pci_dev *pdev,
+                                   struct eeh_dev *edev,
+                                   struct device_node *dn)
+{
+       int i;
+
+       /*
+        * Device BARs: 0x10 - 0x18
+        * Bus numbers and windows: 0x18 - 0x30
+        */
+       for (i = 4; i < 13; i++)
+               eeh_ops->write_config(dn, i*4, 4, edev->config_space[i]);
+       /* Rom: 0x38 */
+       eeh_ops->write_config(dn, 14*4, 4, edev->config_space[14]);
+
+       /* Cache line & Latency timer: 0xC 0xD */
+       eeh_ops->write_config(dn, PCI_CACHE_LINE_SIZE, 1,
+                SAVED_BYTE(PCI_CACHE_LINE_SIZE));
+        eeh_ops->write_config(dn, PCI_LATENCY_TIMER, 1,
+                SAVED_BYTE(PCI_LATENCY_TIMER));
+       /* Max latency, min grant, interrupt pin and line: 0x3C */
+       eeh_ops->write_config(dn, 15*4, 4, edev->config_space[15]);
+
+       /* PCI Command: 0x4 */
+       eeh_ops->write_config(dn, PCI_COMMAND, 4, edev->config_space[1]);
+
+       /* Check the PCIe link is ready */
+       eeh_bridge_check_link(pdev, dn);
+}
+
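For reference, SAVED_BYTE() above relies on BYTE_SWAP() to find a single byte inside the saved 32-bit config-space words, which were captured in the CPU's big-endian byte order. A stand-alone sketch of the offset mapping (the macro body is the one from this patch; everything else is illustrative):

#include <stdio.h>

/* Same formula as the macro above: maps a PCI config-space byte offset
 * to an index into the byte view of the saved u32 words. */
#define BYTE_SWAP(OFF) (8*((OFF)/4)+3-(OFF))

int main(void)
{
        /* 0x0C is PCI_CACHE_LINE_SIZE, 0x0D is PCI_LATENCY_TIMER */
        printf("0x0C -> byte index %d\n", BYTE_SWAP(0x0C));    /* 15 */
        printf("0x0D -> byte index %d\n", BYTE_SWAP(0x0D));    /* 14 */
        return 0;
}
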
+static void eeh_restore_device_bars(struct eeh_dev *edev,
+                                   struct device_node *dn)
 {
        int i;
        u32 cmd;
-       struct eeh_dev *edev = (struct eeh_dev *)data;
-       struct device_node *dn = eeh_dev_to_of_node(edev);
 
        for (i = 4; i < 10; i++)
                eeh_ops->write_config(dn, i*4, 4, edev->config_space[i]);
        /* 12 == Expansion ROM Address */
        eeh_ops->write_config(dn, 12*4, 4, edev->config_space[12]);
 
-#define BYTE_SWAP(OFF) (8*((OFF)/4)+3-(OFF))
-#define SAVED_BYTE(OFF) (((u8 *)(edev->config_space))[BYTE_SWAP(OFF)])
-
        eeh_ops->write_config(dn, PCI_CACHE_LINE_SIZE, 1,
                SAVED_BYTE(PCI_CACHE_LINE_SIZE));
        eeh_ops->write_config(dn, PCI_LATENCY_TIMER, 1,
@@ -599,6 +716,34 @@ static void *eeh_restore_one_device_bars(void *data, void *flag)
        else
                cmd &= ~PCI_COMMAND_SERR;
        eeh_ops->write_config(dn, PCI_COMMAND, 4, cmd);
+}
+
+/**
+ * eeh_restore_one_device_bars - Restore the Base Address Registers for one device
+ * @data: EEH device
+ * @flag: Unused
+ *
+ * Loads the PCI configuration space base address registers,
+ * the expansion ROM base address, the latency timer, etc.
+ * from the saved values in the device node.
+ */
+static void *eeh_restore_one_device_bars(void *data, void *flag)
+{
+       struct pci_dev *pdev = NULL;
+       struct eeh_dev *edev = (struct eeh_dev *)data;
+       struct device_node *dn = eeh_dev_to_of_node(edev);
+
+       /* Check whether the device is a PCI bridge */
+       if (eeh_probe_mode_dev()) {
+               pdev = eeh_dev_to_pci_dev(edev);
+               if (pdev->hdr_type != PCI_HEADER_TYPE_BRIDGE)
+                        pdev = NULL;
+        }
+
+       if (pdev)
+               eeh_restore_bridge_bars(pdev, edev, dn);
+       else
+               eeh_restore_device_bars(edev, dn);
 
        return NULL;
 }
@@ -635,19 +780,21 @@ struct pci_bus *eeh_pe_bus_get(struct eeh_pe *pe)
        struct eeh_dev *edev;
        struct pci_dev *pdev;
 
-       eeh_lock();
-
        if (pe->type & EEH_PE_PHB) {
                bus = pe->phb->bus;
        } else if (pe->type & EEH_PE_BUS ||
                   pe->type & EEH_PE_DEVICE) {
+               if (pe->bus) {
+                       bus = pe->bus;
+                       goto out;
+               }
+
                edev = list_first_entry(&pe->edevs, struct eeh_dev, list);
                pdev = eeh_dev_to_pci_dev(edev);
                if (pdev)
                        bus = pdev->bus;
        }
 
-       eeh_unlock();
-
+out:
        return bus;
 }
similarity index 99%
rename from arch/powerpc/platforms/pseries/eeh_sysfs.c
rename to arch/powerpc/kernel/eeh_sysfs.c
index d37708360f2e472b68d226df3da5a372ecfd406e..e7ae3484918c29639ff1a1baa597579a973fe6cd 100644 (file)
@@ -72,4 +72,3 @@ void eeh_sysfs_remove_device(struct pci_dev *pdev)
        device_remove_file(&pdev->dev, &dev_attr_eeh_config_addr);
        device_remove_file(&pdev->dev, &dev_attr_eeh_pe_config_addr);
 }
-
index 8741c854e03d50800cba18ad091f3d843c6a0111..ab15b8d057ad361f609b06277eb0a993574098cd 100644 (file)
@@ -629,21 +629,43 @@ _GLOBAL(ret_from_except_lite)
 
        CURRENT_THREAD_INFO(r9, r1)
        ld      r3,_MSR(r1)
+#ifdef CONFIG_PPC_BOOK3E
+       ld      r10,PACACURRENT(r13)
+#endif /* CONFIG_PPC_BOOK3E */
        ld      r4,TI_FLAGS(r9)
        andi.   r3,r3,MSR_PR
        beq     resume_kernel
+#ifdef CONFIG_PPC_BOOK3E
+       lwz     r3,(THREAD+THREAD_DBCR0)(r10)
+#endif /* CONFIG_PPC_BOOK3E */
 
        /* Check current_thread_info()->flags */
        andi.   r0,r4,_TIF_USER_WORK_MASK
+#ifdef CONFIG_PPC_BOOK3E
+       bne     1f
+       /*
+        * Check to see if the dbcr0 register is set up to debug.
+        * Use the internal debug mode bit to do this.
+        */
+       andis.  r0,r3,DBCR0_IDM@h
        beq     restore
-
-       andi.   r0,r4,_TIF_NEED_RESCHED
-       beq     1f
+       mfmsr   r0
+       rlwinm  r0,r0,0,~MSR_DE /* Clear MSR.DE */
+       mtmsr   r0
+       mtspr   SPRN_DBCR0,r3
+       li      r10, -1
+       mtspr   SPRN_DBSR,r10
+       b       restore
+#else
+       beq     restore
+#endif
+1:     andi.   r0,r4,_TIF_NEED_RESCHED
+       beq     2f
        bl      .restore_interrupts
        SCHEDULE_USER
        b       .ret_from_except_lite
 
-1:     bl      .save_nvgprs
+2:     bl      .save_nvgprs
        bl      .restore_interrupts
        addi    r3,r1,STACK_FRAME_OVERHEAD
        bl      .do_notify_resume
index 40e4a17c8ba0f249e2b65d85d4014fe7aa47f7a7..4e00d223b2e30924dba5e82370d5bae09e7b56dc 100644 (file)
@@ -341,10 +341,17 @@ vsx_unavailable_pSeries_1:
        EXCEPTION_PROLOG_0(PACA_EXGEN)
        b       vsx_unavailable_pSeries
 
+facility_unavailable_trampoline:
        . = 0xf60
        SET_SCRATCH0(r13)
        EXCEPTION_PROLOG_0(PACA_EXGEN)
-       b       tm_unavailable_pSeries
+       b       facility_unavailable_pSeries
+
+hv_facility_unavailable_trampoline:
+       . = 0xf80
+       SET_SCRATCH0(r13)
+       EXCEPTION_PROLOG_0(PACA_EXGEN)
+       b       facility_unavailable_hv
 
 #ifdef CONFIG_CBE_RAS
        STD_EXCEPTION_HV(0x1200, 0x1202, cbe_system_error)
@@ -522,8 +529,10 @@ denorm_done:
        KVM_HANDLER_PR(PACA_EXGEN, EXC_STD, 0xf20)
        STD_EXCEPTION_PSERIES_OOL(0xf40, vsx_unavailable)
        KVM_HANDLER_PR(PACA_EXGEN, EXC_STD, 0xf40)
-       STD_EXCEPTION_PSERIES_OOL(0xf60, tm_unavailable)
+       STD_EXCEPTION_PSERIES_OOL(0xf60, facility_unavailable)
        KVM_HANDLER_PR(PACA_EXGEN, EXC_STD, 0xf60)
+       STD_EXCEPTION_HV_OOL(0xf82, facility_unavailable)
+       KVM_HANDLER(PACA_EXGEN, EXC_HV, 0xf82)
 
 /*
  * An interrupt came in while soft-disabled. We set paca->irq_happened, then:
@@ -793,14 +802,10 @@ system_call_relon_pSeries:
        STD_RELON_EXCEPTION_PSERIES(0x4d00, 0xd00, single_step)
 
        . = 0x4e00
-       SET_SCRATCH0(r13)
-       EXCEPTION_PROLOG_0(PACA_EXGEN)
-       b       h_data_storage_relon_hv
+       b       .       /* Can't happen, see v2.07 Book III-S section 6.5 */
 
        . = 0x4e20
-       SET_SCRATCH0(r13)
-       EXCEPTION_PROLOG_0(PACA_EXGEN)
-       b       h_instr_storage_relon_hv
+       b       .       /* Can't happen, see v2.07 Book III-S section 6.5 */
 
        . = 0x4e40
        SET_SCRATCH0(r13)
@@ -808,9 +813,7 @@ system_call_relon_pSeries:
        b       emulation_assist_relon_hv
 
        . = 0x4e60
-       SET_SCRATCH0(r13)
-       EXCEPTION_PROLOG_0(PACA_EXGEN)
-       b       hmi_exception_relon_hv
+       b       .       /* Can't happen, see v2.07 Book III-S section 6.5 */
 
        . = 0x4e80
        SET_SCRATCH0(r13)
@@ -835,11 +838,17 @@ vsx_unavailable_relon_pSeries_1:
        EXCEPTION_PROLOG_0(PACA_EXGEN)
        b       vsx_unavailable_relon_pSeries
 
-tm_unavailable_relon_pSeries_1:
+facility_unavailable_relon_trampoline:
        . = 0x4f60
        SET_SCRATCH0(r13)
        EXCEPTION_PROLOG_0(PACA_EXGEN)
-       b       tm_unavailable_relon_pSeries
+       b       facility_unavailable_relon_pSeries
+
+hv_facility_unavailable_relon_trampoline:
+       . = 0x4f80
+       SET_SCRATCH0(r13)
+       EXCEPTION_PROLOG_0(PACA_EXGEN)
+       b       facility_unavailable_relon_hv
 
        STD_RELON_EXCEPTION_PSERIES(0x5300, 0x1300, instruction_breakpoint)
 #ifdef CONFIG_PPC_DENORMALISATION
@@ -1165,36 +1174,21 @@ END_FTR_SECTION_IFSET(CPU_FTR_VSX)
        bl      .vsx_unavailable_exception
        b       .ret_from_except
 
-       .align  7
-       .globl tm_unavailable_common
-tm_unavailable_common:
-       EXCEPTION_PROLOG_COMMON(0xf60, PACA_EXGEN)
-       bl      .save_nvgprs
-       DISABLE_INTS
-       addi    r3,r1,STACK_FRAME_OVERHEAD
-       bl      .tm_unavailable_exception
-       b       .ret_from_except
+       STD_EXCEPTION_COMMON(0xf60, facility_unavailable, .facility_unavailable_exception)
 
        .align  7
        .globl  __end_handlers
 __end_handlers:
 
        /* Equivalents to the above handlers for relocation-on interrupt vectors */
-       STD_RELON_EXCEPTION_HV_OOL(0xe00, h_data_storage)
-       KVM_HANDLER(PACA_EXGEN, EXC_HV, 0xe00)
-       STD_RELON_EXCEPTION_HV_OOL(0xe20, h_instr_storage)
-       KVM_HANDLER(PACA_EXGEN, EXC_HV, 0xe20)
        STD_RELON_EXCEPTION_HV_OOL(0xe40, emulation_assist)
-       KVM_HANDLER(PACA_EXGEN, EXC_HV, 0xe40)
-       STD_RELON_EXCEPTION_HV_OOL(0xe60, hmi_exception)
-       KVM_HANDLER(PACA_EXGEN, EXC_HV, 0xe60)
        MASKABLE_RELON_EXCEPTION_HV_OOL(0xe80, h_doorbell)
-       KVM_HANDLER(PACA_EXGEN, EXC_HV, 0xe80)
 
        STD_RELON_EXCEPTION_PSERIES_OOL(0xf00, performance_monitor)
        STD_RELON_EXCEPTION_PSERIES_OOL(0xf20, altivec_unavailable)
        STD_RELON_EXCEPTION_PSERIES_OOL(0xf40, vsx_unavailable)
-       STD_RELON_EXCEPTION_PSERIES_OOL(0xf60, tm_unavailable)
+       STD_RELON_EXCEPTION_PSERIES_OOL(0xf60, facility_unavailable)
+       STD_RELON_EXCEPTION_HV_OOL(0xf80, facility_unavailable)
 
 #if defined(CONFIG_PPC_PSERIES) || defined(CONFIG_PPC_POWERNV)
 /*
index a949bdfc9623b5bd9bbfca9dfdc0ec2b37214417..1150ae7c22c310f9307ba5867775e37e9560092e 100644 (file)
@@ -250,6 +250,7 @@ int __kprobes hw_breakpoint_handler(struct die_args *args)
         * we still need to single-step the instruction, but we don't
         * generate an event.
         */
+       info->type &= ~HW_BRK_TYPE_EXTRANEOUS_IRQ;
        if (!((bp->attr.bp_addr <= dar) &&
              (dar - bp->attr.bp_addr < bp->attr.bp_len)))
                info->type |= HW_BRK_TYPE_EXTRANEOUS_IRQ;
index 939ea7ef0dc8063615ed9e1a2b5ca63626064d9e..d7216c9abda15110fe903e92a028840fc8131ff3 100644 (file)
@@ -85,7 +85,7 @@ int powersave_nap;
 /*
  * Register the sysctl to set/clear powersave_nap.
  */
-static ctl_table powersave_nap_ctl_table[]={
+static struct ctl_table powersave_nap_ctl_table[] = {
        {
                .procname       = "powersave-nap",
                .data           = &powersave_nap,
@@ -95,7 +95,7 @@ static ctl_table powersave_nap_ctl_table[]={
        },
        {}
 };
-static ctl_table powersave_nap_sysctl_root[] = {
+static struct ctl_table powersave_nap_sysctl_root[] = {
        {
                .procname       = "kernel",
                .mode           = 0555,
index 50e90b7e713993109022db1f7b4d4bf0905ded1f..fa0b54b2a362b737ac6d013326c7eee8071c617f 100644 (file)
@@ -55,6 +55,7 @@ static struct iowa_bus *iowa_pci_find(unsigned long vaddr, unsigned long paddr)
 
 struct iowa_bus *iowa_mem_find_bus(const PCI_IO_ADDR addr)
 {
+       unsigned hugepage_shift;
        struct iowa_bus *bus;
        int token;
 
@@ -70,11 +71,17 @@ struct iowa_bus *iowa_mem_find_bus(const PCI_IO_ADDR addr)
                if (vaddr < PHB_IO_BASE || vaddr >= PHB_IO_END)
                        return NULL;
 
-               ptep = find_linux_pte(init_mm.pgd, vaddr);
+               ptep = find_linux_pte_or_hugepte(init_mm.pgd, vaddr,
+                                                &hugepage_shift);
                if (ptep == NULL)
                        paddr = 0;
-               else
+               else {
+                       /*
+                        * we don't have hugepages backing iomem
+                        */
+                       WARN_ON(hugepage_shift);
                        paddr = pte_pfn(*ptep) << PAGE_SHIFT;
+               }
                bus = iowa_pci_find(vaddr, paddr);
 
                if (bus == NULL)
index c0d0dbddfba1bfa2a6113b36b266f83bf3816e71..b20ff173a6712e88dfa1608de581fbf1d04072e0 100644 (file)
@@ -36,6 +36,8 @@
 #include <linux/hash.h>
 #include <linux/fault-inject.h>
 #include <linux/pci.h>
+#include <linux/iommu.h>
+#include <linux/sched.h>
 #include <asm/io.h>
 #include <asm/prom.h>
 #include <asm/iommu.h>
@@ -44,6 +46,7 @@
 #include <asm/kdump.h>
 #include <asm/fadump.h>
 #include <asm/vio.h>
+#include <asm/tce.h>
 
 #define DBG(...)
 
@@ -724,6 +727,13 @@ void iommu_free_table(struct iommu_table *tbl, const char *node_name)
        if (tbl->it_offset == 0)
                clear_bit(0, tbl->it_map);
 
+#ifdef CONFIG_IOMMU_API
+       if (tbl->it_group) {
+               iommu_group_put(tbl->it_group);
+               BUG_ON(tbl->it_group);
+       }
+#endif
+
        /* verify that table contains no entries */
        if (!bitmap_empty(tbl->it_map, tbl->it_size))
                pr_warn("%s: Unexpected TCEs for %s\n", __func__, node_name);
@@ -860,3 +870,316 @@ void iommu_free_coherent(struct iommu_table *tbl, size_t size,
                free_pages((unsigned long)vaddr, get_order(size));
        }
 }
+
+#ifdef CONFIG_IOMMU_API
+/*
+ * SPAPR TCE API
+ */
+static void group_release(void *iommu_data)
+{
+       struct iommu_table *tbl = iommu_data;
+       tbl->it_group = NULL;
+}
+
+void iommu_register_group(struct iommu_table *tbl,
+               int pci_domain_number, unsigned long pe_num)
+{
+       struct iommu_group *grp;
+       char *name;
+
+       grp = iommu_group_alloc();
+       if (IS_ERR(grp)) {
+               pr_warn("powerpc iommu api: cannot create new group, err=%ld\n",
+                               PTR_ERR(grp));
+               return;
+       }
+       tbl->it_group = grp;
+       iommu_group_set_iommudata(grp, tbl, group_release);
+       name = kasprintf(GFP_KERNEL, "domain%d-pe%lx",
+                       pci_domain_number, pe_num);
+       if (!name)
+               return;
+       iommu_group_set_name(grp, name);
+       kfree(name);
+}
+
+enum dma_data_direction iommu_tce_direction(unsigned long tce)
+{
+       if ((tce & TCE_PCI_READ) && (tce & TCE_PCI_WRITE))
+               return DMA_BIDIRECTIONAL;
+       else if (tce & TCE_PCI_READ)
+               return DMA_TO_DEVICE;
+       else if (tce & TCE_PCI_WRITE)
+               return DMA_FROM_DEVICE;
+       else
+               return DMA_NONE;
+}
+EXPORT_SYMBOL_GPL(iommu_tce_direction);
+
+void iommu_flush_tce(struct iommu_table *tbl)
+{
+       /* Flush/invalidate TLB caches if necessary */
+       if (ppc_md.tce_flush)
+               ppc_md.tce_flush(tbl);
+
+       /* Make sure updates are seen by hardware */
+       mb();
+}
+EXPORT_SYMBOL_GPL(iommu_flush_tce);
+
+int iommu_tce_clear_param_check(struct iommu_table *tbl,
+               unsigned long ioba, unsigned long tce_value,
+               unsigned long npages)
+{
+       /* ppc_md.tce_free() does not support any value but 0 */
+       if (tce_value)
+               return -EINVAL;
+
+       if (ioba & ~IOMMU_PAGE_MASK)
+               return -EINVAL;
+
+       ioba >>= IOMMU_PAGE_SHIFT;
+       if (ioba < tbl->it_offset)
+               return -EINVAL;
+
+       if ((ioba + npages) > (tbl->it_offset + tbl->it_size))
+               return -EINVAL;
+
+       return 0;
+}
+EXPORT_SYMBOL_GPL(iommu_tce_clear_param_check);
+
+int iommu_tce_put_param_check(struct iommu_table *tbl,
+               unsigned long ioba, unsigned long tce)
+{
+       if (!(tce & (TCE_PCI_WRITE | TCE_PCI_READ)))
+               return -EINVAL;
+
+       if (tce & ~(IOMMU_PAGE_MASK | TCE_PCI_WRITE | TCE_PCI_READ))
+               return -EINVAL;
+
+       if (ioba & ~IOMMU_PAGE_MASK)
+               return -EINVAL;
+
+       ioba >>= IOMMU_PAGE_SHIFT;
+       if (ioba < tbl->it_offset)
+               return -EINVAL;
+
+       if ((ioba + 1) > (tbl->it_offset + tbl->it_size))
+               return -EINVAL;
+
+       return 0;
+}
+EXPORT_SYMBOL_GPL(iommu_tce_put_param_check);
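
A minimal user-space sketch of the window check performed by the two param-check helpers above, assuming a 4 KiB IOMMU page (IOMMU_PAGE_SHIFT of 12); the names are illustrative, not the kernel's:

#include <stdbool.h>
#include <stdint.h>

#define MY_IOMMU_PAGE_SHIFT     12
#define MY_IOMMU_PAGE_MASK      (~((1UL << MY_IOMMU_PAGE_SHIFT) - 1))

/* True if "npages" pages starting at bus address "ioba" are page
 * aligned and fall inside the window [offset, offset + size), both
 * expressed in IOMMU pages, mirroring the checks above. */
static bool ioba_in_window(uint64_t ioba, uint64_t npages,
                           uint64_t offset, uint64_t size)
{
        if (ioba & ~MY_IOMMU_PAGE_MASK)
                return false;           /* not page aligned */

        ioba >>= MY_IOMMU_PAGE_SHIFT;
        return ioba >= offset && ioba + npages <= offset + size;
}

int main(void)
{
        /* a 256 MiB window of 4 KiB pages starting at bus address 0x8000000 */
        return ioba_in_window(0x8000000, 1, 0x8000, 0x10000) ? 0 : 1;
}
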
+
+unsigned long iommu_clear_tce(struct iommu_table *tbl, unsigned long entry)
+{
+       unsigned long oldtce;
+       struct iommu_pool *pool = get_pool(tbl, entry);
+
+       spin_lock(&(pool->lock));
+
+       oldtce = ppc_md.tce_get(tbl, entry);
+       if (oldtce & (TCE_PCI_WRITE | TCE_PCI_READ))
+               ppc_md.tce_free(tbl, entry, 1);
+       else
+               oldtce = 0;
+
+       spin_unlock(&(pool->lock));
+
+       return oldtce;
+}
+EXPORT_SYMBOL_GPL(iommu_clear_tce);
+
+int iommu_clear_tces_and_put_pages(struct iommu_table *tbl,
+               unsigned long entry, unsigned long pages)
+{
+       unsigned long oldtce;
+       struct page *page;
+
+       for ( ; pages; --pages, ++entry) {
+               oldtce = iommu_clear_tce(tbl, entry);
+               if (!oldtce)
+                       continue;
+
+               page = pfn_to_page(oldtce >> PAGE_SHIFT);
+               WARN_ON(!page);
+               if (page) {
+                       if (oldtce & TCE_PCI_WRITE)
+                               SetPageDirty(page);
+                       put_page(page);
+               }
+       }
+
+       return 0;
+}
+EXPORT_SYMBOL_GPL(iommu_clear_tces_and_put_pages);
+
+/*
+ * hwaddr is a kernel virtual address here (0xc... bazillion),
+ * tce_build converts it to a physical address.
+ */
+int iommu_tce_build(struct iommu_table *tbl, unsigned long entry,
+               unsigned long hwaddr, enum dma_data_direction direction)
+{
+       int ret = -EBUSY;
+       unsigned long oldtce;
+       struct iommu_pool *pool = get_pool(tbl, entry);
+
+       spin_lock(&(pool->lock));
+
+       oldtce = ppc_md.tce_get(tbl, entry);
+       /* Add new entry if it is not busy */
+       if (!(oldtce & (TCE_PCI_WRITE | TCE_PCI_READ)))
+               ret = ppc_md.tce_build(tbl, entry, 1, hwaddr, direction, NULL);
+
+       spin_unlock(&(pool->lock));
+
+       /* if (unlikely(ret))
+               pr_err("iommu_tce: %s failed on hwaddr=%lx ioba=%lx kva=%lx ret=%d\n",
+                               __func__, hwaddr, entry << IOMMU_PAGE_SHIFT,
+                               hwaddr, ret); */
+
+       return ret;
+}
+EXPORT_SYMBOL_GPL(iommu_tce_build);
+
+int iommu_put_tce_user_mode(struct iommu_table *tbl, unsigned long entry,
+               unsigned long tce)
+{
+       int ret;
+       struct page *page = NULL;
+       unsigned long hwaddr, offset = tce & IOMMU_PAGE_MASK & ~PAGE_MASK;
+       enum dma_data_direction direction = iommu_tce_direction(tce);
+
+       ret = get_user_pages_fast(tce & PAGE_MASK, 1,
+                       direction != DMA_TO_DEVICE, &page);
+       if (unlikely(ret != 1)) {
+               /* pr_err("iommu_tce: get_user_pages_fast failed tce=%lx ioba=%lx ret=%d\n",
+                               tce, entry << IOMMU_PAGE_SHIFT, ret); */
+               return -EFAULT;
+       }
+       hwaddr = (unsigned long) page_address(page) + offset;
+
+       ret = iommu_tce_build(tbl, entry, hwaddr, direction);
+       if (ret)
+               put_page(page);
+
+       if (ret < 0)
+               pr_err("iommu_tce: %s failed ioba=%lx, tce=%lx, ret=%d\n",
+                               __func__, entry << IOMMU_PAGE_SHIFT, tce, ret);
+
+       return ret;
+}
+EXPORT_SYMBOL_GPL(iommu_put_tce_user_mode);
+
+int iommu_take_ownership(struct iommu_table *tbl)
+{
+       unsigned long sz = (tbl->it_size + 7) >> 3;
+
+       if (tbl->it_offset == 0)
+               clear_bit(0, tbl->it_map);
+
+       if (!bitmap_empty(tbl->it_map, tbl->it_size)) {
+               pr_err("iommu_tce: it_map is not empty");
+               return -EBUSY;
+       }
+
+       memset(tbl->it_map, 0xff, sz);
+       iommu_clear_tces_and_put_pages(tbl, tbl->it_offset, tbl->it_size);
+
+       return 0;
+}
+EXPORT_SYMBOL_GPL(iommu_take_ownership);
+
+void iommu_release_ownership(struct iommu_table *tbl)
+{
+       unsigned long sz = (tbl->it_size + 7) >> 3;
+
+       iommu_clear_tces_and_put_pages(tbl, tbl->it_offset, tbl->it_size);
+       memset(tbl->it_map, 0, sz);
+
+       /* Restore bit#0 set by iommu_init_table() */
+       if (tbl->it_offset == 0)
+               set_bit(0, tbl->it_map);
+}
+EXPORT_SYMBOL_GPL(iommu_release_ownership);
+
+static int iommu_add_device(struct device *dev)
+{
+       struct iommu_table *tbl;
+       int ret = 0;
+
+       if (WARN_ON(dev->iommu_group)) {
+               pr_warn("iommu_tce: device %s is already in iommu group %d, skipping\n",
+                               dev_name(dev),
+                               iommu_group_id(dev->iommu_group));
+               return -EBUSY;
+       }
+
+       tbl = get_iommu_table_base(dev);
+       if (!tbl || !tbl->it_group) {
+               pr_debug("iommu_tce: skipping device %s with no tbl\n",
+                               dev_name(dev));
+               return 0;
+       }
+
+       pr_debug("iommu_tce: adding %s to iommu group %d\n",
+                       dev_name(dev), iommu_group_id(tbl->it_group));
+
+       ret = iommu_group_add_device(tbl->it_group, dev);
+       if (ret < 0)
+               pr_err("iommu_tce: %s has not been added, ret=%d\n",
+                               dev_name(dev), ret);
+
+       return ret;
+}
+
+static void iommu_del_device(struct device *dev)
+{
+       iommu_group_remove_device(dev);
+}
+
+static int iommu_bus_notifier(struct notifier_block *nb,
+                             unsigned long action, void *data)
+{
+       struct device *dev = data;
+
+       switch (action) {
+       case BUS_NOTIFY_ADD_DEVICE:
+               return iommu_add_device(dev);
+       case BUS_NOTIFY_DEL_DEVICE:
+               iommu_del_device(dev);
+               return 0;
+       default:
+               return 0;
+       }
+}
+
+static struct notifier_block tce_iommu_bus_nb = {
+       .notifier_call = iommu_bus_notifier,
+};
+
+static int __init tce_iommu_init(void)
+{
+       struct pci_dev *pdev = NULL;
+
+       BUILD_BUG_ON(PAGE_SIZE < IOMMU_PAGE_SIZE);
+
+       for_each_pci_dev(pdev)
+               iommu_add_device(&pdev->dev);
+
+       bus_register_notifier(&pci_bus_type, &tce_iommu_bus_nb);
+       return 0;
+}
+
+subsys_initcall_sync(tce_iommu_init);
+
+#else
+
+void iommu_register_group(struct iommu_table *tbl,
+               int pci_domain_number, unsigned long pe_num)
+{
+}
+
+#endif /* CONFIG_IOMMU_API */
index ea185e0b3cae5800145b0e3ee0388534369f3686..2e51cde616d2de1fc93cdb75547debab72cf66c2 100644 (file)
@@ -116,8 +116,6 @@ static inline notrace int decrementer_check_overflow(void)
        u64 now = get_tb_or_rtc();
        u64 *next_tb = &__get_cpu_var(decrementers_next_tb);
  
-       if (now >= *next_tb)
-               set_dec(1);
        return now >= *next_tb;
 }
 
index 11f5b03a0b06eebc4567d20acf1bc5a5aa9e6545..2156ea90eb54181c84bc5d1dff48b0de80d5e405 100644 (file)
 #include <asm/sstep.h>
 #include <asm/uaccess.h>
 
-#ifdef CONFIG_PPC_ADV_DEBUG_REGS
-#define MSR_SINGLESTEP (MSR_DE)
-#else
-#define MSR_SINGLESTEP (MSR_SE)
-#endif
-
 DEFINE_PER_CPU(struct kprobe *, current_kprobe) = NULL;
 DEFINE_PER_CPU(struct kprobe_ctlblk, kprobe_ctlblk);
 
@@ -104,19 +98,7 @@ void __kprobes arch_remove_kprobe(struct kprobe *p)
 
 static void __kprobes prepare_singlestep(struct kprobe *p, struct pt_regs *regs)
 {
-       /* We turn off async exceptions to ensure that the single step will
-        * be for the instruction we have the kprobe on, if we dont its
-        * possible we'd get the single step reported for an exception handler
-        * like Decrementer or External Interrupt */
-       regs->msr &= ~MSR_EE;
-       regs->msr |= MSR_SINGLESTEP;
-#ifdef CONFIG_PPC_ADV_DEBUG_REGS
-       regs->msr &= ~MSR_CE;
-       mtspr(SPRN_DBCR0, mfspr(SPRN_DBCR0) | DBCR0_IC | DBCR0_IDM);
-#ifdef CONFIG_PPC_47x
-       isync();
-#endif
-#endif
+       enable_single_step(regs);
 
        /*
         * On powerpc we should single step on the original
index 48fbc2b97e952114c45162515ed211a9d40be40a..8213ee1eb05abc39e8da6502ec565b4a38180e7e 100644 (file)
@@ -84,22 +84,30 @@ static ssize_t dev_nvram_read(struct file *file, char __user *buf,
        char *tmp = NULL;
        ssize_t size;
 
-       ret = -ENODEV;
-       if (!ppc_md.nvram_size)
+       if (!ppc_md.nvram_size) {
+               ret = -ENODEV;
                goto out;
+       }
 
-       ret = 0;
        size = ppc_md.nvram_size();
-       if (*ppos >= size || size < 0)
+       if (size < 0) {
+               ret = size;
+               goto out;
+       }
+
+       if (*ppos >= size) {
+               ret = 0;
                goto out;
+       }
 
        count = min_t(size_t, count, size - *ppos);
        count = min(count, PAGE_SIZE);
 
-       ret = -ENOMEM;
        tmp = kmalloc(count, GFP_KERNEL);
-       if (!tmp)
+       if (!tmp) {
+               ret = -ENOMEM;
                goto out;
+       }
 
        ret = ppc_md.nvram_read(tmp, count, ppos);
        if (ret <= 0)
diff --git a/arch/powerpc/kernel/pci-hotplug.c b/arch/powerpc/kernel/pci-hotplug.c
new file mode 100644 (file)
index 0000000..3f60880
--- /dev/null
@@ -0,0 +1,111 @@
+/*
+ * Derived from "arch/powerpc/platforms/pseries/pci_dlpar.c"
+ *
+ * Copyright (C) 2003 Linda Xie <lxie@us.ibm.com>
+ * Copyright (C) 2005 International Business Machines
+ *
+ * Updates, 2005, John Rose <johnrose@austin.ibm.com>
+ * Updates, 2005, Linas Vepstas <linas@austin.ibm.com>
+ * Updates, 2013, Gavin Shan <shangw@linux.vnet.ibm.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ */
+
+#include <linux/pci.h>
+#include <linux/export.h>
+#include <asm/pci-bridge.h>
+#include <asm/ppc-pci.h>
+#include <asm/firmware.h>
+#include <asm/eeh.h>
+
+/**
+ * __pcibios_remove_pci_devices - remove all devices under this bus
+ * @bus: the indicated PCI bus
+ * @purge_pe: destroy the PE on removal of PCI devices
+ *
+ * Remove all of the PCI devices under this bus both from the
+ * linux pci device tree, and from the powerpc EEH address cache.
+ * By default, the corresponding PE will be destroyed during the
+ * normal PCI hotplug path. For PCI hotplug during EEH recovery,
+ * the corresponding PE won't be destroyed and deallocated.
+ */
+void __pcibios_remove_pci_devices(struct pci_bus *bus, int purge_pe)
+{
+       struct pci_dev *dev, *tmp;
+       struct pci_bus *child_bus;
+
+       /* First go down child busses */
+       list_for_each_entry(child_bus, &bus->children, node)
+               __pcibios_remove_pci_devices(child_bus, purge_pe);
+
+       pr_debug("PCI: Removing devices on bus %04x:%02x\n",
+                pci_domain_nr(bus),  bus->number);
+       list_for_each_entry_safe(dev, tmp, &bus->devices, bus_list) {
+               pr_debug("     * Removing %s...\n", pci_name(dev));
+               eeh_remove_bus_device(dev, purge_pe);
+               pci_stop_and_remove_bus_device(dev);
+       }
+}
+
+/**
+ * pcibios_remove_pci_devices - remove all devices under this bus
+ * @bus: the indicated PCI bus
+ *
+ * Remove all of the PCI devices under this bus both from the
+ * linux pci device tree, and from the powerpc EEH address cache.
+ */
+void pcibios_remove_pci_devices(struct pci_bus *bus)
+{
+       __pcibios_remove_pci_devices(bus, 1);
+}
+EXPORT_SYMBOL_GPL(pcibios_remove_pci_devices);
+
+/**
+ * pcibios_add_pci_devices - adds new pci devices to bus
+ * @bus: the indicated PCI bus
+ *
+ * This routine will find and fixup new pci devices under
+ * the indicated bus. This routine presumes that there
+ * might already be some devices under this bridge, so
+ * it carefully tries to add only new devices.  (And that
+ * is how this routine differs from other, similar pcibios
+ * routines.)
+ */
+void pcibios_add_pci_devices(struct pci_bus * bus)
+{
+       int slotno, num, mode, pass, max;
+       struct pci_dev *dev;
+       struct device_node *dn = pci_bus_to_OF_node(bus);
+
+       eeh_add_device_tree_early(dn);
+
+       mode = PCI_PROBE_NORMAL;
+       if (ppc_md.pci_probe_mode)
+               mode = ppc_md.pci_probe_mode(bus);
+
+       if (mode == PCI_PROBE_DEVTREE) {
+               /* use ofdt-based probe */
+               of_rescan_bus(dn, bus);
+       } else if (mode == PCI_PROBE_NORMAL) {
+               /* use legacy probe */
+               slotno = PCI_SLOT(PCI_DN(dn->child)->devfn);
+               num = pci_scan_slot(bus, PCI_DEVFN(slotno, 0));
+               if (!num)
+                       return;
+               pcibios_setup_bus_devices(bus);
+               max = bus->busn_res.start;
+               for (pass = 0; pass < 2; pass++) {
+                       list_for_each_entry(dev, &bus->devices, bus_list) {
+                               if (dev->hdr_type == PCI_HEADER_TYPE_BRIDGE ||
+                                   dev->hdr_type == PCI_HEADER_TYPE_CARDBUS)
+                                       max = pci_scan_bridge(bus, dev,
+                                                             max, pass);
+                       }
+               }
+       }
+       pcibios_finish_adding_to_bus(bus);
+}
+EXPORT_SYMBOL_GPL(pcibios_add_pci_devices);
index 076d1242507a7fdcf32c374b66b3b005eb4e0db9..c517dbe705fdd5fd5944ae693c04aaaf52510e85 100644 (file)
@@ -916,7 +916,11 @@ int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src)
        flush_altivec_to_thread(src);
        flush_vsx_to_thread(src);
        flush_spe_to_thread(src);
+
        *dst = *src;
+
+       clear_task_ebb(dst);
+
        return 0;
 }
 
index 8b6f7a99cce2ba1af5e8a38a15b515d78804735f..9c753bc9885d9dd1d7728ce2a3de9baa2de04058 100644 (file)
@@ -559,6 +559,33 @@ void __init early_init_dt_setup_initrd_arch(unsigned long start,
 }
 #endif
 
+static bool __init early_reserve_mem_dt(void)
+{
+       unsigned long i, len, dt_root;
+       const __be32 *prop;
+
+       dt_root = of_get_flat_dt_root();
+
+       prop = of_get_flat_dt_prop(dt_root, "reserved-ranges", &len);
+
+       if (!prop)
+               return false;
+
+       /* Each reserved range is an (address,size) pair, 2 cells each,
+        * totalling 4 cells per range. */
+       for (i = 0; i < len / (sizeof(*prop) * 4); i++) {
+               u64 base, size;
+
+               base = of_read_number(prop + (i * 4) + 0, 2);
+               size = of_read_number(prop + (i * 4) + 2, 2);
+
+               if (size)
+                       memblock_reserve(base, size);
+       }
+
+       return true;
+}
+
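Each reserved range in the property is four 32-bit cells: two for the address and two for the size. A stand-alone sketch of the same walk, assuming the cells have already been converted to host byte order (names and values are illustrative):

#include <stdint.h>
#include <stdio.h>

/* Combine two 32-bit cells into one 64-bit value, as of_read_number()
 * does for the flattened device tree (cells here are assumed to be in
 * host byte order already). */
static uint64_t read_2_cells(const uint32_t *cell)
{
        return ((uint64_t)cell[0] << 32) | cell[1];
}

int main(void)
{
        /* one hypothetical range: base 0x40000000, size 0x01000000 */
        uint32_t prop[] = { 0x0, 0x40000000, 0x0, 0x01000000 };
        size_t i, nranges = sizeof(prop) / sizeof(prop[0]) / 4;

        for (i = 0; i < nranges; i++) {
                uint64_t base = read_2_cells(prop + i * 4 + 0);
                uint64_t size = read_2_cells(prop + i * 4 + 2);

                printf("reserve base=0x%llx size=0x%llx\n",
                       (unsigned long long)base, (unsigned long long)size);
        }
        return 0;
}
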
 static void __init early_reserve_mem(void)
 {
        u64 base, size;
@@ -574,6 +601,14 @@ static void __init early_reserve_mem(void)
        self_size = initial_boot_params->totalsize;
        memblock_reserve(self_base, self_size);
 
+       /*
+        * Try looking for the reserved-ranges property in the DT first; if
+        * it's present, it'll contain all of the necessary reservation
+        * info
+        */
+       if (early_reserve_mem_dt())
+               return;
+
 #ifdef CONFIG_BLK_DEV_INITRD
        /* then reserve the initrd, if any */
        if (initrd_start && (initrd_end > initrd_start))
index 98c2fc198712aabe3f90055e24d653df604be78e..64f7bd5b1b0f59bcf23fa47c22eb012a5a440b16 100644 (file)
@@ -1449,7 +1449,9 @@ static long ppc_set_hwdebug(struct task_struct *child,
         */
        if (bp_info->addr_mode == PPC_BREAKPOINT_MODE_RANGE_INCLUSIVE) {
                len = bp_info->addr2 - bp_info->addr;
-       } else if (bp_info->addr_mode != PPC_BREAKPOINT_MODE_EXACT) {
+       } else if (bp_info->addr_mode == PPC_BREAKPOINT_MODE_EXACT)
+               len = 1;
+       else {
                ptrace_put_breakpoints(child);
                return -EINVAL;
        }
index ef46ba6e094fa6f017d1e0ae38711e62dded190f..f366fedb08723036187657446bb1ac15d570d920 100644 (file)
@@ -166,7 +166,7 @@ ha16:
        /* R_PPC_ADDR16_LO */
 lo16:
        cmpwi   r4, R_PPC_ADDR16_LO
-       bne     nxtrela
+       bne     unknown_type
        lwz     r4, 0(r9)       /* r_offset */
        lwz     r0, 8(r9)       /* r_addend */
        add     r0, r0, r3
@@ -191,6 +191,7 @@ nxtrela:
        dcbst   r4,r7
        sync                    /* Ensure the data is flushed before icbi */
        icbi    r4,r7
+unknown_type:
        cmpwi   r8, 0           /* relasz = 0 ? */
        ble     done
        add     r9, r9, r6      /* move to next entry in the .rela table */
index 52add6f3e201e1c196ea493f40a8fccc2cc922bc..80b5ef403f685d177c0c974eaaf443b615cd8c75 100644 (file)
@@ -1172,7 +1172,7 @@ int __init early_init_dt_scan_rtas(unsigned long node,
 static arch_spinlock_t timebase_lock;
 static u64 timebase = 0;
 
-void __cpuinit rtas_give_timebase(void)
+void rtas_give_timebase(void)
 {
        unsigned long flags;
 
@@ -1189,7 +1189,7 @@ void __cpuinit rtas_give_timebase(void)
        local_irq_restore(flags);
 }
 
-void __cpuinit rtas_take_timebase(void)
+void rtas_take_timebase(void)
 {
        while (!timebase)
                barrier();
index e379d3fd16948cff85732eb5287dfc49557e9edc..389fb8077cc9cea25746b12497673dd573d35c56 100644 (file)
@@ -76,7 +76,7 @@
 #endif
 
 int boot_cpuid = 0;
-int __initdata spinning_secondaries;
+int spinning_secondaries;
 u64 ppc64_pft_size;
 
 /* Pick defaults since we might want to patch instructions
index 201385c3a1ae186f9de8102d0f5a8d645c36a6d4..0f83122e6676cf8dd8692d3e64f72a2c961f34bb 100644 (file)
@@ -407,7 +407,8 @@ inline unsigned long copy_transact_fpr_from_user(struct task_struct *task,
  * altivec/spe instructions at some point.
  */
 static int save_user_regs(struct pt_regs *regs, struct mcontext __user *frame,
-               int sigret, int ctx_has_vsx_region)
+                         struct mcontext __user *tm_frame, int sigret,
+                         int ctx_has_vsx_region)
 {
        unsigned long msr = regs->msr;
 
@@ -475,6 +476,12 @@ static int save_user_regs(struct pt_regs *regs, struct mcontext __user *frame,
 
        if (__put_user(msr, &frame->mc_gregs[PT_MSR]))
                return 1;
+       /* We need to write 0 to the top 32 bits of the MSR in the tm frame
+        * so that we can check it on restore to see if TM is active
+        */
+       if (tm_frame && __put_user(0, &tm_frame->mc_gregs[PT_MSR]))
+               return 1;
+
        if (sigret) {
                /* Set up the sigreturn trampoline: li r0,sigret; sc */
                if (__put_user(0x38000000UL + sigret, &frame->tramp[0])
@@ -747,7 +754,7 @@ static long restore_tm_user_regs(struct pt_regs *regs,
                                 struct mcontext __user *tm_sr)
 {
        long err;
-       unsigned long msr;
+       unsigned long msr, msr_hi;
 #ifdef CONFIG_VSX
        int i;
 #endif
@@ -852,8 +859,11 @@ static long restore_tm_user_regs(struct pt_regs *regs,
        tm_enable();
        /* This loads the checkpointed FP/VEC state, if used */
        tm_recheckpoint(&current->thread, msr);
-       /* The task has moved into TM state S, so ensure MSR reflects this */
-       regs->msr = (regs->msr & ~MSR_TS_MASK) | MSR_TS_S;
+       /* Get the top half of the MSR */
+       if (__get_user(msr_hi, &tm_sr->mc_gregs[PT_MSR]))
+               return 1;
+       /* Pull in MSR TM from user context */
+       regs->msr = (regs->msr & ~MSR_TS_MASK) | ((msr_hi<<32) & MSR_TS_MASK);
 
        /* This loads the speculative FP/VEC state, if used */
        if (msr & MSR_FP) {
@@ -952,6 +962,7 @@ int handle_rt_signal32(unsigned long sig, struct k_sigaction *ka,
 {
        struct rt_sigframe __user *rt_sf;
        struct mcontext __user *frame;
+       struct mcontext __user *tm_frame = NULL;
        void __user *addr;
        unsigned long newsp = 0;
        int sigret;
@@ -985,23 +996,24 @@ int handle_rt_signal32(unsigned long sig, struct k_sigaction *ka,
        }
 
 #ifdef CONFIG_PPC_TRANSACTIONAL_MEM
+       tm_frame = &rt_sf->uc_transact.uc_mcontext;
        if (MSR_TM_ACTIVE(regs->msr)) {
-               if (save_tm_user_regs(regs, &rt_sf->uc.uc_mcontext,
-                                     &rt_sf->uc_transact.uc_mcontext, sigret))
+               if (save_tm_user_regs(regs, frame, tm_frame, sigret))
                        goto badframe;
        }
        else
 #endif
-               if (save_user_regs(regs, frame, sigret, 1))
+       {
+               if (save_user_regs(regs, frame, tm_frame, sigret, 1))
                        goto badframe;
+       }
        regs->link = tramp;
 
 #ifdef CONFIG_PPC_TRANSACTIONAL_MEM
        if (MSR_TM_ACTIVE(regs->msr)) {
                if (__put_user((unsigned long)&rt_sf->uc_transact,
                               &rt_sf->uc.uc_link)
-                   || __put_user(to_user_ptr(&rt_sf->uc_transact.uc_mcontext),
-                                 &rt_sf->uc_transact.uc_regs))
+                   || __put_user((unsigned long)tm_frame, &rt_sf->uc_transact.uc_regs))
                        goto badframe;
        }
        else
@@ -1170,7 +1182,7 @@ long sys_swapcontext(struct ucontext __user *old_ctx,
                mctx = (struct mcontext __user *)
                        ((unsigned long) &old_ctx->uc_mcontext & ~0xfUL);
                if (!access_ok(VERIFY_WRITE, old_ctx, ctx_size)
-                   || save_user_regs(regs, mctx, 0, ctx_has_vsx_region)
+                   || save_user_regs(regs, mctx, NULL, 0, ctx_has_vsx_region)
                    || put_sigset_t(&old_ctx->uc_sigmask, &current->blocked)
                    || __put_user(to_user_ptr(mctx), &old_ctx->uc_regs))
                        return -EFAULT;
@@ -1233,7 +1245,7 @@ long sys_rt_sigreturn(int r3, int r4, int r5, int r6, int r7, int r8,
                if (__get_user(msr_hi, &mcp->mc_gregs[PT_MSR]))
                        goto bad;
 
-               if (MSR_TM_SUSPENDED(msr_hi<<32)) {
+               if (MSR_TM_ACTIVE(msr_hi<<32)) {
                        /* We only recheckpoint on return if we're
                         * in a transaction.
                         */
@@ -1392,6 +1404,7 @@ int handle_signal32(unsigned long sig, struct k_sigaction *ka,
 {
        struct sigcontext __user *sc;
        struct sigframe __user *frame;
+       struct mcontext __user *tm_mctx = NULL;
        unsigned long newsp = 0;
        int sigret;
        unsigned long tramp;
@@ -1425,6 +1438,7 @@ int handle_signal32(unsigned long sig, struct k_sigaction *ka,
        }
 
 #ifdef CONFIG_PPC_TRANSACTIONAL_MEM
+       tm_mctx = &frame->mctx_transact;
        if (MSR_TM_ACTIVE(regs->msr)) {
                if (save_tm_user_regs(regs, &frame->mctx, &frame->mctx_transact,
                                      sigret))
@@ -1432,8 +1446,10 @@ int handle_signal32(unsigned long sig, struct k_sigaction *ka,
        }
        else
 #endif
-               if (save_user_regs(regs, &frame->mctx, sigret, 1))
+       {
+               if (save_user_regs(regs, &frame->mctx, tm_mctx, sigret, 1))
                        goto badframe;
+       }
 
        regs->link = tramp;
 
@@ -1481,16 +1497,22 @@ badframe:
 long sys_sigreturn(int r3, int r4, int r5, int r6, int r7, int r8,
                       struct pt_regs *regs)
 {
+       struct sigframe __user *sf;
        struct sigcontext __user *sc;
        struct sigcontext sigctx;
        struct mcontext __user *sr;
        void __user *addr;
        sigset_t set;
+#ifdef CONFIG_PPC_TRANSACTIONAL_MEM
+       struct mcontext __user *mcp, *tm_mcp;
+       unsigned long msr_hi;
+#endif
 
        /* Always make any pending restarted system calls return -EINTR */
        current_thread_info()->restart_block.fn = do_no_restart_syscall;
 
-       sc = (struct sigcontext __user *)(regs->gpr[1] + __SIGNAL_FRAMESIZE);
+       sf = (struct sigframe __user *)(regs->gpr[1] + __SIGNAL_FRAMESIZE);
+       sc = &sf->sctx;
        addr = sc;
        if (copy_from_user(&sigctx, sc, sizeof(sigctx)))
                goto badframe;
@@ -1507,11 +1529,25 @@ long sys_sigreturn(int r3, int r4, int r5, int r6, int r7, int r8,
 #endif
        set_current_blocked(&set);
 
-       sr = (struct mcontext __user *)from_user_ptr(sigctx.regs);
-       addr = sr;
-       if (!access_ok(VERIFY_READ, sr, sizeof(*sr))
-           || restore_user_regs(regs, sr, 1))
+#ifdef CONFIG_PPC_TRANSACTIONAL_MEM
+       mcp = (struct mcontext __user *)&sf->mctx;
+       tm_mcp = (struct mcontext __user *)&sf->mctx_transact;
+       if (__get_user(msr_hi, &tm_mcp->mc_gregs[PT_MSR]))
                goto badframe;
+       if (MSR_TM_ACTIVE(msr_hi<<32)) {
+               if (!cpu_has_feature(CPU_FTR_TM))
+                       goto badframe;
+               if (restore_tm_user_regs(regs, mcp, tm_mcp))
+                       goto badframe;
+       } else
+#endif
+       {
+               sr = (struct mcontext __user *)from_user_ptr(sigctx.regs);
+               addr = sr;
+               if (!access_ok(VERIFY_READ, sr, sizeof(*sr))
+                   || restore_user_regs(regs, sr, 1))
+                       goto badframe;
+       }
 
        set_thread_flag(TIF_RESTOREALL);
        return 0;
index 345947367ec00a4fa440e162005864d9708eae6a..887e99d85bc270eefeb949d30dcf4d45fd3d3647 100644 (file)
@@ -410,6 +410,10 @@ static long restore_tm_sigcontexts(struct pt_regs *regs,
 
        /* get MSR separately, transfer the LE bit if doing signal return */
        err |= __get_user(msr, &sc->gp_regs[PT_MSR]);
+       /* pull in MSR TM from user context */
+       regs->msr = (regs->msr & ~MSR_TS_MASK) | (msr & MSR_TS_MASK);
+
+       /* pull in MSR LE from user context */
        regs->msr = (regs->msr & ~MSR_LE) | (msr & MSR_LE);
 
        /* The following non-GPR non-FPR non-VR state is also checkpointed: */
@@ -505,8 +509,6 @@ static long restore_tm_sigcontexts(struct pt_regs *regs,
        tm_enable();
        /* This loads the checkpointed FP/VEC state, if used */
        tm_recheckpoint(&current->thread, msr);
-       /* The task has moved into TM state S, so ensure MSR reflects this: */
-       regs->msr = (regs->msr & ~MSR_TS_MASK) | __MASK(33);
 
        /* This loads the speculative FP/VEC state, if used */
        if (msr & MSR_FP) {
@@ -654,7 +656,7 @@ int sys_rt_sigreturn(unsigned long r3, unsigned long r4, unsigned long r5,
 #ifdef CONFIG_PPC_TRANSACTIONAL_MEM
        if (__get_user(msr, &uc->uc_mcontext.gp_regs[PT_MSR]))
                goto badframe;
-       if (MSR_TM_SUSPENDED(msr)) {
+       if (MSR_TM_ACTIVE(msr)) {
                /* We recheckpoint on return. */
                struct ucontext __user *uc_transact;
                if (__get_user(uc_transact, &uc->uc_link))
index ee7ac5e6e28ac82a1693b24b7c0857a2e26e94ea..38b0ba65a73566238c3e4dbf92bc9f9d33bdf83d 100644 (file)
@@ -480,7 +480,7 @@ static void cpu_idle_thread_init(unsigned int cpu, struct task_struct *idle)
        secondary_ti = current_set[cpu] = ti;
 }
 
-int __cpuinit __cpu_up(unsigned int cpu, struct task_struct *tidle)
+int __cpu_up(unsigned int cpu, struct task_struct *tidle)
 {
        int rc, c;
 
@@ -610,7 +610,7 @@ static struct device_node *cpu_to_l2cache(int cpu)
 }
 
 /* Activate a secondary processor. */
-__cpuinit void start_secondary(void *unused)
+void start_secondary(void *unused)
 {
        unsigned int cpu = smp_processor_id();
        struct device_node *l2_cache;
@@ -637,12 +637,10 @@ __cpuinit void start_secondary(void *unused)
 
        vdso_getcpu_init();
 #endif
-       notify_cpu_starting(cpu);
-       set_cpu_online(cpu, true);
        /* Update sibling maps */
        base = cpu_first_thread_sibling(cpu);
        for (i = 0; i < threads_per_core; i++) {
-               if (cpu_is_offline(base + i))
+               if (cpu_is_offline(base + i) && (cpu != base + i))
                        continue;
                cpumask_set_cpu(cpu, cpu_sibling_mask(base + i));
                cpumask_set_cpu(base + i, cpu_sibling_mask(cpu));
@@ -667,6 +665,10 @@ __cpuinit void start_secondary(void *unused)
        }
        of_node_put(l2_cache);
 
+       smp_wmb();
+       notify_cpu_starting(cpu);
+       set_cpu_online(cpu, true);
+
        local_irq_enable();
 
        cpu_startup_entry(CPUHP_ONLINE);
index e68a84568b8bcbce38815dff9789de604701b185..27a90b99ef6744d20fa6664bb1c6244626a5cd0d 100644 (file)
@@ -341,7 +341,7 @@ static struct device_attribute pa6t_attrs[] = {
 #endif /* HAS_PPC_PMC_PA6T */
 #endif /* HAS_PPC_PMC_CLASSIC */
 
-static void __cpuinit register_cpu_online(unsigned int cpu)
+static void register_cpu_online(unsigned int cpu)
 {
        struct cpu *c = &per_cpu(cpu_devices, cpu);
        struct device *s = &c->dev;
@@ -502,7 +502,7 @@ ssize_t arch_cpu_release(const char *buf, size_t count)
 
 #endif /* CONFIG_HOTPLUG_CPU */
 
-static int __cpuinit sysfs_cpu_notify(struct notifier_block *self,
+static int sysfs_cpu_notify(struct notifier_block *self,
                                      unsigned long action, void *hcpu)
 {
        unsigned int cpu = (unsigned int)(long)hcpu;
@@ -522,7 +522,7 @@ static int __cpuinit sysfs_cpu_notify(struct notifier_block *self,
        return NOTIFY_OK;
 }
 
-static struct notifier_block __cpuinitdata sysfs_cpu_nb = {
+static struct notifier_block sysfs_cpu_nb = {
        .notifier_call  = sysfs_cpu_notify,
 };
 
index 5fc29ad7e26fad673cd1d6732d9f6fd5586a42c7..65ab9e9093772e63d74e8c9b22ef00089718d6e8 100644 (file)
@@ -631,7 +631,6 @@ static int __init get_freq(char *name, int cells, unsigned long *val)
        return found;
 }
 
-/* should become __cpuinit when secondary_cpu_time_init also is */
 void start_cpu_decrementer(void)
 {
 #if defined(CONFIG_BOOKE) || defined(CONFIG_40x)
index 2da67e7a16d58650e10a0b6113b705bd88a6f2d3..51be8fb2480313610b5971b6ef70a89f58f794a2 100644 (file)
@@ -112,9 +112,18 @@ _GLOBAL(tm_reclaim)
        std     r3, STACK_PARAM(0)(r1)
        SAVE_NVGPRS(r1)
 
+       /* We need to set up the MSR for VSX register save instructions.  Here we
+        * also clear the MSR RI since when we do the treclaim, we won't have a
+        * valid kernel pointer for a while.  We clear RI here as it avoids
+        * adding another mtmsr closer to the treclaim.  This makes the region
+        * marked as non-recoverable wider than it needs to be but it saves on
+        * inserting another mtmsrd later.
+        */
        mfmsr   r14
        mr      r15, r14
        ori     r15, r15, MSR_FP
+       li      r16, MSR_RI
+       andc    r15, r15, r16
        oris    r15, r15, MSR_VEC@h
 #ifdef CONFIG_VSX
        BEGIN_FTR_SECTION
@@ -349,9 +358,10 @@ restore_gprs:
        mtcr    r5
        mtxer   r6
 
-       /* MSR and flags:  We don't change CRs, and we don't need to alter
-        * MSR.
+       /* Clear the MSR RI since we are about to change R1.  EE is already off
         */
+       li      r4, 0
+       mtmsrd  r4, 1
 
        REST_4GPRS(0, r7)                       /* GPR0-3 */
        REST_GPR(4, r7)                         /* GPR4-6 */
@@ -377,6 +387,10 @@ restore_gprs:
        GET_PACA(r13)
        GET_SCRATCH0(r1)
 
+       /* R1 is restored, so we are recoverable again.  EE is still off */
+       li      r4, MSR_RI
+       mtmsrd  r4, 1
+
        REST_NVGPRS(r1)
 
        addi    r1, r1, TM_FRAME_SIZE
index c0e5caf8ccc72c0f7624b1e7dc9c3f7ecfe4b242..bf33c22e38a40848221762497ca1a7b704a27f9f 100644 (file)
@@ -866,6 +866,10 @@ static int emulate_string_inst(struct pt_regs *regs, u32 instword)
                u8 val;
                u32 shift = 8 * (3 - (pos & 0x3));
 
+               /* if process is 32-bit, clear upper 32 bits of EA */
+               if ((regs->msr & MSR_64BIT) == 0)
+                       EA &= 0xFFFFFFFF;
+
                switch ((instword & PPC_INST_STRING_MASK)) {
                        case PPC_INST_LSWX:
                        case PPC_INST_LSWI:
@@ -1125,7 +1129,17 @@ void __kprobes program_check_exception(struct pt_regs *regs)
         * ESR_DST (!?) or 0.  In the process of chasing this with the
         * hardware people - not sure if it can happen on any illegal
         * instruction or only on FP instructions, whether there is a
-        * pattern to occurrences etc. -dgibson 31/Mar/2003 */
+        * pattern to occurrences etc. -dgibson 31/Mar/2003
+        */
+
+       /*
+        * If we support a HW FPU, we need to ensure the FP state
+        * is flushed into the thread_struct before attempting
+        * emulation
+        */
+#ifdef CONFIG_PPC_FPU
+       flush_fp_to_thread(current);
+#endif
        switch (do_mathemu(regs)) {
        case 0:
                emulate_single_step(regs);
@@ -1282,25 +1296,50 @@ void vsx_unavailable_exception(struct pt_regs *regs)
        die("Unrecoverable VSX Unavailable Exception", regs, SIGABRT);
 }
 
-void tm_unavailable_exception(struct pt_regs *regs)
+void facility_unavailable_exception(struct pt_regs *regs)
 {
+       static char *facility_strings[] = {
+               "FPU",
+               "VMX/VSX",
+               "DSCR",
+               "PMU SPRs",
+               "BHRB",
+               "TM",
+               "AT",
+               "EBB",
+               "TAR",
+       };
+       char *facility, *prefix;
+       u64 value;
+
+       if (regs->trap == 0xf60) {
+               value = mfspr(SPRN_FSCR);
+               prefix = "";
+       } else {
+               value = mfspr(SPRN_HFSCR);
+               prefix = "Hypervisor ";
+       }
+
+       value = value >> 56;
+
        /* We restore the interrupt state now */
        if (!arch_irq_disabled_regs(regs))
                local_irq_enable();
 
-       /* Currently we never expect a TMU exception.  Catch
-        * this and kill the process!
-        */
-       printk(KERN_EMERG "Unexpected TM unavailable exception at %lx "
-              "(msr %lx)\n",
-              regs->nip, regs->msr);
+       if (value < ARRAY_SIZE(facility_strings))
+               facility = facility_strings[value];
+       else
+               facility = "unknown";
+
+       pr_err("%sFacility '%s' unavailable, exception at 0x%lx, MSR=%lx\n",
+               prefix, facility, regs->nip, regs->msr);
 
        if (user_mode(regs)) {
                _exception(SIGILL, regs, ILL_ILLOPC, regs->nip);
                return;
        }
 
-       die("Unexpected TM unavailable exception", regs, SIGABRT);
+       die("Unexpected facility unavailable exception", regs, SIGABRT);
 }
 
 #ifdef CONFIG_PPC_TRANSACTIONAL_MEM
@@ -1396,8 +1435,7 @@ void performance_monitor_exception(struct pt_regs *regs)
 void SoftwareEmulation(struct pt_regs *regs)
 {
        extern int do_mathemu(struct pt_regs *);
-       extern int Soft_emulate_8xx(struct pt_regs *);
-#if defined(CONFIG_MATH_EMULATION) || defined(CONFIG_8XX_MINIMAL_FPEMU)
+#if defined(CONFIG_MATH_EMULATION)
        int errcode;
 #endif
 
@@ -1430,23 +1468,6 @@ void SoftwareEmulation(struct pt_regs *regs)
                _exception(SIGILL, regs, ILL_ILLOPC, regs->nip);
                return;
        }
-
-#elif defined(CONFIG_8XX_MINIMAL_FPEMU)
-       errcode = Soft_emulate_8xx(regs);
-       if (errcode >= 0)
-               PPC_WARN_EMULATED(8xx, regs);
-
-       switch (errcode) {
-       case 0:
-               emulate_single_step(regs);
-               return;
-       case 1:
-               _exception(SIGILL, regs, ILL_ILLOPC, regs->nip);
-               return;
-       case -EFAULT:
-               _exception(SIGSEGV, regs, SEGV_MAPERR, regs->nip);
-               return;
-       }
 #else
        _exception(SIGILL, regs, ILL_ILLOPC, regs->nip);
 #endif
@@ -1796,8 +1817,6 @@ struct ppc_emulated ppc_emulated = {
        WARN_EMULATED_SETUP(unaligned),
 #ifdef CONFIG_MATH_EMULATION
        WARN_EMULATED_SETUP(math),
-#elif defined(CONFIG_8XX_MINIMAL_FPEMU)
-       WARN_EMULATED_SETUP(8xx),
 #endif
 #ifdef CONFIG_VSX
        WARN_EMULATED_SETUP(vsx),
index 9d3fdcd66290b79c6200cd853a04eea73be1a3d0..a15837519dca45474a141fc6328eee130bafd779 100644 (file)
@@ -50,7 +50,7 @@ void __init udbg_early_init(void)
        udbg_init_debug_beat();
 #elif defined(CONFIG_PPC_EARLY_DEBUG_PAS_REALMODE)
        udbg_init_pas_realmode();
-#elif defined(CONFIG_BOOTX_TEXT)
+#elif defined(CONFIG_PPC_EARLY_DEBUG_BOOTX)
        udbg_init_btext();
 #elif defined(CONFIG_PPC_EARLY_DEBUG_44x)
        /* PPC44x debug */
index d4f463ac65b1ea6d1dd00e3f52925745f7c136e2..1d9c92621b36f4be1eeaa0dcf5ebab5c926c9259 100644 (file)
@@ -711,7 +711,7 @@ static void __init vdso_setup_syscall_map(void)
 }
 
 #ifdef CONFIG_PPC64
-int __cpuinit vdso_getcpu_init(void)
+int vdso_getcpu_init(void)
 {
        unsigned long cpu, node, val;
 
index 3a9a1aceb14f576e8201164343b7d2b220cf553a..176d3fd53b733ef7d15681af6b3691502a55bb05 100644 (file)
@@ -34,7 +34,7 @@
 void kvmppc_mmu_invalidate_pte(struct kvm_vcpu *vcpu, struct hpte_cache *pte)
 {
        ppc_md.hpte_invalidate(pte->slot, pte->host_vpn,
-                              MMU_PAGE_4K, MMU_SEGSIZE_256M,
+                              MMU_PAGE_4K, MMU_PAGE_4K, MMU_SEGSIZE_256M,
                               false);
 }
 
index 5880dfb31074895816af634620e736cd51985bc2..710d31317d812efe73086e33a89bcdced65ba349 100644 (file)
@@ -675,6 +675,7 @@ int kvmppc_book3s_hv_page_fault(struct kvm_run *run, struct kvm_vcpu *vcpu,
                }
                /* if the guest wants write access, see if that is OK */
                if (!writing && hpte_is_writable(r)) {
+                       unsigned int hugepage_shift;
                        pte_t *ptep, pte;
 
                        /*
@@ -683,9 +684,10 @@ int kvmppc_book3s_hv_page_fault(struct kvm_run *run, struct kvm_vcpu *vcpu,
                         */
                        rcu_read_lock_sched();
                        ptep = find_linux_pte_or_hugepte(current->mm->pgd,
-                                                        hva, NULL);
-                       if (ptep && pte_present(*ptep)) {
-                               pte = kvmppc_read_update_linux_pte(ptep, 1);
+                                                        hva, &hugepage_shift);
+                       if (ptep) {
+                               pte = kvmppc_read_update_linux_pte(ptep, 1,
+                                                          hugepage_shift);
                                if (pte_write(pte))
                                        write_ok = 1;
                        }
index 6dcbb49105a4667353ee745d4a3ea6c23db72082..fc25689a9f35076e61d83ca024a08e2bdf7564c1 100644 (file)
@@ -27,7 +27,7 @@ static void *real_vmalloc_addr(void *x)
        unsigned long addr = (unsigned long) x;
        pte_t *p;
 
-       p = find_linux_pte(swapper_pg_dir, addr);
+       p = find_linux_pte_or_hugepte(swapper_pg_dir, addr, NULL);
        if (!p || !pte_present(*p))
                return NULL;
        /* assume we don't have huge pages in vmalloc space... */
@@ -139,20 +139,18 @@ static pte_t lookup_linux_pte(pgd_t *pgdir, unsigned long hva,
 {
        pte_t *ptep;
        unsigned long ps = *pte_sizep;
-       unsigned int shift;
+       unsigned int hugepage_shift;
 
-       ptep = find_linux_pte_or_hugepte(pgdir, hva, &shift);
+       ptep = find_linux_pte_or_hugepte(pgdir, hva, &hugepage_shift);
        if (!ptep)
                return __pte(0);
-       if (shift)
-               *pte_sizep = 1ul << shift;
+       if (hugepage_shift)
+               *pte_sizep = 1ul << hugepage_shift;
        else
                *pte_sizep = PAGE_SIZE;
        if (ps > *pte_sizep)
                return __pte(0);
-       if (!pte_present(*ptep))
-               return __pte(0);
-       return kvmppc_read_update_linux_pte(ptep, writing);
+       return kvmppc_read_update_linux_pte(ptep, writing, hugepage_shift);
 }
 
 static inline void unlock_hpte(unsigned long *hpte, unsigned long hpte_v)
index e15c521846ca924291bca5b9dc923ec5275d3e69..99c7fc16dc0d3ffe7ad59a746b085bd888961f00 100644 (file)
@@ -580,7 +580,7 @@ int __kprobes emulate_step(struct pt_regs *regs, unsigned int instr)
                if (instr & 1)
                        regs->link = regs->nip;
                if (branch_taken(instr, regs))
-                       regs->nip = imm;
+                       regs->nip = truncate_if_32bit(regs->msr, imm);
                return 1;
 #ifdef CONFIG_PPC64
        case 17:        /* sc */
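
A note on the branch-emulation fix above: the taken-branch target now goes through truncate_if_32bit(), so the new NIP wraps at 4GB when the MSR indicates 32-bit mode, matching what the hardware would do. A minimal sketch of what that truncation amounts to, assuming MSR_64BIT flags 64-bit mode (the helper name below is made up for illustration, it is not the emulator's code verbatim):

/* Illustrative sketch only: mask an effective address to 32 bits when the
 * MSR says we are in 32-bit mode, so emulated branch targets wrap correctly. */
static unsigned long truncate_if_32bit_sketch(unsigned long msr, unsigned long val)
{
        if (!(msr & MSR_64BIT))
                val &= 0xffffffffUL;
        return val;
}
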
index 7d1dba0d57f9e84daa25333a542d5bf6e88d69cc..8d035d2d42a62579bab50b19f605613d81ba56d5 100644 (file)
@@ -4,7 +4,8 @@ obj-$(CONFIG_MATH_EMULATION)    += fabs.o fadd.o fadds.o fcmpo.o fcmpu.o \
                                        fmadd.o fmadds.o fmsub.o fmsubs.o \
                                        fmul.o fmuls.o fnabs.o fneg.o \
                                        fnmadd.o fnmadds.o fnmsub.o fnmsubs.o \
-                                       fres.o frsp.o frsqrte.o fsel.o lfs.o \
+                                       fres.o fre.o frsp.o fsel.o lfs.o \
+                                       frsqrte.o frsqrtes.o \
                                        fsqrt.o fsqrts.o fsub.o fsubs.o \
                                        mcrfs.o mffs.o mtfsb0.o mtfsb1.o \
                                        mtfsf.o mtfsfi.o stfiwx.o stfs.o \
diff --git a/arch/powerpc/math-emu/fre.c b/arch/powerpc/math-emu/fre.c
new file mode 100644 (file)
index 0000000..49ccf2c
--- /dev/null
@@ -0,0 +1,11 @@
+#include <linux/types.h>
+#include <linux/errno.h>
+#include <asm/uaccess.h>
+
+int fre(void *frD, void *frB)
+{
+#ifdef DEBUG
+       printk("%s: %p %p\n", __func__, frD, frB);
+#endif
+       return -ENOSYS;
+}
diff --git a/arch/powerpc/math-emu/frsqrtes.c b/arch/powerpc/math-emu/frsqrtes.c
new file mode 100644 (file)
index 0000000..7e838e3
--- /dev/null
@@ -0,0 +1,11 @@
+#include <linux/types.h>
+#include <linux/errno.h>
+#include <asm/uaccess.h>
+
+int frsqrtes(void *frD, void *frB)
+{
+#ifdef DEBUG
+       printk("%s: %p %p\n", __func__, frD, frB);
+#endif
+       return 0;
+}
index 164d55935bd82ce7903d0e6717ad2cc029d422bf..0328e66e0799e6d7a5999430833df30a1c7296ab 100644 (file)
@@ -58,8 +58,10 @@ FLOATFUNC(fnabs);
 FLOATFUNC(fneg);
 
 /* Optional */
+FLOATFUNC(fre);
 FLOATFUNC(fres);
 FLOATFUNC(frsqrte);
+FLOATFUNC(frsqrtes);
 FLOATFUNC(fsel);
 FLOATFUNC(fsqrt);
 FLOATFUNC(fsqrts);
@@ -97,6 +99,7 @@ FLOATFUNC(fsqrts);
 #define FSQRTS         0x016           /*   22 */
 #define FRES           0x018           /*   24 */
 #define FMULS          0x019           /*   25 */
+#define FRSQRTES       0x01a           /*   26 */
 #define FMSUBS         0x01c           /*   28 */
 #define FMADDS         0x01d           /*   29 */
 #define FNMSUBS                0x01e           /*   30 */
@@ -109,6 +112,7 @@ FLOATFUNC(fsqrts);
 #define FADD           0x015           /*   21 */
 #define FSQRT          0x016           /*   22 */
 #define FSEL           0x017           /*   23 */
+#define FRE            0x018           /*   24 */
 #define FMUL           0x019           /*   25 */
 #define FRSQRTE                0x01a           /*   26 */
 #define FMSUB          0x01c           /*   28 */
@@ -299,9 +303,10 @@ do_mathemu(struct pt_regs *regs)
                case FDIVS:     func = fdivs;   type = AB;      break;
                case FSUBS:     func = fsubs;   type = AB;      break;
                case FADDS:     func = fadds;   type = AB;      break;
-               case FSQRTS:    func = fsqrts;  type = AB;      break;
-               case FRES:      func = fres;    type = AB;      break;
+               case FSQRTS:    func = fsqrts;  type = XB;      break;
+               case FRES:      func = fres;    type = XB;      break;
                case FMULS:     func = fmuls;   type = AC;      break;
+               case FRSQRTES:  func = frsqrtes;type = XB;      break;
                case FMSUBS:    func = fmsubs;  type = ABC;     break;
                case FMADDS:    func = fmadds;  type = ABC;     break;
                case FNMSUBS:   func = fnmsubs; type = ABC;     break;
@@ -317,10 +322,11 @@ do_mathemu(struct pt_regs *regs)
                        case FDIV:      func = fdiv;    type = AB;      break;
                        case FSUB:      func = fsub;    type = AB;      break;
                        case FADD:      func = fadd;    type = AB;      break;
-                       case FSQRT:     func = fsqrt;   type = AB;      break;
+                       case FSQRT:     func = fsqrt;   type = XB;      break;
+                       case FRE:       func = fre;     type = XB;      break;
                        case FSEL:      func = fsel;    type = ABC;     break;
                        case FMUL:      func = fmul;    type = AC;      break;
-                       case FRSQRTE:   func = frsqrte; type = AB;      break;
+                       case FRSQRTE:   func = frsqrte; type = XB;      break;
                        case FMSUB:     func = fmsub;   type = ABC;     break;
                        case FMADD:     func = fmadd;   type = ABC;     break;
                        case FNMSUB:    func = fnmsub;  type = ABC;     break;
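
For readers following the emulator table above: the type code tells do_mathemu() which FPR fields of the instruction to fetch as inputs, and the estimate/square-root instructions consume only frB, hence the reclassification from AB to XB. A purely illustrative sketch of such a dispatch follows; the helper and enum names are hypothetical, only the frA/frB/frC bit positions are taken from the PowerPC FP instruction encoding:

/* Illustrative sketch only: fetch source operands according to the
 * operand "type" used by the emulator's dispatch table. */
enum fp_optype { OP_AB, OP_AC, OP_ABC, OP_XB };

static void fetch_fp_operands(enum fp_optype type, unsigned int insn,
                              double *fpr, double **a, double **b, double **c)
{
        unsigned int ra = (insn >> 16) & 0x1f;  /* frA field */
        unsigned int rb = (insn >> 11) & 0x1f;  /* frB field */
        unsigned int rc = (insn >> 6)  & 0x1f;  /* frC field */

        switch (type) {
        case OP_AB:  *a = &fpr[ra]; *b = &fpr[rb]; break;
        case OP_AC:  *a = &fpr[ra]; *c = &fpr[rc]; break;
        case OP_ABC: *a = &fpr[ra]; *b = &fpr[rb]; *c = &fpr[rc]; break;
        case OP_XB:  *b = &fpr[rb]; break;  /* fsqrt/fre/frsqrte read only frB */
        }
}
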
index 2c9441ee6bb853c4ca6d650bb399c94b5ee5db15..82b1ff759e26d586145c58a6b0cb7c549a09057b 100644 (file)
@@ -41,7 +41,7 @@ int icache_44x_need_flush;
 
 unsigned long tlb_47x_boltmap[1024/8];
 
-static void __cpuinit ppc44x_update_tlb_hwater(void)
+static void ppc44x_update_tlb_hwater(void)
 {
        extern unsigned int tlb_44x_patch_hwater_D[];
        extern unsigned int tlb_44x_patch_hwater_I[];
@@ -134,7 +134,7 @@ static void __init ppc47x_update_boltmap(void)
 /*
  * "Pins" a 256MB TLB entry in AS0 for kernel lowmem for 47x type MMU
  */
-static void __cpuinit ppc47x_pin_tlb(unsigned int virt, unsigned int phys)
+static void ppc47x_pin_tlb(unsigned int virt, unsigned int phys)
 {
        unsigned int rA;
        int bolted;
@@ -229,7 +229,7 @@ void setup_initial_memory_limit(phys_addr_t first_memblock_base,
 }
 
 #ifdef CONFIG_SMP
-void __cpuinit mmu_init_secondary(int cpu)
+void mmu_init_secondary(int cpu)
 {
        unsigned long addr;
        unsigned long memstart = memstart_addr & ~(PPC_PIN_SIZE - 1);
index cf16b5733eaa368d1d905d1d2302cc432dcc9fec..51230ee6a4075170d9efcc52038a1c3919d4b495 100644 (file)
@@ -6,17 +6,16 @@ subdir-ccflags-$(CONFIG_PPC_WERROR) := -Werror
 
 ccflags-$(CONFIG_PPC64)        := $(NO_MINIMAL_TOC)
 
-obj-y                          := fault.o mem.o pgtable.o gup.o \
+obj-y                          := fault.o mem.o pgtable.o gup.o mmap.o \
                                   init_$(CONFIG_WORD_SIZE).o \
                                   pgtable_$(CONFIG_WORD_SIZE).o
 obj-$(CONFIG_PPC_MMU_NOHASH)   += mmu_context_nohash.o tlb_nohash.o \
                                   tlb_nohash_low.o
 obj-$(CONFIG_PPC_BOOK3E)       += tlb_low_$(CONFIG_WORD_SIZE)e.o
-obj-$(CONFIG_PPC64)            += mmap_64.o
 hash64-$(CONFIG_PPC_NATIVE)    := hash_native_64.o
 obj-$(CONFIG_PPC_STD_MMU_64)   += hash_utils_64.o \
                                   slb_low.o slb.o stab.o \
-                                  mmap_64.o $(hash64-y)
+                                  $(hash64-y)
 obj-$(CONFIG_PPC_STD_MMU_32)   += ppc_mmu_32.o
 obj-$(CONFIG_PPC_STD_MMU)      += hash_low_$(CONFIG_WORD_SIZE).o \
                                   tlb_hash$(CONFIG_WORD_SIZE).o \
@@ -28,11 +27,12 @@ obj-$(CONFIG_44x)           += 44x_mmu.o
 obj-$(CONFIG_PPC_FSL_BOOK3E)   += fsl_booke_mmu.o
 obj-$(CONFIG_NEED_MULTIPLE_NODES) += numa.o
 obj-$(CONFIG_PPC_MM_SLICES)    += slice.o
-ifeq ($(CONFIG_HUGETLB_PAGE),y)
 obj-y                          += hugetlbpage.o
+ifeq ($(CONFIG_HUGETLB_PAGE),y)
 obj-$(CONFIG_PPC_STD_MMU_64)   += hugetlbpage-hash64.o
 obj-$(CONFIG_PPC_BOOK3E_MMU)   += hugetlbpage-book3e.o
 endif
+obj-$(CONFIG_TRANSPARENT_HUGEPAGE) += hugepage-hash64.o
 obj-$(CONFIG_PPC_SUBPAGE_PROT) += subpage-prot.o
 obj-$(CONFIG_NOT_COHERENT_CACHE) += dma-noncoherent.o
 obj-$(CONFIG_HIGHMEM)          += highmem.o
index 4b921affa495d2e315e0598a06c34c0d2231be38..49822d90ea965ff6703db24fca3180b479c39101 100644 (file)
@@ -34,7 +34,7 @@ static noinline int gup_pte_range(pmd_t pmd, unsigned long addr,
 
        ptep = pte_offset_kernel(&pmd, addr);
        do {
-               pte_t pte = *ptep;
+               pte_t pte = ACCESS_ONCE(*ptep);
                struct page *page;
 
                if ((pte_val(pte) & mask) != result)
@@ -63,12 +63,18 @@ static int gup_pmd_range(pud_t pud, unsigned long addr, unsigned long end,
 
        pmdp = pmd_offset(&pud, addr);
        do {
-               pmd_t pmd = *pmdp;
+               pmd_t pmd = ACCESS_ONCE(*pmdp);
 
                next = pmd_addr_end(addr, end);
-               if (pmd_none(pmd))
+               /*
+                * If we find a splitting transparent hugepage we
+                * return zero. That will result in taking the slow
+                * path which will call wait_split_huge_page()
+                * if the pmd is still in splitting state
+                */
+               if (pmd_none(pmd) || pmd_trans_splitting(pmd))
                        return 0;
-               if (pmd_huge(pmd)) {
+               if (pmd_huge(pmd) || pmd_large(pmd)) {
                        if (!gup_hugepte((pte_t *)pmdp, PMD_SIZE, addr, next,
                                         write, pages, nr))
                                return 0;
@@ -91,7 +97,7 @@ static int gup_pud_range(pgd_t pgd, unsigned long addr, unsigned long end,
 
        pudp = pud_offset(&pgd, addr);
        do {
-               pud_t pud = *pudp;
+               pud_t pud = ACCESS_ONCE(*pudp);
 
                next = pud_addr_end(addr, end);
                if (pud_none(pud))
@@ -154,7 +160,7 @@ int get_user_pages_fast(unsigned long start, int nr_pages, int write,
 
        pgdp = pgd_offset(mm, addr);
        do {
-               pgd_t pgd = *pgdp;
+               pgd_t pgd = ACCESS_ONCE(*pgdp);
 
                pr_devel("  %016lx: normal pgd %p\n", addr,
                         (void *)pgd_val(pgd));
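
The ACCESS_ONCE() conversions above follow the usual lockless-walk idiom: read each table entry exactly once into a local, decide on that snapshot, and (at the leaf level) re-check the live entry after taking a speculative reference so a racing unmap or THP split/collapse is detected. A hedged sketch of that leaf-level pattern, not the kernel's exact code (gup_one_pte() is a hypothetical helper):

/* Illustrative sketch only: the snapshot-and-recheck idiom behind the
 * ACCESS_ONCE() conversions above. */
static int gup_one_pte(pte_t *ptep, int write, struct page **pages, int *nr)
{
        pte_t pte = ACCESS_ONCE(*ptep);            /* single volatile load */
        unsigned long mask = _PAGE_PRESENT | _PAGE_USER;
        struct page *page;

        if (write)
                mask |= _PAGE_RW;
        if ((pte_val(pte) & mask) != mask)
                return 0;                          /* punt to the slow path */

        page = pte_page(pte);
        if (!page_cache_get_speculative(page))
                return 0;
        if (unlikely(pte_val(pte) != pte_val(*ptep))) {
                put_page(page);                    /* entry changed under us */
                return 0;
        }
        pages[(*nr)++] = page;
        return 1;
}
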
index 0e980acae67c410d7bf0c44c87753058b963487e..d3cbda62857b92ffb0867c4fe0ab1deeb0979313 100644 (file)
@@ -289,9 +289,10 @@ htab_modify_pte:
 
        /* Call ppc_md.hpte_updatepp */
        mr      r5,r29                  /* vpn */
-       li      r6,MMU_PAGE_4K          /* page size */
-       ld      r7,STK_PARAM(R9)(r1)    /* segment size */
-       ld      r8,STK_PARAM(R8)(r1)    /* get "local" param */
+       li      r6,MMU_PAGE_4K          /* base page size */
+       li      r7,MMU_PAGE_4K          /* actual page size */
+       ld      r8,STK_PARAM(R9)(r1)    /* segment size */
+       ld      r9,STK_PARAM(R8)(r1)    /* get "local" param */
 _GLOBAL(htab_call_hpte_updatepp)
        bl      .                       /* Patched by htab_finish_init() */
 
@@ -649,9 +650,10 @@ htab_modify_pte:
 
        /* Call ppc_md.hpte_updatepp */
        mr      r5,r29                  /* vpn */
-       li      r6,MMU_PAGE_4K          /* page size */
-       ld      r7,STK_PARAM(R9)(r1)    /* segment size */
-       ld      r8,STK_PARAM(R8)(r1)    /* get "local" param */
+       li      r6,MMU_PAGE_4K          /* base page size */
+       li      r7,MMU_PAGE_4K          /* actual page size */
+       ld      r8,STK_PARAM(R9)(r1)    /* segment size */
+       ld      r9,STK_PARAM(R8)(r1)    /* get "local" param */
 _GLOBAL(htab_call_hpte_updatepp)
        bl      .                       /* patched by htab_finish_init() */
 
@@ -937,9 +939,10 @@ ht64_modify_pte:
 
        /* Call ppc_md.hpte_updatepp */
        mr      r5,r29                  /* vpn */
-       li      r6,MMU_PAGE_64K
-       ld      r7,STK_PARAM(R9)(r1)    /* segment size */
-       ld      r8,STK_PARAM(R8)(r1)    /* get "local" param */
+       li      r6,MMU_PAGE_64K         /* base page size */
+       li      r7,MMU_PAGE_64K         /* actual page size */
+       ld      r8,STK_PARAM(R9)(r1)    /* segment size */
+       ld      r9,STK_PARAM(R8)(r1)    /* get "local" param */
 _GLOBAL(ht64_call_hpte_updatepp)
        bl      .                       /* patched by htab_finish_init() */
 
index 4c122c3f1623c7525e682338f555f73678263f72..3f0c30ae4791db7597c3fac74740e09e0fb3b7c1 100644 (file)
@@ -273,61 +273,15 @@ static long native_hpte_remove(unsigned long hpte_group)
        return i;
 }
 
-static inline int __hpte_actual_psize(unsigned int lp, int psize)
-{
-       int i, shift;
-       unsigned int mask;
-
-       /* start from 1 ignoring MMU_PAGE_4K */
-       for (i = 1; i < MMU_PAGE_COUNT; i++) {
-
-               /* invalid penc */
-               if (mmu_psize_defs[psize].penc[i] == -1)
-                       continue;
-               /*
-                * encoding bits per actual page size
-                *        PTE LP     actual page size
-                *    rrrr rrrz         >=8KB
-                *    rrrr rrzz         >=16KB
-                *    rrrr rzzz         >=32KB
-                *    rrrr zzzz         >=64KB
-                * .......
-                */
-               shift = mmu_psize_defs[i].shift - LP_SHIFT;
-               if (shift > LP_BITS)
-                       shift = LP_BITS;
-               mask = (1 << shift) - 1;
-               if ((lp & mask) == mmu_psize_defs[psize].penc[i])
-                       return i;
-       }
-       return -1;
-}
-
-static inline int hpte_actual_psize(struct hash_pte *hptep, int psize)
-{
-       /* Look at the 8 bit LP value */
-       unsigned int lp = (hptep->r >> LP_SHIFT) & ((1 << LP_BITS) - 1);
-
-       if (!(hptep->v & HPTE_V_VALID))
-               return -1;
-
-       /* First check if it is large page */
-       if (!(hptep->v & HPTE_V_LARGE))
-               return MMU_PAGE_4K;
-
-       return __hpte_actual_psize(lp, psize);
-}
-
 static long native_hpte_updatepp(unsigned long slot, unsigned long newpp,
-                                unsigned long vpn, int psize, int ssize,
-                                int local)
+                                unsigned long vpn, int bpsize,
+                                int apsize, int ssize, int local)
 {
        struct hash_pte *hptep = htab_address + slot;
        unsigned long hpte_v, want_v;
        int ret = 0;
-       int actual_psize;
 
-       want_v = hpte_encode_avpn(vpn, psize, ssize);
+       want_v = hpte_encode_avpn(vpn, bpsize, ssize);
 
        DBG_LOW("    update(vpn=%016lx, avpnv=%016lx, group=%lx, newpp=%lx)",
                vpn, want_v & HPTE_V_AVPN, slot, newpp);
@@ -335,7 +289,6 @@ static long native_hpte_updatepp(unsigned long slot, unsigned long newpp,
        native_lock_hpte(hptep);
 
        hpte_v = hptep->v;
-       actual_psize = hpte_actual_psize(hptep, psize);
        /*
         * We need to invalidate the TLB always because hpte_remove doesn't do
         * a tlb invalidate. If a hash bucket gets full, we "evict" a more/less
@@ -343,12 +296,7 @@ static long native_hpte_updatepp(unsigned long slot, unsigned long newpp,
         * (hpte_remove) because we assume the old translation is still
         * technically "valid".
         */
-       if (actual_psize < 0) {
-               actual_psize = psize;
-               ret = -1;
-               goto err_out;
-       }
-       if (!HPTE_V_COMPARE(hpte_v, want_v)) {
+       if (!HPTE_V_COMPARE(hpte_v, want_v) || !(hpte_v & HPTE_V_VALID)) {
                DBG_LOW(" -> miss\n");
                ret = -1;
        } else {
@@ -357,11 +305,10 @@ static long native_hpte_updatepp(unsigned long slot, unsigned long newpp,
                hptep->r = (hptep->r & ~(HPTE_R_PP | HPTE_R_N)) |
                        (newpp & (HPTE_R_PP | HPTE_R_N | HPTE_R_C));
        }
-err_out:
        native_unlock_hpte(hptep);
 
        /* Ensure it is out of the tlb too. */
-       tlbie(vpn, psize, actual_psize, ssize, local);
+       tlbie(vpn, bpsize, apsize, ssize, local);
 
        return ret;
 }
@@ -402,7 +349,6 @@ static long native_hpte_find(unsigned long vpn, int psize, int ssize)
 static void native_hpte_updateboltedpp(unsigned long newpp, unsigned long ea,
                                       int psize, int ssize)
 {
-       int actual_psize;
        unsigned long vpn;
        unsigned long vsid;
        long slot;
@@ -415,36 +361,33 @@ static void native_hpte_updateboltedpp(unsigned long newpp, unsigned long ea,
        if (slot == -1)
                panic("could not find page to bolt\n");
        hptep = htab_address + slot;
-       actual_psize = hpte_actual_psize(hptep, psize);
-       if (actual_psize < 0)
-               actual_psize = psize;
 
        /* Update the HPTE */
        hptep->r = (hptep->r & ~(HPTE_R_PP | HPTE_R_N)) |
                (newpp & (HPTE_R_PP | HPTE_R_N));
-
-       /* Ensure it is out of the tlb too. */
-       tlbie(vpn, psize, actual_psize, ssize, 0);
+       /*
+        * Ensure it is out of the tlb too. Bolted entries base and
+        * actual page size will be same.
+        */
+       tlbie(vpn, psize, psize, ssize, 0);
 }
 
 static void native_hpte_invalidate(unsigned long slot, unsigned long vpn,
-                                  int psize, int ssize, int local)
+                                  int bpsize, int apsize, int ssize, int local)
 {
        struct hash_pte *hptep = htab_address + slot;
        unsigned long hpte_v;
        unsigned long want_v;
        unsigned long flags;
-       int actual_psize;
 
        local_irq_save(flags);
 
        DBG_LOW("    invalidate(vpn=%016lx, hash: %lx)\n", vpn, slot);
 
-       want_v = hpte_encode_avpn(vpn, psize, ssize);
+       want_v = hpte_encode_avpn(vpn, bpsize, ssize);
        native_lock_hpte(hptep);
        hpte_v = hptep->v;
 
-       actual_psize = hpte_actual_psize(hptep, psize);
        /*
         * We need to invalidate the TLB always because hpte_remove doesn't do
         * a tlb invalidate. If a hash bucket gets full, we "evict" a more/less
@@ -452,23 +395,120 @@ static void native_hpte_invalidate(unsigned long slot, unsigned long vpn,
         * (hpte_remove) because we assume the old translation is still
         * technically "valid".
         */
-       if (actual_psize < 0) {
-               actual_psize = psize;
-               native_unlock_hpte(hptep);
-               goto err_out;
-       }
-       if (!HPTE_V_COMPARE(hpte_v, want_v))
+       if (!HPTE_V_COMPARE(hpte_v, want_v) || !(hpte_v & HPTE_V_VALID))
                native_unlock_hpte(hptep);
        else
                /* Invalidate the hpte. NOTE: this also unlocks it */
                hptep->v = 0;
 
-err_out:
        /* Invalidate the TLB */
-       tlbie(vpn, psize, actual_psize, ssize, local);
+       tlbie(vpn, bpsize, apsize, ssize, local);
+
+       local_irq_restore(flags);
+}
+
+static void native_hugepage_invalidate(struct mm_struct *mm,
+                                      unsigned char *hpte_slot_array,
+                                      unsigned long addr, int psize)
+{
+       int ssize = 0, i;
+       int lock_tlbie;
+       struct hash_pte *hptep;
+       int actual_psize = MMU_PAGE_16M;
+       unsigned int max_hpte_count, valid;
+       unsigned long flags, s_addr = addr;
+       unsigned long hpte_v, want_v, shift;
+       unsigned long hidx, vpn = 0, vsid, hash, slot;
+
+       shift = mmu_psize_defs[psize].shift;
+       max_hpte_count = 1U << (PMD_SHIFT - shift);
+
+       local_irq_save(flags);
+       for (i = 0; i < max_hpte_count; i++) {
+               valid = hpte_valid(hpte_slot_array, i);
+               if (!valid)
+                       continue;
+               hidx =  hpte_hash_index(hpte_slot_array, i);
+
+               /* get the vpn */
+               addr = s_addr + (i * (1ul << shift));
+               if (!is_kernel_addr(addr)) {
+                       ssize = user_segment_size(addr);
+                       vsid = get_vsid(mm->context.id, addr, ssize);
+                       WARN_ON(vsid == 0);
+               } else {
+                       vsid = get_kernel_vsid(addr, mmu_kernel_ssize);
+                       ssize = mmu_kernel_ssize;
+               }
+
+               vpn = hpt_vpn(addr, vsid, ssize);
+               hash = hpt_hash(vpn, shift, ssize);
+               if (hidx & _PTEIDX_SECONDARY)
+                       hash = ~hash;
+
+               slot = (hash & htab_hash_mask) * HPTES_PER_GROUP;
+               slot += hidx & _PTEIDX_GROUP_IX;
+
+               hptep = htab_address + slot;
+               want_v = hpte_encode_avpn(vpn, psize, ssize);
+               native_lock_hpte(hptep);
+               hpte_v = hptep->v;
+
+               /* Even if we miss, we need to invalidate the TLB */
+               if (!HPTE_V_COMPARE(hpte_v, want_v) || !(hpte_v & HPTE_V_VALID))
+                       native_unlock_hpte(hptep);
+               else
+                       /* Invalidate the hpte. NOTE: this also unlocks it */
+                       hptep->v = 0;
+       }
+       /*
+        * Since this is a hugepage, we just need a single tlbie.
+        * Use the last vpn.
+        */
+       lock_tlbie = !mmu_has_feature(MMU_FTR_LOCKLESS_TLBIE);
+       if (lock_tlbie)
+               raw_spin_lock(&native_tlbie_lock);
+
+       asm volatile("ptesync":::"memory");
+       __tlbie(vpn, psize, actual_psize, ssize);
+       asm volatile("eieio; tlbsync; ptesync":::"memory");
+
+       if (lock_tlbie)
+               raw_spin_unlock(&native_tlbie_lock);
+
        local_irq_restore(flags);
 }
 
+static inline int __hpte_actual_psize(unsigned int lp, int psize)
+{
+       int i, shift;
+       unsigned int mask;
+
+       /* start from 1 ignoring MMU_PAGE_4K */
+       for (i = 1; i < MMU_PAGE_COUNT; i++) {
+
+               /* invalid penc */
+               if (mmu_psize_defs[psize].penc[i] == -1)
+                       continue;
+               /*
+                * encoding bits per actual page size
+                *        PTE LP     actual page size
+                *    rrrr rrrz         >=8KB
+                *    rrrr rrzz         >=16KB
+                *    rrrr rzzz         >=32KB
+                *    rrrr zzzz         >=64KB
+                * .......
+                */
+               shift = mmu_psize_defs[i].shift - LP_SHIFT;
+               if (shift > LP_BITS)
+                       shift = LP_BITS;
+               mask = (1 << shift) - 1;
+               if ((lp & mask) == mmu_psize_defs[psize].penc[i])
+                       return i;
+       }
+       return -1;
+}
+
 static void hpte_decode(struct hash_pte *hpte, unsigned long slot,
                        int *psize, int *apsize, int *ssize, unsigned long *vpn)
 {
@@ -672,4 +712,5 @@ void __init hpte_init_native(void)
        ppc_md.hpte_remove      = native_hpte_remove;
        ppc_md.hpte_clear_all   = native_hpte_clear;
        ppc_md.flush_hash_range = native_flush_hash_range;
+       ppc_md.hugepage_invalidate   = native_hugepage_invalidate;
 }
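
The net effect of the hash_native_64.c rework above is that callers now pass both the base and the actual page size to hpte_updatepp()/hpte_invalidate() instead of having the actual size re-derived from the HPTE's LP bits on every call. A hedged usage sketch with values taken from the surrounding hunks (ordinary 4K mappings pass the same size twice; the THP path pairs the segment's base size with a 16M actual size):

/* Illustrative only: the reworked hooks take (base, actual) page sizes. */

/* Ordinary 4K mapping: base and actual sizes are identical. */
ppc_md.hpte_invalidate(slot, vpn, MMU_PAGE_4K, MMU_PAGE_4K,
                       MMU_SEGSIZE_256M, local);

/* Transparent hugepage: hashed with the segment's base page size,
 * but the HPTE actually maps 16M. */
ppc_md.hpte_updatepp(slot, newpp, vpn, psize /* base */,
                     MMU_PAGE_16M /* actual */, ssize, local);
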
index e303a6d74e3a72ca2f1db230073f7ec9896a9895..6ecc38bd5b2429eb08b89e3eb09cc6264c602e5c 100644 (file)
@@ -807,7 +807,7 @@ void __init early_init_mmu(void)
 }
 
 #ifdef CONFIG_SMP
-void __cpuinit early_init_mmu_secondary(void)
+void early_init_mmu_secondary(void)
 {
        /* Initialize hash table for that CPU */
        if (!firmware_has_feature(FW_FEATURE_LPAR))
@@ -1050,13 +1050,26 @@ int hash_page(unsigned long ea, unsigned long access, unsigned long trap)
                goto bail;
        }
 
-#ifdef CONFIG_HUGETLB_PAGE
        if (hugeshift) {
-               rc = __hash_page_huge(ea, access, vsid, ptep, trap, local,
-                                       ssize, hugeshift, psize);
+               if (pmd_trans_huge(*(pmd_t *)ptep))
+                       rc = __hash_page_thp(ea, access, vsid, (pmd_t *)ptep,
+                                            trap, local, ssize, psize);
+#ifdef CONFIG_HUGETLB_PAGE
+               else
+                       rc = __hash_page_huge(ea, access, vsid, ptep, trap,
+                                             local, ssize, hugeshift, psize);
+#else
+               else {
+                       /*
+                        * If we have hugeshift and it is not a transparent
+                        * hugepage while hugetlb is disabled, something is really wrong.
+                        */
+                       rc = 1;
+                       WARN_ON(1);
+               }
+#endif
                goto bail;
        }
-#endif /* CONFIG_HUGETLB_PAGE */
 
 #ifndef CONFIG_PPC_64K_PAGES
        DBG_LOW(" i-pte: %016lx\n", pte_val(*ptep));
@@ -1145,6 +1158,7 @@ EXPORT_SYMBOL_GPL(hash_page);
 void hash_preload(struct mm_struct *mm, unsigned long ea,
                  unsigned long access, unsigned long trap)
 {
+       int hugepage_shift;
        unsigned long vsid;
        pgd_t *pgdir;
        pte_t *ptep;
@@ -1166,10 +1180,27 @@ void hash_preload(struct mm_struct *mm, unsigned long ea,
        pgdir = mm->pgd;
        if (pgdir == NULL)
                return;
-       ptep = find_linux_pte(pgdir, ea);
-       if (!ptep)
+
+       /* Get VSID */
+       ssize = user_segment_size(ea);
+       vsid = get_vsid(mm->context.id, ea, ssize);
+       if (!vsid)
                return;
+       /*
+        * Hash doesn't like irqs. Walking the linux page table with irqs
+        * disabled saves us from holding multiple locks.
+        */
+       local_irq_save(flags);
+
+       /*
+        * THP pages use update_mmu_cache_pmd. We don't do
+        * hash preload there, hence we can ignore THP here.
+        */
+       ptep = find_linux_pte_or_hugepte(pgdir, ea, &hugepage_shift);
+       if (!ptep)
+               goto out_exit;
 
+       WARN_ON(hugepage_shift);
 #ifdef CONFIG_PPC_64K_PAGES
        /* If either _PAGE_4K_PFN or _PAGE_NO_CACHE is set (and we are on
         * a 64K kernel), then we don't preload, hash_page() will take
@@ -1178,18 +1209,9 @@ void hash_preload(struct mm_struct *mm, unsigned long ea,
         * page size demotion here
         */
        if (pte_val(*ptep) & (_PAGE_4K_PFN | _PAGE_NO_CACHE))
-               return;
+               goto out_exit;
 #endif /* CONFIG_PPC_64K_PAGES */
 
-       /* Get VSID */
-       ssize = user_segment_size(ea);
-       vsid = get_vsid(mm->context.id, ea, ssize);
-       if (!vsid)
-               return;
-
-       /* Hash doesn't like irqs */
-       local_irq_save(flags);
-
        /* Is that local to this CPU ? */
        if (cpumask_equal(mm_cpumask(mm), cpumask_of(smp_processor_id())))
                local = 1;
@@ -1211,7 +1233,7 @@ void hash_preload(struct mm_struct *mm, unsigned long ea,
                                   mm->context.user_psize,
                                   mm->context.user_psize,
                                   pte_val(*ptep));
-
+out_exit:
        local_irq_restore(flags);
 }
 
@@ -1232,7 +1254,11 @@ void flush_hash_page(unsigned long vpn, real_pte_t pte, int psize, int ssize,
                slot = (hash & htab_hash_mask) * HPTES_PER_GROUP;
                slot += hidx & _PTEIDX_GROUP_IX;
                DBG_LOW(" sub %ld: hash=%lx, hidx=%lx\n", index, slot, hidx);
-               ppc_md.hpte_invalidate(slot, vpn, psize, ssize, local);
+               /*
+                * We use the same base page size and actual psize because we
+                * don't use these functions for hugepages.
+                */
+               ppc_md.hpte_invalidate(slot, vpn, psize, psize, ssize, local);
        } pte_iterate_hashed_end();
 
 #ifdef CONFIG_PPC_TRANSACTIONAL_MEM
@@ -1365,7 +1391,8 @@ static void kernel_unmap_linear_page(unsigned long vaddr, unsigned long lmi)
                hash = ~hash;
        slot = (hash & htab_hash_mask) * HPTES_PER_GROUP;
        slot += hidx & _PTEIDX_GROUP_IX;
-       ppc_md.hpte_invalidate(slot, vpn, mmu_linear_psize, mmu_kernel_ssize, 0);
+       ppc_md.hpte_invalidate(slot, vpn, mmu_linear_psize, mmu_linear_psize,
+                              mmu_kernel_ssize, 0);
 }
 
 void kernel_map_pages(struct page *page, int numpages, int enable)
diff --git a/arch/powerpc/mm/hugepage-hash64.c b/arch/powerpc/mm/hugepage-hash64.c
new file mode 100644 (file)
index 0000000..34de9e0
--- /dev/null
@@ -0,0 +1,175 @@
+/*
+ * Copyright IBM Corporation, 2013
+ * Author Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of version 2.1 of the GNU Lesser General Public License
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ */
+
+/*
+ * PPC64 THP Support for hash based MMUs
+ */
+#include <linux/mm.h>
+#include <asm/machdep.h>
+
+int __hash_page_thp(unsigned long ea, unsigned long access, unsigned long vsid,
+                   pmd_t *pmdp, unsigned long trap, int local, int ssize,
+                   unsigned int psize)
+{
+       unsigned int index, valid;
+       unsigned char *hpte_slot_array;
+       unsigned long rflags, pa, hidx;
+       unsigned long old_pmd, new_pmd;
+       int ret, lpsize = MMU_PAGE_16M;
+       unsigned long vpn, hash, shift, slot;
+
+       /*
+        * atomically mark the linux large page PMD busy and dirty
+        */
+       do {
+               old_pmd = pmd_val(*pmdp);
+               /* If PMD busy, retry the access */
+               if (unlikely(old_pmd & _PAGE_BUSY))
+                       return 0;
+               /* If PMD is trans splitting retry the access */
+               if (unlikely(old_pmd & _PAGE_SPLITTING))
+                       return 0;
+               /* If PMD permissions don't match, take page fault */
+               if (unlikely(access & ~old_pmd))
+                       return 1;
+               /*
+                * Try to lock the PTE, add ACCESSED and DIRTY if it was
+                * a write access
+                */
+               new_pmd = old_pmd | _PAGE_BUSY | _PAGE_ACCESSED;
+               if (access & _PAGE_RW)
+                       new_pmd |= _PAGE_DIRTY;
+       } while (old_pmd != __cmpxchg_u64((unsigned long *)pmdp,
+                                         old_pmd, new_pmd));
+       /*
+        * PP bits. _PAGE_USER is already PP bit 0x2, so we only
+        * need to add in 0x1 if it's a read-only user page
+        */
+       rflags = new_pmd & _PAGE_USER;
+       if ((new_pmd & _PAGE_USER) && !((new_pmd & _PAGE_RW) &&
+                                          (new_pmd & _PAGE_DIRTY)))
+               rflags |= 0x1;
+       /*
+        * _PAGE_EXEC -> HW_NO_EXEC since it's inverted
+        */
+       rflags |= ((new_pmd & _PAGE_EXEC) ? 0 : HPTE_R_N);
+
+#if 0
+       if (!cpu_has_feature(CPU_FTR_COHERENT_ICACHE)) {
+
+               /*
+                * No CPU has hugepages but lacks no execute, so we
+                * don't need to worry about that case
+                */
+               rflags = hash_page_do_lazy_icache(rflags, __pte(old_pte), trap);
+       }
+#endif
+       /*
+        * Find the slot index details for this ea, using base page size.
+        */
+       shift = mmu_psize_defs[psize].shift;
+       index = (ea & ~HPAGE_PMD_MASK) >> shift;
+       BUG_ON(index >= 4096);
+
+       vpn = hpt_vpn(ea, vsid, ssize);
+       hash = hpt_hash(vpn, shift, ssize);
+       hpte_slot_array = get_hpte_slot_array(pmdp);
+
+       valid = hpte_valid(hpte_slot_array, index);
+       if (valid) {
+               /* update the hpte bits */
+               hidx =  hpte_hash_index(hpte_slot_array, index);
+               if (hidx & _PTEIDX_SECONDARY)
+                       hash = ~hash;
+               slot = (hash & htab_hash_mask) * HPTES_PER_GROUP;
+               slot += hidx & _PTEIDX_GROUP_IX;
+
+               ret = ppc_md.hpte_updatepp(slot, rflags, vpn,
+                                          psize, lpsize, ssize, local);
+               /*
+                * We failed to update, try to insert a new entry.
+                */
+               if (ret == -1) {
+                       /*
+                        * large pte is marked busy, so we can be sure
+                        * nobody is looking at hpte_slot_array. Hence we can
+                        * safely update this here.
+                        */
+                       valid = 0;
+                       new_pmd &= ~_PAGE_HPTEFLAGS;
+                       hpte_slot_array[index] = 0;
+               } else
+                       /* clear the busy bits and set the hash pte bits */
+                       new_pmd = (new_pmd & ~_PAGE_HPTEFLAGS) | _PAGE_HASHPTE;
+       }
+
+       if (!valid) {
+               unsigned long hpte_group;
+
+               /* insert new entry */
+               pa = pmd_pfn(__pmd(old_pmd)) << PAGE_SHIFT;
+repeat:
+               hpte_group = ((hash & htab_hash_mask) * HPTES_PER_GROUP) & ~0x7UL;
+
+               /* clear the busy bits and set the hash pte bits */
+               new_pmd = (new_pmd & ~_PAGE_HPTEFLAGS) | _PAGE_HASHPTE;
+
+               /* Add in WIMG bits */
+               rflags |= (new_pmd & (_PAGE_WRITETHRU | _PAGE_NO_CACHE |
+                                     _PAGE_COHERENT | _PAGE_GUARDED));
+
+               /* Insert into the hash table, primary slot */
+               slot = ppc_md.hpte_insert(hpte_group, vpn, pa, rflags, 0,
+                                         psize, lpsize, ssize);
+               /*
+                * Primary is full, try the secondary
+                */
+               if (unlikely(slot == -1)) {
+                       hpte_group = ((~hash & htab_hash_mask) *
+                                     HPTES_PER_GROUP) & ~0x7UL;
+                       slot = ppc_md.hpte_insert(hpte_group, vpn, pa,
+                                                 rflags, HPTE_V_SECONDARY,
+                                                 psize, lpsize, ssize);
+                       if (slot == -1) {
+                               if (mftb() & 0x1)
+                                       hpte_group = ((hash & htab_hash_mask) *
+                                                     HPTES_PER_GROUP) & ~0x7UL;
+
+                               ppc_md.hpte_remove(hpte_group);
+                               goto repeat;
+                       }
+               }
+               /*
+                * Hypervisor failure. Restore old pmd and return -1
+                * similar to __hash_page_*
+                */
+               if (unlikely(slot == -2)) {
+                       *pmdp = __pmd(old_pmd);
+                       hash_failure_debug(ea, access, vsid, trap, ssize,
+                                          psize, lpsize, old_pmd);
+                       return -1;
+               }
+               /*
+                * large pte is marked busy, so we can be sure
+                * nobody is looking at hpte_slot_array. Hence we can
+                * safely update this here.
+                */
+               mark_hpte_slot_valid(hpte_slot_array, index, slot);
+       }
+       /*
+        * No need to use ldarx/stdcx here
+        */
+       *pmdp = __pmd(new_pmd & ~_PAGE_BUSY);
+       return 0;
+}
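
__hash_page_thp() above tracks one HPTE slot per base-page index of the 16M region in the hpte_slot_array hanging off the deposited pgtable (see pgtable_trans_huge_deposit() further down in this series). The accessors it calls suggest a simple one-byte-per-index packing; the sketch below is an assumption for illustration and not necessarily the exact encoding used by the headers:

/* Hedged sketch: a plausible per-subpage slot encoding consistent with the
 * hpte_valid()/hpte_hash_index()/mark_hpte_slot_valid() calls above.
 * One byte per base-page index: bit 0 = valid, remaining bits = hash slot. */
static inline unsigned char sketch_hpte_valid(unsigned char *slots, int index)
{
        return slots[index] & 0x1;
}

static inline unsigned char sketch_hpte_hash_index(unsigned char *slots, int index)
{
        return slots[index] >> 1;
}

static inline void sketch_mark_hpte_slot_valid(unsigned char *slots, int index,
                                               unsigned int hidx)
{
        slots[index] = (hidx << 1) | 0x1;
}
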
index 0f1d94a1fb82328db2605606f8fd0e3e4354fb15..0b7fb6761015526a38be8dac7ee51d0ca4d9f161 100644 (file)
@@ -81,7 +81,7 @@ int __hash_page_huge(unsigned long ea, unsigned long access, unsigned long vsid,
                slot += (old_pte & _PAGE_F_GIX) >> 12;
 
                if (ppc_md.hpte_updatepp(slot, rflags, vpn, mmu_psize,
-                                        ssize, local) == -1)
+                                        mmu_psize, ssize, local) == -1)
                        old_pte &= ~_PAGE_HPTEFLAGS;
        }
 
index 77fdd2cef33b5a2c18ac40c442645d9dc38365ad..5555778b94fcbe3e7e935310b9c5048079199f3b 100644 (file)
@@ -21,6 +21,9 @@
 #include <asm/pgalloc.h>
 #include <asm/tlb.h>
 #include <asm/setup.h>
+#include <asm/hugetlb.h>
+
+#ifdef CONFIG_HUGETLB_PAGE
 
 #define PAGE_SHIFT_64K 16
 #define PAGE_SHIFT_16M 24
@@ -100,68 +103,9 @@ int pgd_huge(pgd_t pgd)
 }
 #endif
 
-/*
- * We have 4 cases for pgds and pmds:
- * (1) invalid (all zeroes)
- * (2) pointer to next table, as normal; bottom 6 bits == 0
- * (3) leaf pte for huge page, bottom two bits != 00
- * (4) hugepd pointer, bottom two bits == 00, next 4 bits indicate size of table
- */
-pte_t *find_linux_pte_or_hugepte(pgd_t *pgdir, unsigned long ea, unsigned *shift)
-{
-       pgd_t *pg;
-       pud_t *pu;
-       pmd_t *pm;
-       pte_t *ret_pte;
-       hugepd_t *hpdp = NULL;
-       unsigned pdshift = PGDIR_SHIFT;
-
-       if (shift)
-               *shift = 0;
-
-       pg = pgdir + pgd_index(ea);
-
-       if (pgd_huge(*pg)) {
-               ret_pte = (pte_t *) pg;
-               goto out;
-       } else if (is_hugepd(pg))
-               hpdp = (hugepd_t *)pg;
-       else if (!pgd_none(*pg)) {
-               pdshift = PUD_SHIFT;
-               pu = pud_offset(pg, ea);
-
-               if (pud_huge(*pu)) {
-                       ret_pte = (pte_t *) pu;
-                       goto out;
-               } else if (is_hugepd(pu))
-                       hpdp = (hugepd_t *)pu;
-               else if (!pud_none(*pu)) {
-                       pdshift = PMD_SHIFT;
-                       pm = pmd_offset(pu, ea);
-
-                       if (pmd_huge(*pm)) {
-                               ret_pte = (pte_t *) pm;
-                               goto out;
-                       } else if (is_hugepd(pm))
-                               hpdp = (hugepd_t *)pm;
-                       else if (!pmd_none(*pm))
-                               return pte_offset_kernel(pm, ea);
-               }
-       }
-       if (!hpdp)
-               return NULL;
-
-       ret_pte = hugepte_offset(hpdp, ea, pdshift);
-       pdshift = hugepd_shift(*hpdp);
-out:
-       if (shift)
-               *shift = pdshift;
-       return ret_pte;
-}
-EXPORT_SYMBOL_GPL(find_linux_pte_or_hugepte);
-
 pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr)
 {
+       /* Only called for hugetlbfs pages, hence can ignore THP */
        return find_linux_pte_or_hugepte(mm->pgd, addr, NULL);
 }
 
@@ -736,11 +680,14 @@ follow_huge_addr(struct mm_struct *mm, unsigned long address, int write)
        struct page *page;
        unsigned shift;
        unsigned long mask;
-
+       /*
+        * Transparent hugepages are handled by generic code. We can skip them
+        * here.
+        */
        ptep = find_linux_pte_or_hugepte(mm->pgd, address, &shift);
 
        /* Verify it is a huge page else bail. */
-       if (!ptep || !shift)
+       if (!ptep || !shift || pmd_trans_huge(*(pmd_t *)ptep))
                return ERR_PTR(-EINVAL);
 
        mask = (1UL << shift) - 1;
@@ -759,69 +706,6 @@ follow_huge_pmd(struct mm_struct *mm, unsigned long address,
        return NULL;
 }
 
-int gup_hugepte(pte_t *ptep, unsigned long sz, unsigned long addr,
-               unsigned long end, int write, struct page **pages, int *nr)
-{
-       unsigned long mask;
-       unsigned long pte_end;
-       struct page *head, *page, *tail;
-       pte_t pte;
-       int refs;
-
-       pte_end = (addr + sz) & ~(sz-1);
-       if (pte_end < end)
-               end = pte_end;
-
-       pte = *ptep;
-       mask = _PAGE_PRESENT | _PAGE_USER;
-       if (write)
-               mask |= _PAGE_RW;
-
-       if ((pte_val(pte) & mask) != mask)
-               return 0;
-
-       /* hugepages are never "special" */
-       VM_BUG_ON(!pfn_valid(pte_pfn(pte)));
-
-       refs = 0;
-       head = pte_page(pte);
-
-       page = head + ((addr & (sz-1)) >> PAGE_SHIFT);
-       tail = page;
-       do {
-               VM_BUG_ON(compound_head(page) != head);
-               pages[*nr] = page;
-               (*nr)++;
-               page++;
-               refs++;
-       } while (addr += PAGE_SIZE, addr != end);
-
-       if (!page_cache_add_speculative(head, refs)) {
-               *nr -= refs;
-               return 0;
-       }
-
-       if (unlikely(pte_val(pte) != pte_val(*ptep))) {
-               /* Could be optimized better */
-               *nr -= refs;
-               while (refs--)
-                       put_page(head);
-               return 0;
-       }
-
-       /*
-        * Any tail page need their mapcount reference taken before we
-        * return.
-        */
-       while (refs--) {
-               if (PageTail(tail))
-                       get_huge_page_tail(tail);
-               tail++;
-       }
-
-       return 1;
-}
-
 static unsigned long hugepte_addr_end(unsigned long addr, unsigned long end,
                                      unsigned long sz)
 {
@@ -1038,3 +922,168 @@ void flush_dcache_icache_hugepage(struct page *page)
                }
        }
 }
+
+#endif /* CONFIG_HUGETLB_PAGE */
+
+/*
+ * We have 4 cases for pgds and pmds:
+ * (1) invalid (all zeroes)
+ * (2) pointer to next table, as normal; bottom 6 bits == 0
+ * (3) leaf pte for huge page, bottom two bits != 00
+ * (4) hugepd pointer, bottom two bits == 00, next 4 bits indicate size of table
+ *
+ * So long as we atomically load page table pointers we are safe against teardown,
+ * and we can follow the address down to the page and take a ref on it.
+ */
+
+pte_t *find_linux_pte_or_hugepte(pgd_t *pgdir, unsigned long ea, unsigned *shift)
+{
+       pgd_t pgd, *pgdp;
+       pud_t pud, *pudp;
+       pmd_t pmd, *pmdp;
+       pte_t *ret_pte;
+       hugepd_t *hpdp = NULL;
+       unsigned pdshift = PGDIR_SHIFT;
+
+       if (shift)
+               *shift = 0;
+
+       pgdp = pgdir + pgd_index(ea);
+       pgd  = ACCESS_ONCE(*pgdp);
+       /*
+        * Always operate on the local stack value. This makes sure the
+        * value doesn't get updated by a parallel THP split/collapse,
+        * page fault or page unmap. The returned pte_t * is still not
+        * stable, so it should be checked there for the above conditions.
+        */
+       if (pgd_none(pgd))
+               return NULL;
+       else if (pgd_huge(pgd)) {
+               ret_pte = (pte_t *) pgdp;
+               goto out;
+       } else if (is_hugepd(&pgd))
+               hpdp = (hugepd_t *)&pgd;
+       else {
+               /*
+                * Even if we end up with an unmap, the pgtable will not
+                * be freed, because we do an rcu free and here we have
+                * irqs disabled.
+                */
+               pdshift = PUD_SHIFT;
+               pudp = pud_offset(&pgd, ea);
+               pud  = ACCESS_ONCE(*pudp);
+
+               if (pud_none(pud))
+                       return NULL;
+               else if (pud_huge(pud)) {
+                       ret_pte = (pte_t *) pudp;
+                       goto out;
+               } else if (is_hugepd(&pud))
+                       hpdp = (hugepd_t *)&pud;
+               else {
+                       pdshift = PMD_SHIFT;
+                       pmdp = pmd_offset(&pud, ea);
+                       pmd  = ACCESS_ONCE(*pmdp);
+                       /*
+                        * A hugepage collapse is captured by pmd_none, because
+                        * it marks the pmd none and does a hpte invalidate.
+                        *
+                        * A hugepage split is captured by pmd_trans_splitting,
+                        * because we mark the pmd trans splitting and do a
+                        * hpte invalidate.
+                        *
+                        */
+                       if (pmd_none(pmd) || pmd_trans_splitting(pmd))
+                               return NULL;
+
+                       if (pmd_huge(pmd) || pmd_large(pmd)) {
+                               ret_pte = (pte_t *) pmdp;
+                               goto out;
+                       } else if (is_hugepd(&pmd))
+                               hpdp = (hugepd_t *)&pmd;
+                       else
+                               return pte_offset_kernel(&pmd, ea);
+               }
+       }
+       if (!hpdp)
+               return NULL;
+
+       ret_pte = hugepte_offset(hpdp, ea, pdshift);
+       pdshift = hugepd_shift(*hpdp);
+out:
+       if (shift)
+               *shift = pdshift;
+       return ret_pte;
+}
+EXPORT_SYMBOL_GPL(find_linux_pte_or_hugepte);
+
+int gup_hugepte(pte_t *ptep, unsigned long sz, unsigned long addr,
+               unsigned long end, int write, struct page **pages, int *nr)
+{
+       unsigned long mask;
+       unsigned long pte_end;
+       struct page *head, *page, *tail;
+       pte_t pte;
+       int refs;
+
+       pte_end = (addr + sz) & ~(sz-1);
+       if (pte_end < end)
+               end = pte_end;
+
+       pte = ACCESS_ONCE(*ptep);
+       mask = _PAGE_PRESENT | _PAGE_USER;
+       if (write)
+               mask |= _PAGE_RW;
+
+       if ((pte_val(pte) & mask) != mask)
+               return 0;
+
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+       /*
+        * check for splitting here
+        */
+       if (pmd_trans_splitting(pte_pmd(pte)))
+               return 0;
+#endif
+
+       /* hugepages are never "special" */
+       VM_BUG_ON(!pfn_valid(pte_pfn(pte)));
+
+       refs = 0;
+       head = pte_page(pte);
+
+       page = head + ((addr & (sz-1)) >> PAGE_SHIFT);
+       tail = page;
+       do {
+               VM_BUG_ON(compound_head(page) != head);
+               pages[*nr] = page;
+               (*nr)++;
+               page++;
+               refs++;
+       } while (addr += PAGE_SIZE, addr != end);
+
+       if (!page_cache_add_speculative(head, refs)) {
+               *nr -= refs;
+               return 0;
+       }
+
+       if (unlikely(pte_val(pte) != pte_val(*ptep))) {
+               /* Could be optimized better */
+               *nr -= refs;
+               while (refs--)
+                       put_page(head);
+               return 0;
+       }
+
+       /*
+        * Any tail page need their mapcount reference taken before we
+        * return.
+        */
+       while (refs--) {
+               if (PageTail(tail))
+                       get_huge_page_tail(tail);
+               tail++;
+       }
+
+       return 1;
+}
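
Because find_linux_pte_or_hugepte() now relies on RCU page-table freeing rather than locks, every caller in this series either disables interrupts around the walk (hash_preload()) or holds rcu_read_lock_sched() (the KVM page-fault path) while the returned pointer is used. A hedged sketch of the expected calling pattern, with mm and ea assumed to be in scope:

/* Illustrative only: walk the page table with IRQs off so the tables
 * cannot be RCU-freed underneath us, then check the hugepage shift. */
unsigned int shift;
unsigned long flags;
pte_t *ptep;

local_irq_save(flags);
ptep = find_linux_pte_or_hugepte(mm->pgd, ea, &shift);
if (ptep && pte_present(*ptep)) {
        /* use *ptep here; 'shift' is non-zero for huge mappings */
}
local_irq_restore(flags);
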
index a90b9c4589908078f139d953707b6ce13b6860d0..d0cd9e4c6837d2d17620b0f646151d9a54408f71 100644 (file)
@@ -88,7 +88,11 @@ static void pgd_ctor(void *addr)
 
 static void pmd_ctor(void *addr)
 {
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+       memset(addr, 0, PMD_TABLE_SIZE * 2);
+#else
        memset(addr, 0, PMD_TABLE_SIZE);
+#endif
 }
 
 struct kmem_cache *pgtable_cache[MAX_PGTABLE_INDEX_SIZE];
@@ -137,10 +141,9 @@ void pgtable_cache_add(unsigned shift, void (*ctor)(void *))
 void pgtable_cache_init(void)
 {
        pgtable_cache_add(PGD_INDEX_SIZE, pgd_ctor);
-       pgtable_cache_add(PMD_INDEX_SIZE, pmd_ctor);
-       if (!PGT_CACHE(PGD_INDEX_SIZE) || !PGT_CACHE(PMD_INDEX_SIZE))
+       pgtable_cache_add(PMD_CACHE_INDEX, pmd_ctor);
+       if (!PGT_CACHE(PGD_INDEX_SIZE) || !PGT_CACHE(PMD_CACHE_INDEX))
                panic("Couldn't allocate pgtable caches");
-
        /* In all current configs, when the PUD index exists it's the
         * same size as either the pgd or pmd index.  Verify that the
         * initialization above has also created a PUD cache.  This
index 0988a26e04131e5408f8f1f3a16627b21699ad4a..ccd49f9503a95104dff61ccbec25f32082e0c82e 100644 (file)
@@ -508,6 +508,10 @@ void update_mmu_cache(struct vm_area_struct *vma, unsigned long address,
                      pte_t *ptep)
 {
 #ifdef CONFIG_PPC_STD_MMU
+       /*
+        * We don't need to worry about _PAGE_PRESENT here because we are
+        * called with either mm->page_table_lock held or ptl lock held
+        */
        unsigned long access = 0, trap;
 
        /* We only want HPTEs for linux PTEs that have _PAGE_ACCESSED set */
index e779642c25e5e3192a39a4b167d505a8cfb63254..af3d78e193026a6df95f6b4946fdf5a74236a864 100644 (file)
@@ -112,8 +112,10 @@ static unsigned int steal_context_smp(unsigned int id)
                 */
                for_each_cpu(cpu, mm_cpumask(mm)) {
                        for (i = cpu_first_thread_sibling(cpu);
-                            i <= cpu_last_thread_sibling(cpu); i++)
-                               __set_bit(id, stale_map[i]);
+                            i <= cpu_last_thread_sibling(cpu); i++) {
+                               if (stale_map[i])
+                                       __set_bit(id, stale_map[i]);
+                       }
                        cpu = i - 1;
                }
                return id;
@@ -272,7 +274,8 @@ void switch_mmu_context(struct mm_struct *prev, struct mm_struct *next)
                /* XXX This clear should ultimately be part of local_flush_tlb_mm */
                for (i = cpu_first_thread_sibling(cpu);
                     i <= cpu_last_thread_sibling(cpu); i++) {
-                       __clear_bit(id, stale_map[i]);
+                       if (stale_map[i])
+                               __clear_bit(id, stale_map[i]);
                }
        }
 
@@ -329,8 +332,8 @@ void destroy_context(struct mm_struct *mm)
 
 #ifdef CONFIG_SMP
 
-static int __cpuinit mmu_context_cpu_notify(struct notifier_block *self,
-                                           unsigned long action, void *hcpu)
+static int mmu_context_cpu_notify(struct notifier_block *self,
+                                 unsigned long action, void *hcpu)
 {
        unsigned int cpu = (unsigned int)(long)hcpu;
 
@@ -363,7 +366,7 @@ static int __cpuinit mmu_context_cpu_notify(struct notifier_block *self,
        return NOTIFY_OK;
 }
 
-static struct notifier_block __cpuinitdata mmu_context_cpu_nb = {
+static struct notifier_block mmu_context_cpu_nb = {
        .notifier_call  = mmu_context_cpu_notify,
 };
 
index 88c0425dc0a88f41f61c38b4d01888ec468a937c..08397217e8ace2d6ce3abc1a09a4fcc14977413f 100644 (file)
@@ -516,7 +516,7 @@ static int of_drconf_to_nid_single(struct of_drconf_cell *drmem,
  * Figure out to which domain a cpu belongs and stick it there.
  * Return the id of the domain used.
  */
-static int __cpuinit numa_setup_cpu(unsigned long lcpu)
+static int numa_setup_cpu(unsigned long lcpu)
 {
        int nid = 0;
        struct device_node *cpu = of_get_cpu_node(lcpu, NULL);
@@ -538,8 +538,7 @@ out:
        return nid;
 }
 
-static int __cpuinit cpu_numa_callback(struct notifier_block *nfb,
-                            unsigned long action,
+static int cpu_numa_callback(struct notifier_block *nfb, unsigned long action,
                             void *hcpu)
 {
        unsigned long lcpu = (unsigned long)hcpu;
@@ -919,7 +918,7 @@ static void __init *careful_zallocation(int nid, unsigned long size,
        return ret;
 }
 
-static struct notifier_block __cpuinitdata ppc64_numa_nb = {
+static struct notifier_block ppc64_numa_nb = {
        .notifier_call = cpu_numa_callback,
        .priority = 1 /* Must run before sched domains notifier. */
 };
@@ -1433,11 +1432,9 @@ static int update_cpu_topology(void *data)
                if (cpu != update->cpu)
                        continue;
 
-               unregister_cpu_under_node(update->cpu, update->old_nid);
                unmap_cpu_from_node(update->cpu);
                map_cpu_to_node(update->cpu, update->new_nid);
                vdso_getcpu_init();
-               register_cpu_under_node(update->cpu, update->new_nid);
        }
 
        return 0;
@@ -1485,6 +1482,9 @@ int arch_update_cpu_topology(void)
        stop_machine(update_cpu_topology, &updates[0], &updated_cpus);
 
        for (ud = &updates[0]; ud; ud = ud->next) {
+               unregister_cpu_under_node(ud->cpu, ud->old_nid);
+               register_cpu_under_node(ud->cpu, ud->new_nid);
+
                dev = get_cpu_device(ud->cpu);
                if (dev)
                        kobject_uevent(&dev->kobj, KOBJ_CHANGE);
index 214130a4edc6bf2ff82b35bde8d955d98c75b01d..edda589795c3e30c09a900261aa11da6f6fea0fe 100644 (file)
@@ -235,6 +235,14 @@ void assert_pte_locked(struct mm_struct *mm, unsigned long addr)
        pud = pud_offset(pgd, addr);
        BUG_ON(pud_none(*pud));
        pmd = pmd_offset(pud, addr);
+       /*
+        * To collapse normal pages to a hugepage, khugepaged first sets
+        * the pmd to none to force page fault/gup to take mmap_sem. After
+        * the pmd is set to none we do a pte_clear, which does this
+        * assertion, so if we find the pmd none, just return.
+        */
+       if (pmd_none(*pmd))
+               return;
        BUG_ON(!pmd_present(*pmd));
        assert_spin_locked(pte_lockptr(mm, pmd));
 }
index a854096e102329608e4b1f33bd5f5886cfefe46b..536eec72c0f701584b717b86189d039442c54acb 100644 (file)
@@ -338,6 +338,19 @@ EXPORT_SYMBOL(iounmap);
 EXPORT_SYMBOL(__iounmap);
 EXPORT_SYMBOL(__iounmap_at);
 
+/*
+ * For hugepage we have pfn in the pmd, we use PTE_RPN_SHIFT bits for flags
+ * For PTE page, we have a PTE_FRAG_SIZE (4K) aligned virtual address.
+ */
+struct page *pmd_page(pmd_t pmd)
+{
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+       if (pmd_trans_huge(pmd))
+               return pfn_to_page(pmd_pfn(pmd));
+#endif
+       return virt_to_page(pmd_page_vaddr(pmd));
+}
+
 #ifdef CONFIG_PPC_64K_PAGES
 static pte_t *get_from_cache(struct mm_struct *mm)
 {
@@ -455,3 +468,404 @@ void pgtable_free_tlb(struct mmu_gather *tlb, void *table, int shift)
 }
 #endif
 #endif /* CONFIG_PPC_64K_PAGES */
+
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+
+/*
+ * This is called when relaxing access to a hugepage. It's also called in the page
+ * fault path when we don't hit any of the major fault cases, ie, a minor
+ * update of _PAGE_ACCESSED, _PAGE_DIRTY, etc. The generic code will have
+ * handled those two for us; we additionally deal with missing execute
+ * permission here on some processors.
+ */
+int pmdp_set_access_flags(struct vm_area_struct *vma, unsigned long address,
+                         pmd_t *pmdp, pmd_t entry, int dirty)
+{
+       int changed;
+#ifdef CONFIG_DEBUG_VM
+       WARN_ON(!pmd_trans_huge(*pmdp));
+       assert_spin_locked(&vma->vm_mm->page_table_lock);
+#endif
+       changed = !pmd_same(*(pmdp), entry);
+       if (changed) {
+               __ptep_set_access_flags(pmdp_ptep(pmdp), pmd_pte(entry));
+               /*
+                * Since we are not supporting SW TLB systems, we don't
+                * have anything similar to flush_tlb_page_nohash()
+                */
+       }
+       return changed;
+}
+
+unsigned long pmd_hugepage_update(struct mm_struct *mm, unsigned long addr,
+                                 pmd_t *pmdp, unsigned long clr)
+{
+
+       unsigned long old, tmp;
+
+#ifdef CONFIG_DEBUG_VM
+       WARN_ON(!pmd_trans_huge(*pmdp));
+       assert_spin_locked(&mm->page_table_lock);
+#endif
+
+#ifdef PTE_ATOMIC_UPDATES
+       __asm__ __volatile__(
+       "1:     ldarx   %0,0,%3\n\
+               andi.   %1,%0,%6\n\
+               bne-    1b \n\
+               andc    %1,%0,%4 \n\
+               stdcx.  %1,0,%3 \n\
+               bne-    1b"
+       : "=&r" (old), "=&r" (tmp), "=m" (*pmdp)
+       : "r" (pmdp), "r" (clr), "m" (*pmdp), "i" (_PAGE_BUSY)
+       : "cc" );
+#else
+       old = pmd_val(*pmdp);
+       *pmdp = __pmd(old & ~clr);
+#endif
+       if (old & _PAGE_HASHPTE)
+               hpte_do_hugepage_flush(mm, addr, pmdp);
+       return old;
+}
+
+pmd_t pmdp_clear_flush(struct vm_area_struct *vma, unsigned long address,
+                      pmd_t *pmdp)
+{
+       pmd_t pmd;
+
+       VM_BUG_ON(address & ~HPAGE_PMD_MASK);
+       if (pmd_trans_huge(*pmdp)) {
+               pmd = pmdp_get_and_clear(vma->vm_mm, address, pmdp);
+       } else {
+               /*
+                * khugepaged calls this for normal pmd
+                */
+               pmd = *pmdp;
+               pmd_clear(pmdp);
+               /*
+                * Wait for all pending hash_page calls to finish. This is
+                * needed in case of subpage collapse. When we collapse normal
+                * pages into a hugepage, we first clear the pmd, then
+                * invalidate all the PTE entries. The assumption here is that
+                * any low-level page fault will see a none pmd and take the
+                * slow path that waits on mmap_sem. But another CPU could very
+                * well already be in hash_page with a local ptep pointer
+                * value, and such a hash_page can still add new HPTE entries
+                * for normal subpages. That means we could be modifying the
+                * page content while we copy it to the huge page. So wait for
+                * parallel hash_page calls to finish before invalidating the
+                * HPTE entries. We can do this by sending an IPI to all the
+                * cpus and executing a dummy function there.
+                */
+               kick_all_cpus_sync();
+               /*
+                * Now invalidate the hpte entries in the range
+                * covered by the pmd. This makes sure we take a
+                * fault, find the pmd as none and hence end up in a
+                * major fault which takes mmap_sem and therefore
+                * waits for the collapse to complete. Without this,
+                * __collapse_huge_page_copy could end up copying
+                * the old content.
+                */
+               flush_tlb_pmd_range(vma->vm_mm, &pmd, address);
+       }
+       return pmd;
+}
+
+int pmdp_test_and_clear_young(struct vm_area_struct *vma,
+                             unsigned long address, pmd_t *pmdp)
+{
+       return __pmdp_test_and_clear_young(vma->vm_mm, address, pmdp);
+}
+
+/*
+ * We currently remove entries from the hashtable regardless of whether
+ * the entry was young or dirty. The generic routines only flush if the
+ * entry was young or dirty, which is not good enough.
+ *
+ * We should be more intelligent about this, but for the moment we override
+ * these functions and force a TLB flush unconditionally.
+ */
+int pmdp_clear_flush_young(struct vm_area_struct *vma,
+                                 unsigned long address, pmd_t *pmdp)
+{
+       return __pmdp_test_and_clear_young(vma->vm_mm, address, pmdp);
+}
+
+/*
+ * We mark the pmd splitting and invalidate all the hpte
+ * entries for this hugepage.
+ */
+void pmdp_splitting_flush(struct vm_area_struct *vma,
+                         unsigned long address, pmd_t *pmdp)
+{
+       unsigned long old, tmp;
+
+       VM_BUG_ON(address & ~HPAGE_PMD_MASK);
+
+#ifdef CONFIG_DEBUG_VM
+       WARN_ON(!pmd_trans_huge(*pmdp));
+       assert_spin_locked(&vma->vm_mm->page_table_lock);
+#endif
+
+#ifdef PTE_ATOMIC_UPDATES
+
+       __asm__ __volatile__(
+       "1:     ldarx   %0,0,%3\n\
+               andi.   %1,%0,%6\n\
+               bne-    1b \n\
+               ori     %1,%0,%4 \n\
+               stdcx.  %1,0,%3 \n\
+               bne-    1b"
+       : "=&r" (old), "=&r" (tmp), "=m" (*pmdp)
+       : "r" (pmdp), "i" (_PAGE_SPLITTING), "m" (*pmdp), "i" (_PAGE_BUSY)
+       : "cc" );
+#else
+       old = pmd_val(*pmdp);
+       *pmdp = __pmd(old | _PAGE_SPLITTING);
+#endif
+       /*
+        * If we didn't already have the splitting flag set, go and flush
+        * the HPTE entries.
+        */
+       if (!(old & _PAGE_SPLITTING)) {
+               /* We need to flush the hpte */
+               if (old & _PAGE_HASHPTE)
+                       hpte_do_hugepage_flush(vma->vm_mm, address, pmdp);
+       }
+}
+
+/*
+ * We want to put the pgtable in pmd and use pgtable for tracking
+ * the base page size hptes
+ */
+void pgtable_trans_huge_deposit(struct mm_struct *mm, pmd_t *pmdp,
+                               pgtable_t pgtable)
+{
+       pgtable_t *pgtable_slot;
+       assert_spin_locked(&mm->page_table_lock);
+       /*
+        * We store the pgtable in the second half of the PMD page.
+        */
+       pgtable_slot = (pgtable_t *)pmdp + PTRS_PER_PMD;
+       *pgtable_slot = pgtable;
+       /*
+        * Expose the deposited pgtable to other cpus before we set
+        * the hugepage PTE at the pmd level: the hash fault code
+        * looks at the deposited pgtable to store hash index values.
+        */
+       smp_wmb();
+}
+
+pgtable_t pgtable_trans_huge_withdraw(struct mm_struct *mm, pmd_t *pmdp)
+{
+       pgtable_t pgtable;
+       pgtable_t *pgtable_slot;
+
+       assert_spin_locked(&mm->page_table_lock);
+       pgtable_slot = (pgtable_t *)pmdp + PTRS_PER_PMD;
+       pgtable = *pgtable_slot;
+       /*
+        * Once we withdraw, mark the entry NULL.
+        */
+       *pgtable_slot = NULL;
+       /*
+        * We store HPTE information in the deposited PTE fragment;
+        * zero out its contents on withdraw.
+        */
+       memset(pgtable, 0, PTE_FRAG_SIZE);
+       return pgtable;
+}
+
+/*
+ * Set a new huge pmd. We should not be called to update
+ * an existing pmd entry; that should go via pmd_hugepage_update.
+ */
+void set_pmd_at(struct mm_struct *mm, unsigned long addr,
+               pmd_t *pmdp, pmd_t pmd)
+{
+#ifdef CONFIG_DEBUG_VM
+       WARN_ON(!pmd_none(*pmdp));
+       assert_spin_locked(&mm->page_table_lock);
+       WARN_ON(!pmd_trans_huge(pmd));
+#endif
+       return set_pte_at(mm, addr, pmdp_ptep(pmdp), pmd_pte(pmd));
+}
+
+void pmdp_invalidate(struct vm_area_struct *vma, unsigned long address,
+                    pmd_t *pmdp)
+{
+       pmd_hugepage_update(vma->vm_mm, address, pmdp, _PAGE_PRESENT);
+}
+
+/*
+ * A linux hugepage PMD was changed and the corresponding hash table entries
+ * need to be flushed.
+ */
+void hpte_do_hugepage_flush(struct mm_struct *mm, unsigned long addr,
+                           pmd_t *pmdp)
+{
+       int ssize, i;
+       unsigned long s_addr;
+       int max_hpte_count;
+       unsigned int psize, valid;
+       unsigned char *hpte_slot_array;
+       unsigned long hidx, vpn, vsid, hash, shift, slot;
+
+       /*
+        * Flush all the hptes mapping this hugepage
+        */
+       s_addr = addr & HPAGE_PMD_MASK;
+       hpte_slot_array = get_hpte_slot_array(pmdp);
+       /*
+        * If we try to do a HUGE PTE update after a withdraw is done,
+        * we will find the slot array below NULL. This happens when
+        * we do a split_huge_page_pmd.
+        */
+       if (!hpte_slot_array)
+               return;
+
+       /* get the base page size */
+       psize = get_slice_psize(mm, s_addr);
+
+       if (ppc_md.hugepage_invalidate)
+               return ppc_md.hugepage_invalidate(mm, hpte_slot_array,
+                                                 s_addr, psize);
+       /*
+        * No bulk hpte removal support; invalidate each entry.
+        */
+       shift = mmu_psize_defs[psize].shift;
+       max_hpte_count = HPAGE_PMD_SIZE >> shift;
+       for (i = 0; i < max_hpte_count; i++) {
+               /*
+                * 8 bits per hpte entry:
+                * 000| [ secondary group (one bit) | hidx (3 bits) | valid bit]
+                */
+               valid = hpte_valid(hpte_slot_array, i);
+               if (!valid)
+                       continue;
+               hidx =  hpte_hash_index(hpte_slot_array, i);
+
+               /* get the vpn */
+               addr = s_addr + (i * (1ul << shift));
+               if (!is_kernel_addr(addr)) {
+                       ssize = user_segment_size(addr);
+                       vsid = get_vsid(mm->context.id, addr, ssize);
+                       WARN_ON(vsid == 0);
+               } else {
+                       vsid = get_kernel_vsid(addr, mmu_kernel_ssize);
+                       ssize = mmu_kernel_ssize;
+               }
+
+               vpn = hpt_vpn(addr, vsid, ssize);
+               hash = hpt_hash(vpn, shift, ssize);
+               if (hidx & _PTEIDX_SECONDARY)
+                       hash = ~hash;
+
+               slot = (hash & htab_hash_mask) * HPTES_PER_GROUP;
+               slot += hidx & _PTEIDX_GROUP_IX;
+               ppc_md.hpte_invalidate(slot, vpn, psize,
+                                      MMU_PAGE_16M, ssize, 0);
+       }
+}
+
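For illustration, the deposited hpte_slot_array above holds one byte of
bookkeeping per base (4K/64K) subpage of the 16MB hugepage, laid out as noted
in the loop comment. A minimal decoding sketch, assuming bit positions taken
from that comment rather than from the real hpte_valid()/hpte_hash_index()
helpers:

	/* Illustrative sketch only: decode one hpte_slot_array entry using the
	 * "[ secondary group (1 bit) | hidx (3 bits) | valid bit ]" layout
	 * noted above. Exact bit positions are an assumption of this sketch.
	 */
	static inline int demo_hpte_valid(const unsigned char *slot_array, int index)
	{
		return slot_array[index] & 0x1;		/* valid bit */
	}

	static inline unsigned int demo_hpte_hash_index(const unsigned char *slot_array,
							int index)
	{
		/* hidx plus the secondary-group bit, as consumed by the loop above */
		return (slot_array[index] >> 1) & 0xf;
	}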
+static pmd_t pmd_set_protbits(pmd_t pmd, pgprot_t pgprot)
+{
+       pmd_val(pmd) |= pgprot_val(pgprot);
+       return pmd;
+}
+
+pmd_t pfn_pmd(unsigned long pfn, pgprot_t pgprot)
+{
+       pmd_t pmd;
+       /*
+        * For a valid pte, we would always have _PAGE_PRESENT or _PAGE_FILE
+        * set. We use this to check for a THP page at the pmd level: for the
+        * leaf pte of a huge page, the bottom two bits are != 00.
+        */
+       pmd_val(pmd) = pfn << PTE_RPN_SHIFT;
+       pmd_val(pmd) |= _PAGE_THP_HUGE;
+       pmd = pmd_set_protbits(pmd, pgprot);
+       return pmd;
+}
+
+pmd_t mk_pmd(struct page *page, pgprot_t pgprot)
+{
+       return pfn_pmd(page_to_pfn(page), pgprot);
+}
+
+pmd_t pmd_modify(pmd_t pmd, pgprot_t newprot)
+{
+
+       pmd_val(pmd) &= _HPAGE_CHG_MASK;
+       pmd = pmd_set_protbits(pmd, newprot);
+       return pmd;
+}
+
+/*
+ * This is called at the end of handling a user page fault, when the
+ * fault has been handled by updating a HUGE PMD entry in the linux page tables.
+ * We use it to preload an HPTE into the hash table corresponding to
+ * the updated linux HUGE PMD entry.
+ */
+void update_mmu_cache_pmd(struct vm_area_struct *vma, unsigned long addr,
+                         pmd_t *pmd)
+{
+       return;
+}
+
+pmd_t pmdp_get_and_clear(struct mm_struct *mm,
+                        unsigned long addr, pmd_t *pmdp)
+{
+       pmd_t old_pmd;
+       pgtable_t pgtable;
+       unsigned long old;
+       pgtable_t *pgtable_slot;
+
+       old = pmd_hugepage_update(mm, addr, pmdp, ~0UL);
+       old_pmd = __pmd(old);
+       /*
+        * We have pmd == none and we are holding page_table_lock.
+        * So we can safely go and clear the pgtable hash
+        * index info.
+        */
+       pgtable_slot = (pgtable_t *)pmdp + PTRS_PER_PMD;
+       pgtable = *pgtable_slot;
+       /*
+        * Let's zero out the old valid and hash index details that
+        * the hash fault code looks at.
+        */
+       memset(pgtable, 0, PTE_FRAG_SIZE);
+       return old_pmd;
+}
+
+int has_transparent_hugepage(void)
+{
+       if (!mmu_has_feature(MMU_FTR_16M_PAGE))
+               return 0;
+       /*
+        * We support THP only if PMD_SIZE is 16MB.
+        */
+       if (mmu_psize_defs[MMU_PAGE_16M].shift != PMD_SHIFT)
+               return 0;
+       /*
+        * We need to make sure that we support a 16MB hugepage in a segment
+        * with a base page size of 64K or 4K. We only enable THP with a
+        * PAGE_SIZE of 64K.
+        */
+       /*
+        * If we have 64K HPTE, we will be using that by default
+        */
+       if (mmu_psize_defs[MMU_PAGE_64K].shift &&
+           (mmu_psize_defs[MMU_PAGE_64K].penc[MMU_PAGE_16M] == -1))
+               return 0;
+       /*
+        * Ok we only have 4K HPTE
+        */
+       if (mmu_psize_defs[MMU_PAGE_4K].penc[MMU_PAGE_16M] == -1)
+               return 0;
+
+       return 1;
+}
+#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
index 7c415ddde948b9dcf3ad11c54ad94c647da278b1..aa74acb0fdfcb441c60fb3499da96b90ca88e72a 100644 (file)
@@ -130,6 +130,53 @@ static void subpage_prot_clear(unsigned long addr, unsigned long len)
        up_write(&mm->mmap_sem);
 }
 
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+static int subpage_walk_pmd_entry(pmd_t *pmd, unsigned long addr,
+                                 unsigned long end, struct mm_walk *walk)
+{
+       struct vm_area_struct *vma = walk->private;
+       split_huge_page_pmd(vma, addr, pmd);
+       return 0;
+}
+
+static void subpage_mark_vma_nohuge(struct mm_struct *mm, unsigned long addr,
+                                   unsigned long len)
+{
+       struct vm_area_struct *vma;
+       struct mm_walk subpage_proto_walk = {
+               .mm = mm,
+               .pmd_entry = subpage_walk_pmd_entry,
+       };
+
+       /*
+        * We don't try too hard, we just mark all the vmas in that range
+        * VM_NOHUGEPAGE and split them.
+        */
+       vma = find_vma(mm, addr);
+       /*
+        * If the whole range falls in an unmapped area, just return.
+        */
+       if (vma && ((addr + len) <= vma->vm_start))
+               return;
+
+       while (vma) {
+               if (vma->vm_start >= (addr + len))
+                       break;
+               vma->vm_flags |= VM_NOHUGEPAGE;
+               subpage_proto_walk.private = vma;
+               walk_page_range(vma->vm_start, vma->vm_end,
+                               &subpage_proto_walk);
+               vma = vma->vm_next;
+       }
+}
+#else
+static void subpage_mark_vma_nohuge(struct mm_struct *mm, unsigned long addr,
+                                   unsigned long len)
+{
+       return;
+}
+#endif
+
 /*
  * Copy in a subpage protection map for an address range.
  * The map has 2 bits per 4k subpage, so 32 bits per 64k page.
@@ -168,6 +215,7 @@ long sys_subpage_prot(unsigned long addr, unsigned long len, u32 __user *map)
                return -EFAULT;
 
        down_write(&mm->mmap_sem);
+       subpage_mark_vma_nohuge(mm, addr, len);
        for (limit = addr + len; addr < limit; addr = next) {
                next = pmd_addr_end(addr, limit);
                err = -ENOMEM;
index 023ec8a13f38eed82e45fe37fc0f6e4be9518b83..313c85c5aa90b7657e2b9ff5effbb6ca2acf3847 100644 (file)
@@ -189,6 +189,7 @@ void tlb_flush(struct mmu_gather *tlb)
 void __flush_hash_table_range(struct mm_struct *mm, unsigned long start,
                              unsigned long end)
 {
+       int hugepage_shift;
        unsigned long flags;
 
        start = _ALIGN_DOWN(start, PAGE_SIZE);
@@ -206,7 +207,8 @@ void __flush_hash_table_range(struct mm_struct *mm, unsigned long start,
        local_irq_save(flags);
        arch_enter_lazy_mmu_mode();
        for (; start < end; start += PAGE_SIZE) {
-               pte_t *ptep = find_linux_pte(mm->pgd, start);
+               pte_t *ptep = find_linux_pte_or_hugepte(mm->pgd, start,
+                                                       &hugepage_shift);
                unsigned long pte;
 
                if (ptep == NULL)
@@ -214,7 +216,37 @@ void __flush_hash_table_range(struct mm_struct *mm, unsigned long start,
                pte = pte_val(*ptep);
                if (!(pte & _PAGE_HASHPTE))
                        continue;
-               hpte_need_flush(mm, start, ptep, pte, 0);
+               if (unlikely(hugepage_shift && pmd_trans_huge(*(pmd_t *)pte)))
+                       hpte_do_hugepage_flush(mm, start, (pmd_t *)pte);
+               else
+                       hpte_need_flush(mm, start, ptep, pte, 0);
+       }
+       arch_leave_lazy_mmu_mode();
+       local_irq_restore(flags);
+}
+
+void flush_tlb_pmd_range(struct mm_struct *mm, pmd_t *pmd, unsigned long addr)
+{
+       pte_t *pte;
+       pte_t *start_pte;
+       unsigned long flags;
+
+       addr = _ALIGN_DOWN(addr, PMD_SIZE);
+       /* Note: Normally, we should only ever use a batch within a
+        * PTE locked section. This violates the rule, but will work
+        * since we don't actually modify the PTEs; we just flush the
+        * hash while leaving the PTEs intact (including their reference
+        * to being hashed). This is not the most performance-oriented
+        * way to do things but is fine for our needs here.
+        */
+       local_irq_save(flags);
+       arch_enter_lazy_mmu_mode();
+       start_pte = pte_offset_map(pmd, addr);
+       for (pte = start_pte; pte < start_pte + PTRS_PER_PTE; pte++) {
+               unsigned long pteval = pte_val(*pte);
+               if (pteval & _PAGE_HASHPTE)
+                       hpte_need_flush(mm, addr, pte, pteval, 0);
+               addr += PAGE_SIZE;
        }
        arch_leave_lazy_mmu_mode();
        local_irq_restore(flags);
index 6888cad5103dedfe78755f2267fda7cf0cf664ab..41cd68dee68164c38f3436ee7a40e60326ecc8cb 100644 (file)
@@ -648,7 +648,7 @@ void __init early_init_mmu(void)
        __early_init_mmu(1);
 }
 
-void __cpuinit early_init_mmu_secondary(void)
+void early_init_mmu_secondary(void)
 {
        __early_init_mmu(0);
 }
index 29c6482890c88c89f8fbb159d5561b591b47c09f..a3985aee77fec1ecd2fe9756257e3a46b9643b94 100644 (file)
@@ -75,6 +75,11 @@ static unsigned int freeze_events_kernel = MMCR0_FCS;
 
 #define MMCR0_FCHV             0
 #define MMCR0_PMCjCE           MMCR0_PMCnCE
+#define MMCR0_FC56             0
+#define MMCR0_PMAO             0
+#define MMCR0_EBE              0
+#define MMCR0_PMCC             0
+#define MMCR0_PMCC_U6          0
 
 #define SPRN_MMCRA             SPRN_MMCR2
 #define MMCRA_SAMPLE_ENABLE    0
@@ -102,6 +107,15 @@ static inline int siar_valid(struct pt_regs *regs)
        return 1;
 }
 
+static bool is_ebb_event(struct perf_event *event) { return false; }
+static int ebb_event_check(struct perf_event *event) { return 0; }
+static void ebb_event_add(struct perf_event *event) { }
+static void ebb_switch_out(unsigned long mmcr0) { }
+static unsigned long ebb_switch_in(bool ebb, unsigned long mmcr0)
+{
+       return mmcr0;
+}
+
 static inline void power_pmu_bhrb_enable(struct perf_event *event) {}
 static inline void power_pmu_bhrb_disable(struct perf_event *event) {}
 void power_pmu_flush_branch_stack(void) {}
@@ -462,6 +476,89 @@ void power_pmu_bhrb_read(struct cpu_hw_events *cpuhw)
        return;
 }
 
+static bool is_ebb_event(struct perf_event *event)
+{
+       /*
+        * This could be a per-PMU callback, but we'd rather avoid the cost. We
+        * check that the PMU supports EBB, meaning those that don't can still
+        * use bit 63 of the event code for something else if they wish.
+        */
+       return (ppmu->flags & PPMU_EBB) &&
+              ((event->attr.config >> EVENT_CONFIG_EBB_SHIFT) & 1);
+}
+
+static int ebb_event_check(struct perf_event *event)
+{
+       struct perf_event *leader = event->group_leader;
+
+       /* Event and group leader must agree on EBB */
+       if (is_ebb_event(leader) != is_ebb_event(event))
+               return -EINVAL;
+
+       if (is_ebb_event(event)) {
+               if (!(event->attach_state & PERF_ATTACH_TASK))
+                       return -EINVAL;
+
+               if (!leader->attr.pinned || !leader->attr.exclusive)
+                       return -EINVAL;
+
+               if (event->attr.inherit || event->attr.sample_period ||
+                   event->attr.enable_on_exec || event->attr.freq)
+                       return -EINVAL;
+       }
+
+       return 0;
+}
+
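For context, a userspace consumer that satisfies these checks would open a
per-task, pinned and exclusive event with the EBB bit set. A hedged sketch
(the raw event code is a placeholder, and placing the EBB flag at bit 63 of
the config follows the is_ebb_event() comment above):

	/* Hypothetical userspace sketch: open an EBB event that would pass
	 * ebb_event_check(). Not taken from the kernel tree.
	 */
	#include <string.h>
	#include <unistd.h>
	#include <sys/syscall.h>
	#include <linux/perf_event.h>

	static int open_ebb_event(unsigned long long raw_event)
	{
		struct perf_event_attr attr;

		memset(&attr, 0, sizeof(attr));
		attr.type = PERF_TYPE_RAW;
		attr.size = sizeof(attr);
		attr.config = raw_event | (1ull << 63);	/* request EBB delivery */
		attr.pinned = 1;	/* group leader must be pinned ... */
		attr.exclusive = 1;	/* ... and exclusive */
		/* no inherit, sample_period, enable_on_exec or freq allowed */

		/* pid == 0, cpu == -1: attach to the current task, any CPU */
		return syscall(__NR_perf_event_open, &attr, 0, -1, -1, 0);
	}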
+static void ebb_event_add(struct perf_event *event)
+{
+       if (!is_ebb_event(event) || current->thread.used_ebb)
+               return;
+
+       /*
+        * IFF this is the first time we've added an EBB event, set
+        * PMXE in the user MMCR0 so we can detect when it's cleared by
+        * userspace. We need this so that we can context switch while
+        * userspace is in the EBB handler (where PMXE is 0).
+        */
+       current->thread.used_ebb = 1;
+       current->thread.mmcr0 |= MMCR0_PMXE;
+}
+
+static void ebb_switch_out(unsigned long mmcr0)
+{
+       if (!(mmcr0 & MMCR0_EBE))
+               return;
+
+       current->thread.siar  = mfspr(SPRN_SIAR);
+       current->thread.sier  = mfspr(SPRN_SIER);
+       current->thread.sdar  = mfspr(SPRN_SDAR);
+       current->thread.mmcr0 = mmcr0 & MMCR0_USER_MASK;
+       current->thread.mmcr2 = mfspr(SPRN_MMCR2) & MMCR2_USER_MASK;
+}
+
+static unsigned long ebb_switch_in(bool ebb, unsigned long mmcr0)
+{
+       if (!ebb)
+               goto out;
+
+       /* Enable EBB and read/write to all 6 PMCs for userspace */
+       mmcr0 |= MMCR0_EBE | MMCR0_PMCC_U6;
+
+       /* Add any bits from the user reg, FC or PMAO */
+       mmcr0 |= current->thread.mmcr0;
+
+       /* Be careful not to set PMXE if userspace had it cleared */
+       if (!(current->thread.mmcr0 & MMCR0_PMXE))
+               mmcr0 &= ~MMCR0_PMXE;
+
+       mtspr(SPRN_SIAR, current->thread.siar);
+       mtspr(SPRN_SIER, current->thread.sier);
+       mtspr(SPRN_SDAR, current->thread.sdar);
+       mtspr(SPRN_MMCR2, current->thread.mmcr2);
+out:
+       return mmcr0;
+}
 #endif /* CONFIG_PPC64 */
 
 static void perf_event_interrupt(struct pt_regs *regs);
@@ -732,6 +829,13 @@ static void power_pmu_read(struct perf_event *event)
 
        if (!event->hw.idx)
                return;
+
+       if (is_ebb_event(event)) {
+               val = read_pmc(event->hw.idx);
+               local64_set(&event->hw.prev_count, val);
+               return;
+       }
+
        /*
         * Performance monitor interrupts come even when interrupts
         * are soft-disabled, as long as interrupts are hard-enabled.
@@ -852,7 +956,7 @@ static void write_mmcr0(struct cpu_hw_events *cpuhw, unsigned long mmcr0)
 static void power_pmu_disable(struct pmu *pmu)
 {
        struct cpu_hw_events *cpuhw;
-       unsigned long flags;
+       unsigned long flags, mmcr0, val;
 
        if (!ppmu)
                return;
@@ -860,9 +964,6 @@ static void power_pmu_disable(struct pmu *pmu)
        cpuhw = &__get_cpu_var(cpu_hw_events);
 
        if (!cpuhw->disabled) {
-               cpuhw->disabled = 1;
-               cpuhw->n_added = 0;
-
                /*
                 * Check if we ever enabled the PMU on this cpu.
                 */
@@ -871,6 +972,21 @@ static void power_pmu_disable(struct pmu *pmu)
                        cpuhw->pmcs_enabled = 1;
                }
 
+               /*
+                * Set the 'freeze counters' bit, clear EBE/PMCC/PMAO/FC56.
+                */
+               val  = mmcr0 = mfspr(SPRN_MMCR0);
+               val |= MMCR0_FC;
+               val &= ~(MMCR0_EBE | MMCR0_PMCC | MMCR0_PMAO | MMCR0_FC56);
+
+               /*
+                * The barrier is to make sure the mtspr has been
+                * executed and the PMU has frozen the events etc.
+                * before we return.
+                */
+               write_mmcr0(cpuhw, val);
+               mb();
+
                /*
                 * Disable instruction sampling if it was enabled
                 */
@@ -880,15 +996,12 @@ static void power_pmu_disable(struct pmu *pmu)
                        mb();
                }
 
-               /*
-                * Set the 'freeze counters' bit.
-                * The barrier is to make sure the mtspr has been
-                * executed and the PMU has frozen the events
-                * before we return.
-                */
-               write_mmcr0(cpuhw, mfspr(SPRN_MMCR0) | MMCR0_FC);
-               mb();
+               cpuhw->disabled = 1;
+               cpuhw->n_added = 0;
+
+               ebb_switch_out(mmcr0);
        }
+
        local_irq_restore(flags);
 }
 
@@ -903,22 +1016,35 @@ static void power_pmu_enable(struct pmu *pmu)
        struct cpu_hw_events *cpuhw;
        unsigned long flags;
        long i;
-       unsigned long val;
+       unsigned long val, mmcr0;
        s64 left;
        unsigned int hwc_index[MAX_HWEVENTS];
        int n_lim;
        int idx;
+       bool ebb;
 
        if (!ppmu)
                return;
        local_irq_save(flags);
+
        cpuhw = &__get_cpu_var(cpu_hw_events);
-       if (!cpuhw->disabled) {
-               local_irq_restore(flags);
-               return;
+       if (!cpuhw->disabled)
+               goto out;
+
+       if (cpuhw->n_events == 0) {
+               ppc_set_pmu_inuse(0);
+               goto out;
        }
+
        cpuhw->disabled = 0;
 
+       /*
+        * EBB requires an exclusive group, and either all events have the EBB
+        * flag set or none do, so we can just check a single event. Also, we
+        * know we have at least one event.
+        */
+       ebb = is_ebb_event(cpuhw->event[0]);
+
        /*
         * If we didn't change anything, or only removed events,
         * no need to recalculate MMCR* settings and reset the PMCs.
@@ -928,8 +1054,6 @@ static void power_pmu_enable(struct pmu *pmu)
        if (!cpuhw->n_added) {
                mtspr(SPRN_MMCRA, cpuhw->mmcr[2] & ~MMCRA_SAMPLE_ENABLE);
                mtspr(SPRN_MMCR1, cpuhw->mmcr[1]);
-               if (cpuhw->n_events == 0)
-                       ppc_set_pmu_inuse(0);
                goto out_enable;
        }
 
@@ -996,25 +1120,34 @@ static void power_pmu_enable(struct pmu *pmu)
                        ++n_lim;
                        continue;
                }
-               val = 0;
-               if (event->hw.sample_period) {
-                       left = local64_read(&event->hw.period_left);
-                       if (left < 0x80000000L)
-                               val = 0x80000000L - left;
+
+               if (ebb)
+                       val = local64_read(&event->hw.prev_count);
+               else {
+                       val = 0;
+                       if (event->hw.sample_period) {
+                               left = local64_read(&event->hw.period_left);
+                               if (left < 0x80000000L)
+                                       val = 0x80000000L - left;
+                       }
+                       local64_set(&event->hw.prev_count, val);
                }
-               local64_set(&event->hw.prev_count, val);
+
                event->hw.idx = idx;
                if (event->hw.state & PERF_HES_STOPPED)
                        val = 0;
                write_pmc(idx, val);
+
                perf_event_update_userpage(event);
        }
        cpuhw->n_limited = n_lim;
        cpuhw->mmcr[0] |= MMCR0_PMXE | MMCR0_FCECE;
 
  out_enable:
+       mmcr0 = ebb_switch_in(ebb, cpuhw->mmcr[0]);
+
        mb();
-       write_mmcr0(cpuhw, cpuhw->mmcr[0]);
+       write_mmcr0(cpuhw, mmcr0);
 
        /*
         * Enable instruction sampling if necessary
@@ -1112,6 +1245,8 @@ static int power_pmu_add(struct perf_event *event, int ef_flags)
        event->hw.config = cpuhw->events[n0];
 
 nocheck:
+       ebb_event_add(event);
+
        ++cpuhw->n_events;
        ++cpuhw->n_added;
 
@@ -1472,6 +1607,11 @@ static int power_pmu_event_init(struct perf_event *event)
                }
        }
 
+       /* Extra checks for EBB */
+       err = ebb_event_check(event);
+       if (err)
+               return err;
+
        /*
         * If this is in a group, check if it can go on with all the
         * other hardware events in the group.  We assume the event
@@ -1510,6 +1650,13 @@ static int power_pmu_event_init(struct perf_event *event)
        event->hw.last_period = event->hw.sample_period;
        local64_set(&event->hw.period_left, event->hw.last_period);
 
+       /*
+        * For EBB events we just context switch the PMC value; we don't do any
+        * of the sample_period logic. We use hw.prev_count for this.
+        */
+       if (is_ebb_event(event))
+               local64_set(&event->hw.prev_count, 0);
+
        /*
         * See if we need to reserve the PMU.
         * If no events are currently in use, then we have to take a
@@ -1786,7 +1933,7 @@ static void power_pmu_setup(int cpu)
        cpuhw->mmcr[0] = MMCR0_FC;
 }
 
-static int __cpuinit
+static int
 power_pmu_notifier(struct notifier_block *self, unsigned long action, void *hcpu)
 {
        unsigned int cpu = (long)hcpu;
@@ -1803,7 +1950,7 @@ power_pmu_notifier(struct notifier_block *self, unsigned long action, void *hcpu
        return NOTIFY_OK;
 }
 
-int __cpuinit register_power_pmu(struct power_pmu *pmu)
+int register_power_pmu(struct power_pmu *pmu)
 {
        if (ppmu)
                return -EBUSY;          /* something's already registered */
index f7d1c4fff30308c5febe047665ff95486a9f8412..96a64d6a8bdf3dfbba1c5a57f16bcb38d3c8a7ce 100644 (file)
@@ -31,9 +31,9 @@
  *
  *        60        56        52        48        44        40        36        32
  * | - - - - | - - - - | - - - - | - - - - | - - - - | - - - - | - - - - | - - - - |
- *                                     [      thresh_cmp     ]   [  thresh_ctl   ]
- *                                                                       |
- *                                       thresh start/stop OR FAB match -*
+ *   |                                 [      thresh_cmp     ]   [  thresh_ctl   ]
+ *   |                                                                   |
+ *   *- EBB (Linux)                      thresh start/stop OR FAB match -*
  *
  *        28        24        20        16        12         8         4         0
  * | - - - - | - - - - | - - - - | - - - - | - - - - | - - - - | - - - - | - - - - |
@@ -85,6 +85,7 @@
  *
  */
 
+#define EVENT_EBB_MASK         1ull
 #define EVENT_THR_CMP_SHIFT    40      /* Threshold CMP value */
 #define EVENT_THR_CMP_MASK     0x3ff
 #define EVENT_THR_CTL_SHIFT    32      /* Threshold control value (start/stop) */
 #define EVENT_IS_MARKED                (EVENT_MARKED_MASK << EVENT_MARKED_SHIFT)
 #define EVENT_PSEL_MASK                0xff    /* PMCxSEL value */
 
+#define EVENT_VALID_MASK       \
+       ((EVENT_THRESH_MASK    << EVENT_THRESH_SHIFT)           |       \
+        (EVENT_SAMPLE_MASK    << EVENT_SAMPLE_SHIFT)           |       \
+        (EVENT_CACHE_SEL_MASK << EVENT_CACHE_SEL_SHIFT)        |       \
+        (EVENT_PMC_MASK       << EVENT_PMC_SHIFT)              |       \
+        (EVENT_UNIT_MASK      << EVENT_UNIT_SHIFT)             |       \
+        (EVENT_COMBINE_MASK   << EVENT_COMBINE_SHIFT)          |       \
+        (EVENT_MARKED_MASK    << EVENT_MARKED_SHIFT)           |       \
+        (EVENT_EBB_MASK       << EVENT_CONFIG_EBB_SHIFT)       |       \
+         EVENT_PSEL_MASK)
+
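As a hedged illustration of how these field definitions compose, a raw POWER8
event code can be assembled from its parts. demo_power8_event() below is a
hypothetical helper, not part of this file; it only uses the shift/mask names
defined here and masks the result with EVENT_VALID_MASK, which
power8_get_constraint() checks against further down.

	/* Illustration only: build a raw event code from the PMC, PSEL and EBB
	 * fields documented in the layout diagrams above.
	 */
	static inline u64 demo_power8_event(unsigned int pmc, unsigned int psel,
					    bool ebb)
	{
		u64 event = 0;

		event |= (u64)(pmc & EVENT_PMC_MASK) << EVENT_PMC_SHIFT;
		event |= psel & EVENT_PSEL_MASK;
		if (ebb)
			event |= (u64)EVENT_EBB_MASK << EVENT_CONFIG_EBB_SHIFT;

		return event & EVENT_VALID_MASK;
	}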
 /* MMCRA IFM bits - POWER8 */
 #define        POWER8_MMCRA_IFM1               0x0000000040000000UL
 #define        POWER8_MMCRA_IFM2               0x0000000080000000UL
  *
  *        28        24        20        16        12         8         4         0
  * | - - - - | - - - - | - - - - | - - - - | - - - - | - - - - | - - - - | - - - - |
- *                       [ ]   [  sample ]   [     ]   [6] [5]   [4] [3]   [2] [1]
- *                        |                     |
- *      L1 I/D qualifier -*                     |      Count of events for each PMC.
- *                                              |        p1, p2, p3, p4, p5, p6.
+ *                   |   [ ]   [  sample ]   [     ]   [6] [5]   [4] [3]   [2] [1]
+ *              EBB -*    |                     |
+ *                        |                     |      Count of events for each PMC.
+ *      L1 I/D qualifier -*                     |        p1, p2, p3, p4, p5, p6.
  *                     nc - number of counters -*
  *
  * The PMC fields P1..P6, and NC, are adder fields. As we accumulate constraints
 #define CNST_THRESH_VAL(v)     (((v) & EVENT_THRESH_MASK) << 32)
 #define CNST_THRESH_MASK       CNST_THRESH_VAL(EVENT_THRESH_MASK)
 
+#define CNST_EBB_VAL(v)                (((v) & EVENT_EBB_MASK) << 24)
+#define CNST_EBB_MASK          CNST_EBB_VAL(EVENT_EBB_MASK)
+
 #define CNST_L1_QUAL_VAL(v)    (((v) & 3) << 22)
 #define CNST_L1_QUAL_MASK      CNST_L1_QUAL_VAL(3)
 
@@ -207,14 +222,21 @@ static inline bool event_is_fab_match(u64 event)
 
 static int power8_get_constraint(u64 event, unsigned long *maskp, unsigned long *valp)
 {
-       unsigned int unit, pmc, cache;
+       unsigned int unit, pmc, cache, ebb;
        unsigned long mask, value;
 
        mask = value = 0;
 
-       pmc   = (event >> EVENT_PMC_SHIFT)       & EVENT_PMC_MASK;
-       unit  = (event >> EVENT_UNIT_SHIFT)      & EVENT_UNIT_MASK;
-       cache = (event >> EVENT_CACHE_SEL_SHIFT) & EVENT_CACHE_SEL_MASK;
+       if (event & ~EVENT_VALID_MASK)
+               return -1;
+
+       pmc   = (event >> EVENT_PMC_SHIFT)        & EVENT_PMC_MASK;
+       unit  = (event >> EVENT_UNIT_SHIFT)       & EVENT_UNIT_MASK;
+       cache = (event >> EVENT_CACHE_SEL_SHIFT)  & EVENT_CACHE_SEL_MASK;
+       ebb   = (event >> EVENT_CONFIG_EBB_SHIFT) & EVENT_EBB_MASK;
+
+       /* Clear the EBB bit in the event, so event checks work below */
+       event &= ~(EVENT_EBB_MASK << EVENT_CONFIG_EBB_SHIFT);
 
        if (pmc) {
                if (pmc > 6)
@@ -284,6 +306,18 @@ static int power8_get_constraint(u64 event, unsigned long *maskp, unsigned long
                value |= CNST_THRESH_VAL(event >> EVENT_THRESH_SHIFT);
        }
 
+       if (!pmc && ebb)
+               /* EBB events must specify the PMC */
+               return -1;
+
+       /*
+        * All events must agree on EBB: either all request it or none do.
+        * EBB events are pinned & exclusive, so this should never actually
+        * hit, but we leave it as a fallback in case.
+        */
+       mask  |= CNST_EBB_VAL(ebb);
+       value |= CNST_EBB_MASK;
+
        *maskp = mask;
        *valp = value;
 
@@ -378,6 +412,10 @@ static int power8_compute_mmcr(u64 event[], int n_ev,
        if (pmc_inuse & 0x7c)
                mmcr[0] |= MMCR0_PMCjCE;
 
+       /* If we're not using PMC 5 or 6, freeze them */
+       if (!(pmc_inuse & 0x60))
+               mmcr[0] |= MMCR0_FC56;
+
        mmcr[1] = mmcr1;
        mmcr[2] = mmcra;
 
@@ -574,7 +612,7 @@ static struct power_pmu power8_pmu = {
        .get_constraint         = power8_get_constraint,
        .get_alternatives       = power8_get_alternatives,
        .disable_pmc            = power8_disable_pmc,
-       .flags                  = PPMU_HAS_SSLOT | PPMU_HAS_SIER | PPMU_BHRB,
+       .flags                  = PPMU_HAS_SSLOT | PPMU_HAS_SIER | PPMU_BHRB | PPMU_EBB,
        .n_generic              = ARRAY_SIZE(power8_generic_events),
        .generic_events         = power8_generic_events,
        .attr_groups            = power8_pmu_attr_groups,
index ecd3890c40d72f11147a642fdb39c3389a6a7a38..7f1b71a01c6a4e936da38573b72654e6e6ef6163 100644 (file)
@@ -91,12 +91,12 @@ static void __init ppc47x_init_irq(void)
 }
 
 #ifdef CONFIG_SMP
-static void __cpuinit smp_ppc47x_setup_cpu(int cpu)
+static void smp_ppc47x_setup_cpu(int cpu)
 {
        mpic_setup_this_cpu();
 }
 
-static int __cpuinit smp_ppc47x_kick_cpu(int cpu)
+static int smp_ppc47x_kick_cpu(int cpu)
 {
        struct device_node *cpunode = of_get_cpu_node(cpu, NULL);
        const u64 *spin_table_addr_prop;
@@ -176,13 +176,48 @@ static int __init ppc47x_probe(void)
        return 1;
 }
 
+static int board_rev = -1;
+static int __init ppc47x_get_board_rev(void)
+{
+       u8 fpga_reg0;
+       void *fpga;
+       struct device_node *np;
+
+       np = of_find_compatible_node(NULL, NULL, "ibm,currituck-fpga");
+       if (!np)
+               goto fail;
+
+       fpga = of_iomap(np, 0);
+       of_node_put(np);
+       if (!fpga)
+               goto fail;
+
+       fpga_reg0 = ioread8(fpga);
+       board_rev = fpga_reg0 & 0x03;
+       pr_info("%s: Found board revision %d\n", __func__, board_rev);
+       iounmap(fpga);
+       return 0;
+
+fail:
+       pr_info("%s: Unable to find board revision\n", __func__);
+       return 0;
+}
+machine_arch_initcall(ppc47x, ppc47x_get_board_rev);
+
 /* The USB controller should have been hardware swizzled but it wasn't :( */
 static void ppc47x_pci_irq_fixup(struct pci_dev *dev)
 {
        if (dev->vendor == 0x1033 && (dev->device == 0x0035 ||
                                      dev->device == 0x00e0)) {
-               dev->irq = irq_create_mapping(NULL, 47);
-               pr_info("%s: Mapping irq 47 %d\n", __func__, dev->irq);
+               if (board_rev == 0) {
+                       dev->irq = irq_create_mapping(NULL, 47);
+                       pr_info("%s: Mapping irq %d\n", __func__, dev->irq);
+               } else if (board_rev == 2) {
+                       dev->irq = irq_create_mapping(NULL, 49);
+                       pr_info("%s: Mapping irq %d\n", __func__, dev->irq);
+               } else {
+                       pr_alert("%s: Unknown board revision\n", __func__);
+               }
        }
 }
 
index a28a8629727eed875133f168a7951b9337e58802..4241bc825800d72450bc994bacf2a4203fd9aa63 100644 (file)
@@ -81,12 +81,12 @@ static void __init iss4xx_init_irq(void)
 }
 
 #ifdef CONFIG_SMP
-static void __cpuinit smp_iss4xx_setup_cpu(int cpu)
+static void smp_iss4xx_setup_cpu(int cpu)
 {
        mpic_setup_this_cpu();
 }
 
-static int __cpuinit smp_iss4xx_kick_cpu(int cpu)
+static int smp_iss4xx_kick_cpu(int cpu)
 {
        struct device_node *cpunode = of_get_cpu_node(cpu, NULL);
        const u64 *spin_table_addr_prop;
index 6a1759939c6ba47fff97cae12ecc50a79643c606..5ced4f5bb2b2e70dbc4a2a90b1a4679b5ae802a2 100644 (file)
@@ -99,7 +99,7 @@ static void mpc85xx_take_timebase(void)
 }
 
 #ifdef CONFIG_HOTPLUG_CPU
-static void __cpuinit smp_85xx_mach_cpu_die(void)
+static void smp_85xx_mach_cpu_die(void)
 {
        unsigned int cpu = smp_processor_id();
        u32 tmp;
@@ -141,7 +141,7 @@ static inline u32 read_spin_table_addr_l(void *spin_table)
        return in_be32(&((struct epapr_spin_table *)spin_table)->addr_l);
 }
 
-static int __cpuinit smp_85xx_kick_cpu(int nr)
+static int smp_85xx_kick_cpu(int nr)
 {
        unsigned long flags;
        const u64 *cpu_rel_addr;
@@ -362,7 +362,7 @@ static void mpc85xx_smp_machine_kexec(struct kimage *image)
 }
 #endif /* CONFIG_KEXEC */
 
-static void __cpuinit smp_85xx_setup_cpu(int cpu_nr)
+static void smp_85xx_setup_cpu(int cpu_nr)
 {
        if (smp_85xx_ops.probe == smp_mpic_probe)
                mpic_setup_this_cpu();
index 1e121088826fbd85316e076bcb1068a1706fba66..806cbbd86ec661fed502084c5cc514a06966185c 100644 (file)
@@ -43,6 +43,7 @@ static irqreturn_t timebase_interrupt(int irq, void *dev)
 
 static struct irqaction tbint_irqaction = {
        .handler = timebase_interrupt,
+       .flags = IRQF_NO_THREAD,
        .name = "tbint",
 };
 
index b62aab3e22ecd0447caf0db7ff97c2c16bc12644..bed8c607588c04685bc21c812503b333a5b0d8f6 100644 (file)
@@ -164,6 +164,11 @@ config IBMEBUS
        help
          Bus device driver for GX bus based adapters.
 
+config EEH
+       bool
+       depends on (PPC_POWERNV || PPC_PSERIES) && PCI
+       default y
+
 config PPC_MPC106
        bool
        default n
index 54f3936001aa1a51a1117b338a4ff06963d27411..ae0aaea9e09826178eba7ec26d133cd97d042a97 100644 (file)
@@ -71,6 +71,7 @@ config PPC_BOOK3S_64
        select PPC_FPU
        select PPC_HAVE_PMU_SUPPORT
        select SYS_SUPPORTS_HUGETLBFS
+       select HAVE_ARCH_TRANSPARENT_HUGEPAGE if PPC_64K_PAGES
 
 config PPC_BOOK3E_64
        bool "Embedded processors"
index 246e1d8b3af3bf45877476a1df28e13caefc04ce..c34ee4e608734f387a0e43ed87abc1ba27e6dd27 100644 (file)
@@ -185,7 +185,8 @@ static void beat_lpar_hptab_clear(void)
 static long beat_lpar_hpte_updatepp(unsigned long slot,
                                    unsigned long newpp,
                                    unsigned long vpn,
-                                   int psize, int ssize, int local)
+                                   int psize, int apsize,
+                                   int ssize, int local)
 {
        unsigned long lpar_rc;
        u64 dummy0, dummy1;
@@ -274,7 +275,8 @@ static void beat_lpar_hpte_updateboltedpp(unsigned long newpp,
 }
 
 static void beat_lpar_hpte_invalidate(unsigned long slot, unsigned long vpn,
-                                        int psize, int ssize, int local)
+                                     int psize, int apsize,
+                                     int ssize, int local)
 {
        unsigned long want_v;
        unsigned long lpar_rc;
@@ -364,9 +366,10 @@ static long beat_lpar_hpte_insert_v3(unsigned long hpte_group,
  * already zero.  For now I am paranoid.
  */
 static long beat_lpar_hpte_updatepp_v3(unsigned long slot,
-                                   unsigned long newpp,
-                                   unsigned long vpn,
-                                   int psize, int ssize, int local)
+                                      unsigned long newpp,
+                                      unsigned long vpn,
+                                      int psize, int apsize,
+                                      int ssize, int local)
 {
        unsigned long lpar_rc;
        unsigned long want_v;
@@ -394,7 +397,8 @@ static long beat_lpar_hpte_updatepp_v3(unsigned long slot,
 }
 
 static void beat_lpar_hpte_invalidate_v3(unsigned long slot, unsigned long vpn,
-                                        int psize, int ssize, int local)
+                                        int psize, int apsize,
+                                        int ssize, int local)
 {
        unsigned long want_v;
        unsigned long lpar_rc;
index d35dbbc8ec7919fe12287ad075b12c40312b4ffc..f75f6fcac7296267d44b8a28d3d2b16446878acc 100644 (file)
@@ -142,7 +142,7 @@ static int smp_cell_cpu_bootable(unsigned int nr)
         * during boot if the user requests it.  Odd-numbered
         * cpus are assumed to be secondary threads.
         */
-       if (system_state < SYSTEM_RUNNING &&
+       if (system_state == SYSTEM_BOOTING &&
            cpu_has_feature(CPU_FTR_SMT) &&
            !smt_enabled_at_boot && cpu_thread_in_core(nr) != 0)
                return 0;
index bdb738a69e41890b5bec785ee44257f26cb6f987..49c9f9501c21342e915bfb3c2456fe533fe2f03d 100644 (file)
@@ -885,7 +885,7 @@ static int smp_core99_cpu_notify(struct notifier_block *self,
        return NOTIFY_OK;
 }
 
-static struct notifier_block __cpuinitdata smp_core99_cpu_nb = {
+static struct notifier_block smp_core99_cpu_nb = {
        .notifier_call  = smp_core99_cpu_notify,
 };
 #endif /* CONFIG_HOTPLUG_CPU */
index bcc3cb48a44ee341dc7fb871387ff3cf391fe90a..7fe595152478a08a756d89277a993cf2650d42ef 100644 (file)
@@ -3,3 +3,4 @@ obj-y                   += opal-rtc.o opal-nvram.o
 
 obj-$(CONFIG_SMP)      += smp.o
 obj-$(CONFIG_PCI)      += pci.o pci-p5ioc2.o pci-ioda.o
+obj-$(CONFIG_EEH)      += eeh-ioda.o eeh-powernv.o
diff --git a/arch/powerpc/platforms/powernv/eeh-ioda.c b/arch/powerpc/platforms/powernv/eeh-ioda.c
new file mode 100644 (file)
index 0000000..0cd1c4a
--- /dev/null
@@ -0,0 +1,916 @@
+/*
+ * This file implements the functions needed by EEH on IODA-compliant
+ * chips. Most of the EEH-related functionality is built on top of the
+ * OPAL APIs.
+ *
+ * Copyright Benjamin Herrenschmidt & Gavin Shan, IBM Corporation 2013.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ */
+
+#include <linux/bootmem.h>
+#include <linux/debugfs.h>
+#include <linux/delay.h>
+#include <linux/init.h>
+#include <linux/io.h>
+#include <linux/irq.h>
+#include <linux/kernel.h>
+#include <linux/msi.h>
+#include <linux/notifier.h>
+#include <linux/pci.h>
+#include <linux/string.h>
+
+#include <asm/eeh.h>
+#include <asm/eeh_event.h>
+#include <asm/io.h>
+#include <asm/iommu.h>
+#include <asm/msi_bitmap.h>
+#include <asm/opal.h>
+#include <asm/pci-bridge.h>
+#include <asm/ppc-pci.h>
+#include <asm/tce.h>
+
+#include "powernv.h"
+#include "pci.h"
+
+/* Debugging option */
+#ifdef IODA_EEH_DBG_ON
+#define IODA_EEH_DBG(args...)  pr_info(args)
+#else
+#define IODA_EEH_DBG(args...)
+#endif
+
+static char *hub_diag = NULL;
+static int ioda_eeh_nb_init = 0;
+
+static int ioda_eeh_event(struct notifier_block *nb,
+                         unsigned long events, void *change)
+{
+       uint64_t changed_evts = (uint64_t)change;
+
+       /* We simply send special EEH event */
+       if ((changed_evts & OPAL_EVENT_PCI_ERROR) &&
+           (events & OPAL_EVENT_PCI_ERROR))
+               eeh_send_failure_event(NULL);
+
+       return 0;
+}
+
+static struct notifier_block ioda_eeh_nb = {
+       .notifier_call  = ioda_eeh_event,
+       .next           = NULL,
+       .priority       = 0
+};
+
+#ifdef CONFIG_DEBUG_FS
+static int ioda_eeh_dbgfs_set(void *data, u64 val)
+{
+       struct pci_controller *hose = data;
+       struct pnv_phb *phb = hose->private_data;
+
+       out_be64(phb->regs + 0xD10, val);
+       return 0;
+}
+
+static int ioda_eeh_dbgfs_get(void *data, u64 *val)
+{
+       struct pci_controller *hose = data;
+       struct pnv_phb *phb = hose->private_data;
+
+       *val = in_be64(phb->regs + 0xD10);
+       return 0;
+}
+
+DEFINE_SIMPLE_ATTRIBUTE(ioda_eeh_dbgfs_ops, ioda_eeh_dbgfs_get,
+                       ioda_eeh_dbgfs_set, "0x%llx\n");
+#endif /* CONFIG_DEBUG_FS */
+
+/**
+ * ioda_eeh_post_init - Chip dependent post initialization
+ * @hose: PCI controller
+ *
+ * The function will be called after the EEH PEs and devices
+ * have been built. That means EEH is ready to provide service,
+ * with the I/O address cache in place.
+ */
+static int ioda_eeh_post_init(struct pci_controller *hose)
+{
+       struct pnv_phb *phb = hose->private_data;
+       int ret;
+
+       /* Register OPAL event notifier */
+       if (!ioda_eeh_nb_init) {
+               ret = opal_notifier_register(&ioda_eeh_nb);
+               if (ret) {
+                       pr_err("%s: Can't register OPAL event notifier (%d)\n",
+                              __func__, ret);
+                       return ret;
+               }
+
+               ioda_eeh_nb_init = 1;
+       }
+
+       /* FIXME: Enable it for PHB3 later */
+       if (phb->type == PNV_PHB_IODA1) {
+               if (!hub_diag) {
+                       hub_diag = (char *)__get_free_page(GFP_KERNEL |
+                                                          __GFP_ZERO);
+                       if (!hub_diag) {
+                               pr_err("%s: Out of memory !\n",
+                                      __func__);
+                               return -ENOMEM;
+                       }
+               }
+
+#ifdef CONFIG_DEBUG_FS
+               if (phb->dbgfs)
+                       debugfs_create_file("err_injct", 0600,
+                                           phb->dbgfs, hose,
+                                           &ioda_eeh_dbgfs_ops);
+#endif
+
+               phb->eeh_state |= PNV_EEH_STATE_ENABLED;
+       }
+
+       return 0;
+}
+
+/**
+ * ioda_eeh_set_option - Set EEH operation or I/O setting
+ * @pe: EEH PE
+ * @option: options
+ *
+ * Enable or disable the EEH option for the indicated PE. The
+ * function can also be used to re-enable I/O or DMA for the
+ * PE.
+ */
+static int ioda_eeh_set_option(struct eeh_pe *pe, int option)
+{
+       s64 ret;
+       u32 pe_no;
+       struct pci_controller *hose = pe->phb;
+       struct pnv_phb *phb = hose->private_data;
+
+       /* Check on PE number */
+       if (pe->addr < 0 || pe->addr >= phb->ioda.total_pe) {
+               pr_err("%s: PE address %x out of range [0, %x] "
+                      "on PHB#%x\n",
+                       __func__, pe->addr, phb->ioda.total_pe,
+                       hose->global_number);
+               return -EINVAL;
+       }
+
+       pe_no = pe->addr;
+       switch (option) {
+       case EEH_OPT_DISABLE:
+               ret = -EEXIST;
+               break;
+       case EEH_OPT_ENABLE:
+               ret = 0;
+               break;
+       case EEH_OPT_THAW_MMIO:
+               ret = opal_pci_eeh_freeze_clear(phb->opal_id, pe_no,
+                               OPAL_EEH_ACTION_CLEAR_FREEZE_MMIO);
+               if (ret) {
+                       pr_warning("%s: Failed to enable MMIO for "
+                                  "PHB#%x-PE#%x, err=%lld\n",
+                               __func__, hose->global_number, pe_no, ret);
+                       return -EIO;
+               }
+
+               break;
+       case EEH_OPT_THAW_DMA:
+               ret = opal_pci_eeh_freeze_clear(phb->opal_id, pe_no,
+                               OPAL_EEH_ACTION_CLEAR_FREEZE_DMA);
+               if (ret) {
+                       pr_warning("%s: Failed to enable DMA for "
+                                  "PHB#%x-PE#%x, err=%lld\n",
+                               __func__, hose->global_number, pe_no, ret);
+                       return -EIO;
+               }
+
+               break;
+       default:
+               pr_warning("%s: Invalid option %d\n", __func__, option);
+               return -EINVAL;
+       }
+
+       return ret;
+}
+
+/**
+ * ioda_eeh_get_state - Retrieve the state of PE
+ * @pe: EEH PE
+ *
+ * The PE's state should be retrieved from the PEEV and PEST
+ * IODA tables. Since OPAL exports a function to do that, we
+ * had better use it.
+ */
+static int ioda_eeh_get_state(struct eeh_pe *pe)
+{
+       s64 ret = 0;
+       u8 fstate;
+       u16 pcierr;
+       u32 pe_no;
+       int result;
+       struct pci_controller *hose = pe->phb;
+       struct pnv_phb *phb = hose->private_data;
+
+       /*
+        * Sanity check on PE address. The PHB PE address should
+        * be zero.
+        */
+       if (pe->addr < 0 || pe->addr >= phb->ioda.total_pe) {
+               pr_err("%s: PE address %x out of range [0, %x] "
+                      "on PHB#%x\n",
+                      __func__, pe->addr, phb->ioda.total_pe,
+                      hose->global_number);
+               return EEH_STATE_NOT_SUPPORT;
+       }
+
+       /* Retrieve PE status through OPAL */
+       pe_no = pe->addr;
+       ret = opal_pci_eeh_freeze_status(phb->opal_id, pe_no,
+                       &fstate, &pcierr, NULL);
+       if (ret) {
+               pr_err("%s: Failed to get EEH status on "
+                      "PHB#%x-PE#%x\n, err=%lld\n",
+                      __func__, hose->global_number, pe_no, ret);
+               return EEH_STATE_NOT_SUPPORT;
+       }
+
+       /* Check PHB status */
+       if (pe->type & EEH_PE_PHB) {
+               result = 0;
+               result &= ~EEH_STATE_RESET_ACTIVE;
+
+               if (pcierr != OPAL_EEH_PHB_ERROR) {
+                       result |= EEH_STATE_MMIO_ACTIVE;
+                       result |= EEH_STATE_DMA_ACTIVE;
+                       result |= EEH_STATE_MMIO_ENABLED;
+                       result |= EEH_STATE_DMA_ENABLED;
+               }
+
+               return result;
+       }
+
+       /* Parse result out */
+       result = 0;
+       switch (fstate) {
+       case OPAL_EEH_STOPPED_NOT_FROZEN:
+               result &= ~EEH_STATE_RESET_ACTIVE;
+               result |= EEH_STATE_MMIO_ACTIVE;
+               result |= EEH_STATE_DMA_ACTIVE;
+               result |= EEH_STATE_MMIO_ENABLED;
+               result |= EEH_STATE_DMA_ENABLED;
+               break;
+       case OPAL_EEH_STOPPED_MMIO_FREEZE:
+               result &= ~EEH_STATE_RESET_ACTIVE;
+               result |= EEH_STATE_DMA_ACTIVE;
+               result |= EEH_STATE_DMA_ENABLED;
+               break;
+       case OPAL_EEH_STOPPED_DMA_FREEZE:
+               result &= ~EEH_STATE_RESET_ACTIVE;
+               result |= EEH_STATE_MMIO_ACTIVE;
+               result |= EEH_STATE_MMIO_ENABLED;
+               break;
+       case OPAL_EEH_STOPPED_MMIO_DMA_FREEZE:
+               result &= ~EEH_STATE_RESET_ACTIVE;
+               break;
+       case OPAL_EEH_STOPPED_RESET:
+               result |= EEH_STATE_RESET_ACTIVE;
+               break;
+       case OPAL_EEH_STOPPED_TEMP_UNAVAIL:
+               result |= EEH_STATE_UNAVAILABLE;
+               break;
+       case OPAL_EEH_STOPPED_PERM_UNAVAIL:
+               result |= EEH_STATE_NOT_SUPPORT;
+               break;
+       default:
+               pr_warning("%s: Unexpected EEH status 0x%x "
+                          "on PHB#%x-PE#%x\n",
+                          __func__, fstate, hose->global_number, pe_no);
+       }
+
+       return result;
+}
+
+static int ioda_eeh_pe_clear(struct eeh_pe *pe)
+{
+       struct pci_controller *hose;
+       struct pnv_phb *phb;
+       u32 pe_no;
+       u8 fstate;
+       u16 pcierr;
+       s64 ret;
+
+       pe_no = pe->addr;
+       hose = pe->phb;
+       phb = pe->phb->private_data;
+
+       /* Clear the EEH error on the PE */
+       ret = opal_pci_eeh_freeze_clear(phb->opal_id,
+                       pe_no, OPAL_EEH_ACTION_CLEAR_FREEZE_ALL);
+       if (ret) {
+               pr_err("%s: Failed to clear EEH error for "
+                      "PHB#%x-PE#%x, err=%lld\n",
+                      __func__, hose->global_number, pe_no, ret);
+               return -EIO;
+       }
+
+       /*
+        * Read the PE state back and verify that the frozen
+        * state has been removed.
+        */
+       ret = opal_pci_eeh_freeze_status(phb->opal_id, pe_no,
+                       &fstate, &pcierr, NULL);
+       if (ret) {
+               pr_err("%s: Failed to get EEH status on "
+                      "PHB#%x-PE#%x\n, err=%lld\n",
+                      __func__, hose->global_number, pe_no, ret);
+               return -EIO;
+       }
+
+       if (fstate != OPAL_EEH_STOPPED_NOT_FROZEN) {
+               pr_err("%s: Frozen state not cleared on "
+                      "PHB#%x-PE#%x, sts=%x\n",
+                      __func__, hose->global_number, pe_no, fstate);
+               return -EIO;
+       }
+
+       return 0;
+}
+
+static s64 ioda_eeh_phb_poll(struct pnv_phb *phb)
+{
+       s64 rc = OPAL_HARDWARE;
+
+       while (1) {
+               rc = opal_pci_poll(phb->opal_id);
+               if (rc <= 0)
+                       break;
+
+               msleep(rc);
+       }
+
+       return rc;
+}
+
+static int ioda_eeh_phb_reset(struct pci_controller *hose, int option)
+{
+       struct pnv_phb *phb = hose->private_data;
+       s64 rc = OPAL_HARDWARE;
+
+       pr_debug("%s: Reset PHB#%x, option=%d\n",
+                __func__, hose->global_number, option);
+
+       /* Issue PHB complete reset request */
+       if (option == EEH_RESET_FUNDAMENTAL ||
+           option == EEH_RESET_HOT)
+               rc = opal_pci_reset(phb->opal_id,
+                               OPAL_PHB_COMPLETE,
+                               OPAL_ASSERT_RESET);
+       else if (option == EEH_RESET_DEACTIVATE)
+               rc = opal_pci_reset(phb->opal_id,
+                               OPAL_PHB_COMPLETE,
+                               OPAL_DEASSERT_RESET);
+       if (rc < 0)
+               goto out;
+
+       /*
+        * Poll state of the PHB until the request is done
+        * successfully.
+        */
+       rc = ioda_eeh_phb_poll(phb);
+out:
+       if (rc != OPAL_SUCCESS)
+               return -EIO;
+
+       return 0;
+}
+
+static int ioda_eeh_root_reset(struct pci_controller *hose, int option)
+{
+       struct pnv_phb *phb = hose->private_data;
+       s64 rc = OPAL_SUCCESS;
+
+       pr_debug("%s: Reset PHB#%x, option=%d\n",
+                __func__, hose->global_number, option);
+
+       /*
+        * During reset deassert, we needn't care about the reset
+        * scope because the firmware does nothing different for a
+        * fundamental or hot reset during the deassert phase.
+        */
+       if (option == EEH_RESET_FUNDAMENTAL)
+               rc = opal_pci_reset(phb->opal_id,
+                               OPAL_PCI_FUNDAMENTAL_RESET,
+                               OPAL_ASSERT_RESET);
+       else if (option == EEH_RESET_HOT)
+               rc = opal_pci_reset(phb->opal_id,
+                               OPAL_PCI_HOT_RESET,
+                               OPAL_ASSERT_RESET);
+       else if (option == EEH_RESET_DEACTIVATE)
+               rc = opal_pci_reset(phb->opal_id,
+                               OPAL_PCI_HOT_RESET,
+                               OPAL_DEASSERT_RESET);
+       if (rc < 0)
+               goto out;
+
+       /* Poll state of the PHB until the request is done */
+       rc = ioda_eeh_phb_poll(phb);
+out:
+       if (rc != OPAL_SUCCESS)
+               return -EIO;
+
+       return 0;
+}
+
+static int ioda_eeh_bridge_reset(struct pci_controller *hose,
+               struct pci_dev *dev, int option)
+{
+       u16 ctrl;
+
+       pr_debug("%s: Reset device %04x:%02x:%02x.%01x with option %d\n",
+                __func__, hose->global_number, dev->bus->number,
+                PCI_SLOT(dev->devfn), PCI_FUNC(dev->devfn), option);
+
+       switch (option) {
+       case EEH_RESET_FUNDAMENTAL:
+       case EEH_RESET_HOT:
+               pci_read_config_word(dev, PCI_BRIDGE_CONTROL, &ctrl);
+               ctrl |= PCI_BRIDGE_CTL_BUS_RESET;
+               pci_write_config_word(dev, PCI_BRIDGE_CONTROL, ctrl);
+               break;
+       case EEH_RESET_DEACTIVATE:
+               pci_read_config_word(dev, PCI_BRIDGE_CONTROL, &ctrl);
+               ctrl &= ~PCI_BRIDGE_CTL_BUS_RESET;
+               pci_write_config_word(dev, PCI_BRIDGE_CONTROL, ctrl);
+               break;
+       }
+
+       return 0;
+}
+
+/**
+ * ioda_eeh_reset - Reset the indicated PE
+ * @pe: EEH PE
+ * @option: reset option
+ *
+ * Do a reset on the indicated PE. For a PCI-bus-sensitive PE,
+ * we need to reset the parent p2p bridge. The PHB has to be
+ * reinitialized if the p2p bridge is the root bridge. For a
+ * PCI-device-sensitive PE, we will try to reset the device
+ * through FLR. For now, we don't have OPAL APIs to do a HARD
+ * reset yet, so all resets are SOFT (HOT) resets.
+ */
+static int ioda_eeh_reset(struct eeh_pe *pe, int option)
+{
+       struct pci_controller *hose = pe->phb;
+       struct eeh_dev *edev;
+       struct pci_dev *dev;
+       int ret;
+
+       /*
+        * Either way, we have to clear the problematic state for the
+        * corresponding PE. However, we needn't do it if the PE is
+        * PHB-associated: that means the PHB has fatal errors and
+        * needs a reset. Furthermore, the AIB interface isn't
+        * reliable any more in that case.
+        */
+       if (!(pe->type & EEH_PE_PHB) &&
+           (option == EEH_RESET_HOT ||
+           option == EEH_RESET_FUNDAMENTAL)) {
+               ret = ioda_eeh_pe_clear(pe);
+               if (ret)
+                       return -EIO;
+       }
+
+       /*
+        * The rules for reset, either fundamental or hot:
+        *
+        * We always reset the direct upstream bridge of the PE. If that
+        * bridge isn't the root bridge, we always take a hot reset no
+        * matter which option (fundamental or hot) was requested.
+        * Otherwise, the reset follows the requested option.
+        */
+       if (pe->type & EEH_PE_PHB) {
+               ret = ioda_eeh_phb_reset(hose, option);
+       } else {
+               if (pe->type & EEH_PE_DEVICE) {
+                       /*
+                        * For a device PE, the parent PCI bus isn't tracked
+                        * directly, so figure it out from the first EEH device.
+                        */
+                       edev = list_first_entry(&pe->edevs,
+                                       struct eeh_dev, list);
+                       dev = eeh_dev_to_pci_dev(edev);
+                       dev = dev->bus->self;
+               } else {
+                       /*
+                        * For a bus PE, the parent PCI bus is already known,
+                        * so just pick it up.
+                        */
+                       dev = pe->bus->self;
+               }
+
+               /*
+                * Choose the reset method based on whether the direct
+                * upstream bridge is the root bridge (port) or not.
+                */
+               if (dev->bus->number == 0)
+                       ret = ioda_eeh_root_reset(hose, option);
+               else
+                       ret = ioda_eeh_bridge_reset(hose, dev, option);
+       }
+
+       return ret;
+}
+
+/**
+ * ioda_eeh_get_log - Retrieve error log
+ * @pe: EEH PE
+ * @severity: Severity level of the log
+ * @drv_log: buffer to store the log
+ * @len: space of the log buffer
+ *
+ * The function is used to retrieve error log from P7IOC.
+ */
+static int ioda_eeh_get_log(struct eeh_pe *pe, int severity,
+                           char *drv_log, unsigned long len)
+{
+       s64 ret;
+       unsigned long flags;
+       struct pci_controller *hose = pe->phb;
+       struct pnv_phb *phb = hose->private_data;
+
+       spin_lock_irqsave(&phb->lock, flags);
+
+       ret = opal_pci_get_phb_diag_data2(phb->opal_id,
+                       phb->diag.blob, PNV_PCI_DIAG_BUF_SIZE);
+       if (ret) {
+               spin_unlock_irqrestore(&phb->lock, flags);
+               pr_warning("%s: Failed to get log for PHB#%x-PE#%x\n",
+                          __func__, hose->global_number, pe->addr);
+               return -EIO;
+       }
+
+       /*
+        * FIXME: We probably need to log the error somewhere;
+        * let's sort that out in the future.
+        */
+       /* pr_info("%s", phb->diag.blob); */
+
+       spin_unlock_irqrestore(&phb->lock, flags);
+
+       return 0;
+}
+
+/**
+ * ioda_eeh_configure_bridge - Configure the PCI bridges for the indicated PE
+ * @pe: EEH PE
+ *
+ * A particular PE might include PCI bridges, and those bridges have
+ * to be configured correctly for the PE to work properly. On P7IOC,
+ * however, nothing needs to be done here because the reset function
+ * already covers everything this function would have to do.
+ */
+static int ioda_eeh_configure_bridge(struct eeh_pe *pe)
+{
+       return 0;
+}
+
+static void ioda_eeh_hub_diag_common(struct OpalIoP7IOCErrorData *data)
+{
+       /* GEM */
+       pr_info("  GEM XFIR:        %016llx\n", data->gemXfir);
+       pr_info("  GEM RFIR:        %016llx\n", data->gemRfir);
+       pr_info("  GEM RIRQFIR:     %016llx\n", data->gemRirqfir);
+       pr_info("  GEM Mask:        %016llx\n", data->gemMask);
+       pr_info("  GEM RWOF:        %016llx\n", data->gemRwof);
+
+       /* LEM */
+       pr_info("  LEM FIR:         %016llx\n", data->lemFir);
+       pr_info("  LEM Error Mask:  %016llx\n", data->lemErrMask);
+       pr_info("  LEM Action 0:    %016llx\n", data->lemAction0);
+       pr_info("  LEM Action 1:    %016llx\n", data->lemAction1);
+       pr_info("  LEM WOF:         %016llx\n", data->lemWof);
+}
+
+static void ioda_eeh_hub_diag(struct pci_controller *hose)
+{
+       struct pnv_phb *phb = hose->private_data;
+       struct OpalIoP7IOCErrorData *data;
+       long rc;
+
+       data = (struct OpalIoP7IOCErrorData *)ioda_eeh_hub_diag;
+       rc = opal_pci_get_hub_diag_data(phb->hub_id, data, PAGE_SIZE);
+       if (rc != OPAL_SUCCESS) {
+               pr_warning("%s: Failed to get HUB#%llx diag-data (%ld)\n",
+                          __func__, phb->hub_id, rc);
+               return;
+       }
+
+       switch (data->type) {
+       case OPAL_P7IOC_DIAG_TYPE_RGC:
+               pr_info("P7IOC diag-data for RGC\n\n");
+               ioda_eeh_hub_diag_common(data);
+               pr_info("  RGC Status:      %016llx\n", data->rgc.rgcStatus);
+               pr_info("  RGC LDCP:        %016llx\n", data->rgc.rgcLdcp);
+               break;
+       case OPAL_P7IOC_DIAG_TYPE_BI:
+               pr_info("P7IOC diag-data for BI %s\n\n",
+                       data->bi.biDownbound ? "Downbound" : "Upbound");
+               ioda_eeh_hub_diag_common(data);
+               pr_info("  BI LDCP 0:       %016llx\n", data->bi.biLdcp0);
+               pr_info("  BI LDCP 1:       %016llx\n", data->bi.biLdcp1);
+               pr_info("  BI LDCP 2:       %016llx\n", data->bi.biLdcp2);
+               pr_info("  BI Fence Status: %016llx\n", data->bi.biFenceStatus);
+               break;
+       case OPAL_P7IOC_DIAG_TYPE_CI:
+               pr_info("P7IOC diag-data for CI Port %d\\nn",
+                       data->ci.ciPort);
+               ioda_eeh_hub_diag_common(data);
+               pr_info("  CI Port Status:  %016llx\n", data->ci.ciPortStatus);
+               pr_info("  CI Port LDCP:    %016llx\n", data->ci.ciPortLdcp);
+               break;
+       case OPAL_P7IOC_DIAG_TYPE_MISC:
+               pr_info("P7IOC diag-data for MISC\n\n");
+               ioda_eeh_hub_diag_common(data);
+               break;
+       case OPAL_P7IOC_DIAG_TYPE_I2C:
+               pr_info("P7IOC diag-data for I2C\n\n");
+               ioda_eeh_hub_diag_common(data);
+               break;
+       default:
+               pr_warning("%s: Invalid type of HUB#%llx diag-data (%d)\n",
+                          __func__, phb->hub_id, data->type);
+       }
+}
+
+static void ioda_eeh_p7ioc_phb_diag(struct pci_controller *hose,
+                                   struct OpalIoPhbErrorCommon *common)
+{
+       struct OpalIoP7IOCPhbErrorData *data;
+       int i;
+
+       data = (struct OpalIoP7IOCPhbErrorData *)common;
+
+       pr_info("P7IOC PHB#%x Diag-data (Version: %d)\n\n",
+               hose->global_number, common->version);
+
+       pr_info("  brdgCtl:              %08x\n", data->brdgCtl);
+
+       pr_info("  portStatusReg:        %08x\n", data->portStatusReg);
+       pr_info("  rootCmplxStatus:      %08x\n", data->rootCmplxStatus);
+       pr_info("  busAgentStatus:       %08x\n", data->busAgentStatus);
+
+       pr_info("  deviceStatus:         %08x\n", data->deviceStatus);
+       pr_info("  slotStatus:           %08x\n", data->slotStatus);
+       pr_info("  linkStatus:           %08x\n", data->linkStatus);
+       pr_info("  devCmdStatus:         %08x\n", data->devCmdStatus);
+       pr_info("  devSecStatus:         %08x\n", data->devSecStatus);
+
+       pr_info("  rootErrorStatus:      %08x\n", data->rootErrorStatus);
+       pr_info("  uncorrErrorStatus:    %08x\n", data->uncorrErrorStatus);
+       pr_info("  corrErrorStatus:      %08x\n", data->corrErrorStatus);
+       pr_info("  tlpHdr1:              %08x\n", data->tlpHdr1);
+       pr_info("  tlpHdr2:              %08x\n", data->tlpHdr2);
+       pr_info("  tlpHdr3:              %08x\n", data->tlpHdr3);
+       pr_info("  tlpHdr4:              %08x\n", data->tlpHdr4);
+       pr_info("  sourceId:             %08x\n", data->sourceId);
+
+       pr_info("  errorClass:           %016llx\n", data->errorClass);
+       pr_info("  correlator:           %016llx\n", data->correlator);
+       pr_info("  p7iocPlssr:           %016llx\n", data->p7iocPlssr);
+       pr_info("  p7iocCsr:             %016llx\n", data->p7iocCsr);
+       pr_info("  lemFir:               %016llx\n", data->lemFir);
+       pr_info("  lemErrorMask:         %016llx\n", data->lemErrorMask);
+       pr_info("  lemWOF:               %016llx\n", data->lemWOF);
+       pr_info("  phbErrorStatus:       %016llx\n", data->phbErrorStatus);
+       pr_info("  phbFirstErrorStatus:  %016llx\n", data->phbFirstErrorStatus);
+       pr_info("  phbErrorLog0:         %016llx\n", data->phbErrorLog0);
+       pr_info("  phbErrorLog1:         %016llx\n", data->phbErrorLog1);
+       pr_info("  mmioErrorStatus:      %016llx\n", data->mmioErrorStatus);
+       pr_info("  mmioFirstErrorStatus: %016llx\n", data->mmioFirstErrorStatus);
+       pr_info("  mmioErrorLog0:        %016llx\n", data->mmioErrorLog0);
+       pr_info("  mmioErrorLog1:        %016llx\n", data->mmioErrorLog1);
+       pr_info("  dma0ErrorStatus:      %016llx\n", data->dma0ErrorStatus);
+       pr_info("  dma0FirstErrorStatus: %016llx\n", data->dma0FirstErrorStatus);
+       pr_info("  dma0ErrorLog0:        %016llx\n", data->dma0ErrorLog0);
+       pr_info("  dma0ErrorLog1:        %016llx\n", data->dma0ErrorLog1);
+       pr_info("  dma1ErrorStatus:      %016llx\n", data->dma1ErrorStatus);
+       pr_info("  dma1FirstErrorStatus: %016llx\n", data->dma1FirstErrorStatus);
+       pr_info("  dma1ErrorLog0:        %016llx\n", data->dma1ErrorLog0);
+       pr_info("  dma1ErrorLog1:        %016llx\n", data->dma1ErrorLog1);
+
+       for (i = 0; i < OPAL_P7IOC_NUM_PEST_REGS; i++) {
+               if ((data->pestA[i] >> 63) == 0 &&
+                   (data->pestB[i] >> 63) == 0)
+                       continue;
+
+               pr_info("  PE[%3d] PESTA:        %016llx\n", i, data->pestA[i]);
+               pr_info("          PESTB:        %016llx\n", data->pestB[i]);
+       }
+}
+
+static void ioda_eeh_phb_diag(struct pci_controller *hose)
+{
+       struct pnv_phb *phb = hose->private_data;
+       struct OpalIoPhbErrorCommon *common;
+       long rc;
+
+       common = (struct OpalIoPhbErrorCommon *)phb->diag.blob;
+       rc = opal_pci_get_phb_diag_data2(phb->opal_id, common, PAGE_SIZE);
+       if (rc != OPAL_SUCCESS) {
+               pr_warning("%s: Failed to get diag-data for PHB#%x (%ld)\n",
+                           __func__, hose->global_number, rc);
+               return;
+       }
+
+       switch (common->ioType) {
+       case OPAL_PHB_ERROR_DATA_TYPE_P7IOC:
+               ioda_eeh_p7ioc_phb_diag(hose, common);
+               break;
+       default:
+               pr_warning("%s: Unrecognized I/O chip %d\n",
+                          __func__, common->ioType);
+       }
+}
+
+static int ioda_eeh_get_phb_pe(struct pci_controller *hose,
+                              struct eeh_pe **pe)
+{
+       struct eeh_pe *phb_pe;
+
+       phb_pe = eeh_phb_pe_get(hose);
+       if (!phb_pe) {
+               pr_warning("%s Can't find PE for PHB#%d\n",
+                          __func__, hose->global_number);
+               return -EEXIST;
+       }
+
+       *pe = phb_pe;
+       return 0;
+}
+
+static int ioda_eeh_get_pe(struct pci_controller *hose,
+                          u16 pe_no, struct eeh_pe **pe)
+{
+       struct eeh_pe *phb_pe, *dev_pe;
+       struct eeh_dev dev;
+
+       /* Find the PHB PE */
+       if (ioda_eeh_get_phb_pe(hose, &phb_pe))
+               return -EEXIST;
+
+       /* Find the PE according to PE# */
+       memset(&dev, 0, sizeof(struct eeh_dev));
+       dev.phb = hose;
+       dev.pe_config_addr = pe_no;
+       dev_pe = eeh_pe_get(&dev);
+       if (!dev_pe) {
+               pr_warning("%s: Can't find PE for PHB#%x - PE#%x\n",
+                          __func__, hose->global_number, pe_no);
+               return -EEXIST;
+       }
+
+       *pe = dev_pe;
+       return 0;
+}
+
+/**
+ * ioda_eeh_next_error - Retrieve next error for EEH core to handle
+ * @pe: The affected PE
+ *
+ * The function is expected to be called by the EEH core when it
+ * receives a special EEH event (one without a bound PE). It asks
+ * the OPAL APIs for the next error to handle. Informational errors
+ * are handled internally by the platform, while dead IOC, dead PHB,
+ * fenced PHB and frozen PE conditions are left to the EEH core.
+ */
+static int ioda_eeh_next_error(struct eeh_pe **pe)
+{
+       struct pci_controller *hose, *tmp;
+       struct pnv_phb *phb;
+       u64 frozen_pe_no;
+       u16 err_type, severity;
+       long rc;
+       int ret = 1;
+
+       /*
+        * While running here, it's safe to purge the event queue.
+        * We should also keep the cached OPAL notifier event synchronized
+        * between the kernel and firmware.
+        */
+       eeh_remove_event(NULL);
+       opal_notifier_update_evt(OPAL_EVENT_PCI_ERROR, 0x0ul);
+
+       list_for_each_entry_safe(hose, tmp, &hose_list, list_node) {
+               /*
+                * If the subordinate PCI buses of the PHB have been
+                * removed, we needn't take care of it any more.
+                */
+               phb = hose->private_data;
+               if (phb->eeh_state & PNV_EEH_STATE_REMOVED)
+                       continue;
+
+               rc = opal_pci_next_error(phb->opal_id,
+                               &frozen_pe_no, &err_type, &severity);
+
+               /* If OPAL API returns error, we needn't proceed */
+               if (rc != OPAL_SUCCESS) {
+                       IODA_EEH_DBG("%s: Invalid return value on "
+                                    "PHB#%x (0x%lx) from opal_pci_next_error",
+                                    __func__, hose->global_number, rc);
+                       continue;
+               }
+
+               /* If the PHB doesn't have error, stop processing */
+               if (err_type == OPAL_EEH_NO_ERROR ||
+                   severity == OPAL_EEH_SEV_NO_ERROR) {
+                       IODA_EEH_DBG("%s: No error found on PHB#%x\n",
+                                    __func__, hose->global_number);
+                       continue;
+               }
+
+               /*
+                * Process the error. When multiple errors are pending on
+                * a specific PHB, we expect the one with the highest
+                * priority to be reported.
+                */
+               IODA_EEH_DBG("%s: Error (%d, %d, %d) on PHB#%x\n",
+                       err_type, severity, pe_no, hose->global_number);
+               switch (err_type) {
+               case OPAL_EEH_IOC_ERROR:
+                       if (severity == OPAL_EEH_SEV_IOC_DEAD) {
+                               list_for_each_entry_safe(hose, tmp,
+                                               &hose_list, list_node) {
+                                       phb = hose->private_data;
+                                       phb->eeh_state |= PNV_EEH_STATE_REMOVED;
+                               }
+
+                               pr_err("EEH: dead IOC detected\n");
+                               ret = 4;
+                               goto out;
+                       } else if (severity == OPAL_EEH_SEV_INF) {
+                               pr_info("EEH: IOC informative error "
+                                       "detected\n");
+                               ioda_eeh_hub_diag(hose);
+                       }
+
+                       break;
+               case OPAL_EEH_PHB_ERROR:
+                       if (severity == OPAL_EEH_SEV_PHB_DEAD) {
+                               if (ioda_eeh_get_phb_pe(hose, pe))
+                                       break;
+
+                               pr_err("EEH: dead PHB#%x detected\n",
+                                       hose->global_number);
+                               phb->eeh_state |= PNV_EEH_STATE_REMOVED;
+                               ret = 3;
+                               goto out;
+                       } else if (severity == OPAL_EEH_SEV_PHB_FENCED) {
+                               if (ioda_eeh_get_phb_pe(hose, pe))
+                                       break;
+
+                               pr_err("EEH: fenced PHB#%x detected\n",
+                                       hose->global_number);
+                               ret = 2;
+                               goto out;
+                       } else if (severity == OPAL_EEH_SEV_INF) {
+                               pr_info("EEH: PHB#%x informative error "
+                                       "detected\n",
+                                       hose->global_number);
+                               ioda_eeh_phb_diag(hose);
+                       }
+
+                       break;
+               case OPAL_EEH_PE_ERROR:
+                       if (ioda_eeh_get_pe(hose, frozen_pe_no, pe))
+                               break;
+
+                       pr_err("EEH: Frozen PE#%x on PHB#%x detected\n",
+                               (*pe)->addr, (*pe)->phb->global_number);
+                       ret = 1;
+                       goto out;
+               }
+       }
+
+       ret = 0;
+out:
+       return ret;
+}
+
+struct pnv_eeh_ops ioda_eeh_ops = {
+       .post_init              = ioda_eeh_post_init,
+       .set_option             = ioda_eeh_set_option,
+       .get_state              = ioda_eeh_get_state,
+       .reset                  = ioda_eeh_reset,
+       .get_log                = ioda_eeh_get_log,
+       .configure_bridge       = ioda_eeh_configure_bridge,
+       .next_error             = ioda_eeh_next_error
+};
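
The numeric codes returned by ioda_eeh_next_error() above encode what was
found: 0 for nothing pending, 1 for a frozen PE, 2 for a fenced PHB, 3 for a
dead PHB and 4 for a dead IOC. A minimal caller sketch, illustrative only and
not part of this patch, might dispatch on them like this:

	/* Illustrative sketch: consuming ioda_eeh_next_error()'s codes */
	struct eeh_pe *pe = NULL;

	switch (ioda_eeh_next_error(&pe)) {
	case 0:	/* nothing pending */
		break;
	case 1:	/* frozen PE: recover that single PE */
	case 2:	/* fenced PHB: reset the whole PHB */
	case 3:	/* dead PHB */
	case 4:	/* dead IOC: every PHB under the hub is gone */
		/* hand "pe" (when set) to the EEH core for recovery */
		break;
	}
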
diff --git a/arch/powerpc/platforms/powernv/eeh-powernv.c b/arch/powerpc/platforms/powernv/eeh-powernv.c
new file mode 100644 (file)
index 0000000..969cce7
--- /dev/null
@@ -0,0 +1,379 @@
+/*
+ * The file intends to implement the platform dependent EEH operations on
+ * the powernv platform. Actually, powernv was created in order to provide
+ * full hypervisor support.
+ *
+ * Copyright Benjamin Herrenschmidt & Gavin Shan, IBM Corporation 2013.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ */
+
+#include <linux/atomic.h>
+#include <linux/delay.h>
+#include <linux/export.h>
+#include <linux/init.h>
+#include <linux/list.h>
+#include <linux/msi.h>
+#include <linux/of.h>
+#include <linux/pci.h>
+#include <linux/proc_fs.h>
+#include <linux/rbtree.h>
+#include <linux/sched.h>
+#include <linux/seq_file.h>
+#include <linux/spinlock.h>
+
+#include <asm/eeh.h>
+#include <asm/eeh_event.h>
+#include <asm/firmware.h>
+#include <asm/io.h>
+#include <asm/iommu.h>
+#include <asm/machdep.h>
+#include <asm/msi_bitmap.h>
+#include <asm/opal.h>
+#include <asm/ppc-pci.h>
+
+#include "powernv.h"
+#include "pci.h"
+
+/**
+ * powernv_eeh_init - EEH platform dependent initialization
+ *
+ * EEH platform dependent initialization on powernv
+ */
+static int powernv_eeh_init(void)
+{
+       /* We require OPALv3 */
+       if (!firmware_has_feature(FW_FEATURE_OPALv3)) {
+               pr_warning("%s: OPALv3 is required !\n", __func__);
+               return -EINVAL;
+       }
+
+       /* Set EEH probe mode */
+       eeh_probe_mode_set(EEH_PROBE_MODE_DEV);
+
+       return 0;
+}
+
+/**
+ * powernv_eeh_post_init - EEH platform dependent post initialization
+ *
+ * EEH platform dependent post initialization on powernv. When
+ * the function is called, the EEH PEs and devices should have
+ * been built. If the I/O cache stuff has been built, EEH is
+ * ready to supply service.
+ */
+static int powernv_eeh_post_init(void)
+{
+       struct pci_controller *hose;
+       struct pnv_phb *phb;
+       int ret = 0;
+
+       list_for_each_entry(hose, &hose_list, list_node) {
+               phb = hose->private_data;
+
+               if (phb->eeh_ops && phb->eeh_ops->post_init) {
+                       ret = phb->eeh_ops->post_init(hose);
+                       if (ret)
+                               break;
+               }
+       }
+
+       return ret;
+}
+
+/**
+ * powernv_eeh_dev_probe - Do probe on PCI device
+ * @dev: PCI device
+ * @flag: unused
+ *
+ * When the EEH module is installed during system boot, all PCI devices
+ * are checked one by one to see whether they support EEH. This function
+ * is introduced for that purpose. By default, EEH is enabled on all PCI
+ * devices, so we only need to do the necessary initialization on the
+ * corresponding eeh device and create the PE accordingly.
+ *
+ * Note that it's unsafe to retrieve the EEH device through the
+ * corresponding PCI device: during a PCI device hotplug, possibly
+ * triggered by the EEH core, the binding between the EEH device and
+ * the PCI device isn't built yet.
+ */
+static int powernv_eeh_dev_probe(struct pci_dev *dev, void *flag)
+{
+       struct pci_controller *hose = pci_bus_to_host(dev->bus);
+       struct pnv_phb *phb = hose->private_data;
+       struct device_node *dn = pci_device_to_OF_node(dev);
+       struct eeh_dev *edev = of_node_to_eeh_dev(dn);
+
+       /*
+        * The root bridge, which doesn't have any subordinate PCI
+        * devices, has no OF node either, so there is no point in
+        * continuing the probe for it.
+        */
+       if (!dn || !edev)
+               return 0;
+
+       /* Skip for PCI-ISA bridge */
+       if ((dev->class >> 8) == PCI_CLASS_BRIDGE_ISA)
+               return 0;
+
+       /* Initialize eeh device */
+       edev->class_code        = dev->class;
+       edev->mode              = 0;
+       edev->config_addr       = ((dev->bus->number << 8) | dev->devfn);
+       edev->pe_config_addr    = phb->bdfn_to_pe(phb, dev->bus, dev->devfn & 0xff);
+
+       /* Create PE */
+       eeh_add_to_parent_pe(edev);
+
+       /*
+        * Enable EEH explicitly so that we will do EEH check
+        * while accessing I/O stuff
+        *
+        * FIXME: Enable that for PHB3 later
+        */
+       if (phb->type == PNV_PHB_IODA1)
+               eeh_subsystem_enabled = 1;
+
+       /* Save memory bars */
+       eeh_save_bars(edev);
+
+       return 0;
+}
+
+/**
+ * powernv_eeh_set_option - Initialize EEH or MMIO/DMA reenable
+ * @pe: EEH PE
+ * @option: operation to be issued
+ *
+ * The function is used to control the EEH functionality globally.
+ * Currently, the following options are supported according to PAPR:
+ * Enable EEH, Disable EEH, Enable MMIO and Enable DMA
+ */
+static int powernv_eeh_set_option(struct eeh_pe *pe, int option)
+{
+       struct pci_controller *hose = pe->phb;
+       struct pnv_phb *phb = hose->private_data;
+       int ret = -EEXIST;
+
+       /*
+        * All we need to do is pass the request down to the
+        * hardware implementation to handle.
+        */
+       if (phb->eeh_ops && phb->eeh_ops->set_option)
+               ret = phb->eeh_ops->set_option(pe, option);
+
+       return ret;
+}
+
+/**
+ * powernv_eeh_get_pe_addr - Retrieve PE address
+ * @pe: EEH PE
+ *
+ * Retrieve the PE address according to the given traditional
+ * PCI BDF (Bus/Device/Function) address.
+ */
+static int powernv_eeh_get_pe_addr(struct eeh_pe *pe)
+{
+       return pe->addr;
+}
+
+/**
+ * powernv_eeh_get_state - Retrieve PE state
+ * @pe: EEH PE
+ * @delay: delay while PE state is temporarily unavailable
+ *
+ * Retrieve the state of the specified PE. On an IODA-compatible
+ * platform, the state should be retrieved from the IODA table, so
+ * we prefer passing the request down to the hardware implementation
+ * to handle it.
+ */
+static int powernv_eeh_get_state(struct eeh_pe *pe, int *delay)
+{
+       struct pci_controller *hose = pe->phb;
+       struct pnv_phb *phb = hose->private_data;
+       int ret = EEH_STATE_NOT_SUPPORT;
+
+       if (phb->eeh_ops && phb->eeh_ops->get_state) {
+               ret = phb->eeh_ops->get_state(pe);
+
+               /*
+                * If the PE state is temporarily unavailable,
+                * inform the EEH core to delay for the default
+                * period (1 second).
+                */
+               if (delay) {
+                       *delay = 0;
+                       if (ret & EEH_STATE_UNAVAILABLE)
+                               *delay = 1000;
+               }
+       }
+
+       return ret;
+}
+
+/**
+ * powernv_eeh_reset - Reset the specified PE
+ * @pe: EEH PE
+ * @option: reset option
+ *
+ * Reset the specified PE
+ */
+static int powernv_eeh_reset(struct eeh_pe *pe, int option)
+{
+       struct pci_controller *hose = pe->phb;
+       struct pnv_phb *phb = hose->private_data;
+       int ret = -EEXIST;
+
+       if (phb->eeh_ops && phb->eeh_ops->reset)
+               ret = phb->eeh_ops->reset(pe, option);
+
+       return ret;
+}
+
+/**
+ * powernv_eeh_wait_state - Wait for PE state
+ * @pe: EEH PE
+ * @max_wait: maximal wait period in milliseconds
+ *
+ * Wait for the state of associated PE. It might take some time
+ * to retrieve the PE's state.
+ */
+static int powernv_eeh_wait_state(struct eeh_pe *pe, int max_wait)
+{
+       int ret;
+       int mwait;
+
+       while (1) {
+               ret = powernv_eeh_get_state(pe, &mwait);
+
+               /*
+                * If the PE's state is temporarily unavailable,
+                * we have to wait for the specified time. Otherwise,
+                * the PE's state will be returned immediately.
+                */
+               if (ret != EEH_STATE_UNAVAILABLE)
+                       return ret;
+
+               max_wait -= mwait;
+               if (max_wait <= 0) {
+                       pr_warning("%s: Timeout getting PE#%x's state (%d)\n",
+                                  __func__, pe->addr, max_wait);
+                       return EEH_STATE_NOT_SUPPORT;
+               }
+
+               msleep(mwait);
+       }
+
+       return EEH_STATE_NOT_SUPPORT;
+}
+
+/**
+ * powernv_eeh_get_log - Retrieve error log
+ * @pe: EEH PE
+ * @severity: temporary or permanent error log
+ * @drv_log: driver log to be combined with retrieved error log
+ * @len: length of driver log
+ *
+ * Retrieve the temporary or permanent error from the PE.
+ */
+static int powernv_eeh_get_log(struct eeh_pe *pe, int severity,
+                       char *drv_log, unsigned long len)
+{
+       struct pci_controller *hose = pe->phb;
+       struct pnv_phb *phb = hose->private_data;
+       int ret = -EEXIST;
+
+       if (phb->eeh_ops && phb->eeh_ops->get_log)
+               ret = phb->eeh_ops->get_log(pe, severity, drv_log, len);
+
+       return ret;
+}
+
+/**
+ * powernv_eeh_configure_bridge - Configure PCI bridges in the indicated PE
+ * @pe: EEH PE
+ *
+ * The function will be called to reconfigure the bridges included
+ * in the specified PE so that the malfunctioning PE can be recovered
+ * again.
+ */
+static int powernv_eeh_configure_bridge(struct eeh_pe *pe)
+{
+       struct pci_controller *hose = pe->phb;
+       struct pnv_phb *phb = hose->private_data;
+       int ret = 0;
+
+       if (phb->eeh_ops && phb->eeh_ops->configure_bridge)
+               ret = phb->eeh_ops->configure_bridge(pe);
+
+       return ret;
+}
+
+/**
+ * powernv_eeh_next_error - Retrieve next EEH error to handle
+ * @pe: Affected PE
+ *
+ * Use the OPAL API to retrieve the next EEH error for the EEH core
+ * to handle.
+ */
+static int powernv_eeh_next_error(struct eeh_pe **pe)
+{
+       struct pci_controller *hose;
+       struct pnv_phb *phb = NULL;
+
+       list_for_each_entry(hose, &hose_list, list_node) {
+               phb = hose->private_data;
+               break;
+       }
+
+       if (phb && phb->eeh_ops->next_error)
+               return phb->eeh_ops->next_error(pe);
+
+       return -EEXIST;
+}
+
+static struct eeh_ops powernv_eeh_ops = {
+       .name                   = "powernv",
+       .init                   = powernv_eeh_init,
+       .post_init              = powernv_eeh_post_init,
+       .of_probe               = NULL,
+       .dev_probe              = powernv_eeh_dev_probe,
+       .set_option             = powernv_eeh_set_option,
+       .get_pe_addr            = powernv_eeh_get_pe_addr,
+       .get_state              = powernv_eeh_get_state,
+       .reset                  = powernv_eeh_reset,
+       .wait_state             = powernv_eeh_wait_state,
+       .get_log                = powernv_eeh_get_log,
+       .configure_bridge       = powernv_eeh_configure_bridge,
+       .read_config            = pnv_pci_cfg_read,
+       .write_config           = pnv_pci_cfg_write,
+       .next_error             = powernv_eeh_next_error
+};
+
+/**
+ * eeh_powernv_init - Register platform dependent EEH operations
+ *
+ * EEH initialization on powernv platform. This function should be
+ * called before any EEH related functions.
+ */
+static int __init eeh_powernv_init(void)
+{
+       int ret = -EINVAL;
+
+       if (!machine_is(powernv))
+               return ret;
+
+       ret = eeh_ops_register(&powernv_eeh_ops);
+       if (!ret)
+               pr_info("EEH: PowerNV platform initialized\n");
+       else
+               pr_info("EEH: Failed to initialize PowerNV platform (%d)\n", ret);
+
+       return ret;
+}
+
+early_initcall(eeh_powernv_init);
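
For reference, the config_addr packing done in powernv_eeh_dev_probe() above
is the usual bus/devfn encoding. A worked example with invented values:

	/* Illustrative: a device at bus 0x01, device 5, function 0 */
	u8  bus   = 0x01;
	u8  devfn = PCI_DEVFN(5, 0);		/* (5 << 3) | 0 = 0x28 */
	u32 config_addr = (bus << 8) | devfn;	/* = 0x0128 */
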
index 6fabe92eafb6a122d0286c9a3dd70e1e4eeb793e..e88863ffb13543d15d6812942a5d9560c69f5c0b 100644 (file)
@@ -107,4 +107,7 @@ OPAL_CALL(opal_pci_mask_pe_error,           OPAL_PCI_MASK_PE_ERROR);
 OPAL_CALL(opal_set_slot_led_status,            OPAL_SET_SLOT_LED_STATUS);
 OPAL_CALL(opal_get_epow_status,                        OPAL_GET_EPOW_STATUS);
 OPAL_CALL(opal_set_system_attention_led,       OPAL_SET_SYSTEM_ATTENTION_LED);
+OPAL_CALL(opal_pci_next_error,                 OPAL_PCI_NEXT_ERROR);
+OPAL_CALL(opal_pci_poll,                       OPAL_PCI_POLL);
 OPAL_CALL(opal_pci_msi_eoi,                    OPAL_PCI_MSI_EOI);
+OPAL_CALL(opal_pci_get_phb_diag_data2,         OPAL_PCI_GET_PHB_DIAG_DATA2);
index 628c564ceadbb32b1c96287f3ba5cb43390cc56a..106301fd2fa590a48b8cce46d0ba5b20013a9412 100644 (file)
@@ -15,6 +15,7 @@
 #include <linux/of.h>
 #include <linux/of_platform.h>
 #include <linux/interrupt.h>
+#include <linux/notifier.h>
 #include <linux/slab.h>
 #include <asm/opal.h>
 #include <asm/firmware.h>
@@ -31,6 +32,10 @@ static DEFINE_SPINLOCK(opal_write_lock);
 extern u64 opal_mc_secondary_handler[];
 static unsigned int *opal_irqs;
 static unsigned int opal_irq_count;
+static ATOMIC_NOTIFIER_HEAD(opal_notifier_head);
+static DEFINE_SPINLOCK(opal_notifier_lock);
+static uint64_t last_notified_mask = 0x0ul;
+static atomic_t opal_notifier_hold = ATOMIC_INIT(0);
 
 int __init early_init_dt_scan_opal(unsigned long node,
                                   const char *uname, int depth, void *data)
@@ -95,6 +100,68 @@ static int __init opal_register_exception_handlers(void)
 
 early_initcall(opal_register_exception_handlers);
 
+int opal_notifier_register(struct notifier_block *nb)
+{
+       if (!nb) {
+               pr_warning("%s: Invalid argument (%p)\n",
+                          __func__, nb);
+               return -EINVAL;
+       }
+
+       atomic_notifier_chain_register(&opal_notifier_head, nb);
+       return 0;
+}
+
+static void opal_do_notifier(uint64_t events)
+{
+       unsigned long flags;
+       uint64_t changed_mask;
+
+       if (atomic_read(&opal_notifier_hold))
+               return;
+
+       spin_lock_irqsave(&opal_notifier_lock, flags);
+       changed_mask = last_notified_mask ^ events;
+       last_notified_mask = events;
+       spin_unlock_irqrestore(&opal_notifier_lock, flags);
+
+       /*
+        * We pass both the event bits and the changed bits so the
+        * callback has enough information.
+        */
+       atomic_notifier_call_chain(&opal_notifier_head,
+                                  events, (void *)changed_mask);
+}
+
+void opal_notifier_update_evt(uint64_t evt_mask,
+                             uint64_t evt_val)
+{
+       unsigned long flags;
+
+       spin_lock_irqsave(&opal_notifier_lock, flags);
+       last_notified_mask &= ~evt_mask;
+       last_notified_mask |= evt_val;
+       spin_unlock_irqrestore(&opal_notifier_lock, flags);
+}
+
+void opal_notifier_enable(void)
+{
+       int64_t rc;
+       uint64_t evt = 0;
+
+       atomic_set(&opal_notifier_hold, 0);
+
+       /* Process pending events */
+       rc = opal_poll_events(&evt);
+       if (rc == OPAL_SUCCESS && evt)
+               opal_do_notifier(evt);
+}
+
+void opal_notifier_disable(void)
+{
+       atomic_set(&opal_notifier_hold, 1);
+}
+
 int opal_get_chars(uint32_t vtermno, char *buf, int count)
 {
        s64 len, rc;
@@ -297,7 +364,7 @@ static irqreturn_t opal_interrupt(int irq, void *data)
 
        opal_handle_interrupt(virq_to_hw(irq), &events);
 
-       /* XXX TODO: Do something with the events */
+       opal_do_notifier(events);
 
        return IRQ_HANDLED;
 }
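
To show how the notifier interface added above is meant to be consumed, here
is a hedged client sketch; the handler name and its policy are invented for
illustration. The callback receives the current event word and, as its data
pointer, the bits that changed since the last notification:

	static int example_opal_event(struct notifier_block *nb,
				      unsigned long events, void *change)
	{
		uint64_t changed = (uint64_t)(unsigned long)change;

		/* React only when the PCI error bit has just been raised */
		if ((changed & OPAL_EVENT_PCI_ERROR) &&
		    (events & OPAL_EVENT_PCI_ERROR))
			pr_info("OPAL reported a PCI error event\n");

		return NOTIFY_OK;
	}

	static struct notifier_block example_opal_nb = {
		.notifier_call = example_opal_event,
	};

	/* Somewhere during init: opal_notifier_register(&example_opal_nb); */
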
index 9c9d15e4cdf2700f803471667f83720430844c68..c393bf59f1138d2b629ff86dde2f5348ed6af170 100644 (file)
@@ -13,6 +13,7 @@
 
 #include <linux/kernel.h>
 #include <linux/pci.h>
+#include <linux/debugfs.h>
 #include <linux/delay.h>
 #include <linux/string.h>
 #include <linux/init.h>
@@ -32,6 +33,7 @@
 #include <asm/iommu.h>
 #include <asm/tce.h>
 #include <asm/xics.h>
+#include <asm/debug.h>
 
 #include "powernv.h"
 #include "pci.h"
@@ -595,6 +597,7 @@ static void pnv_pci_ioda_setup_dma_pe(struct pnv_phb *phb,
                               TCE_PCI_SWINV_PAIR;
        }
        iommu_init_table(tbl, phb->hose->node);
+       iommu_register_group(tbl, pci_domain_nr(pe->pbus), pe->pe_number);
 
        return;
  fail:
@@ -968,11 +971,38 @@ static void pnv_pci_ioda_setup_DMA(void)
        }
 }
 
+static void pnv_pci_ioda_create_dbgfs(void)
+{
+#ifdef CONFIG_DEBUG_FS
+       struct pci_controller *hose, *tmp;
+       struct pnv_phb *phb;
+       char name[16];
+
+       list_for_each_entry_safe(hose, tmp, &hose_list, list_node) {
+               phb = hose->private_data;
+
+               sprintf(name, "PCI%04x", hose->global_number);
+               phb->dbgfs = debugfs_create_dir(name, powerpc_debugfs_root);
+               if (!phb->dbgfs)
+                       pr_warning("%s: Error on creating debugfs on PHB#%x\n",
+                               __func__, hose->global_number);
+       }
+#endif /* CONFIG_DEBUG_FS */
+}
+
 static void pnv_pci_ioda_fixup(void)
 {
        pnv_pci_ioda_setup_PEs();
        pnv_pci_ioda_setup_seg();
        pnv_pci_ioda_setup_DMA();
+
+       pnv_pci_ioda_create_dbgfs();
+
+#ifdef CONFIG_EEH
+       eeh_probe_mode_set(EEH_PROBE_MODE_DEV);
+       eeh_addr_cache_build();
+       eeh_init();
+#endif
 }
 
 /*
@@ -1049,7 +1079,8 @@ static void pnv_pci_ioda_shutdown(struct pnv_phb *phb)
                       OPAL_ASSERT_RESET);
 }
 
-void __init pnv_pci_init_ioda_phb(struct device_node *np, int ioda_type)
+void __init pnv_pci_init_ioda_phb(struct device_node *np,
+                                 u64 hub_id, int ioda_type)
 {
        struct pci_controller *hose;
        static int primary = 1;
@@ -1087,6 +1118,7 @@ void __init pnv_pci_init_ioda_phb(struct device_node *np, int ioda_type)
        hose->first_busno = 0;
        hose->last_busno = 0xff;
        hose->private_data = phb;
+       phb->hub_id = hub_id;
        phb->opal_id = phb_id;
        phb->type = ioda_type;
 
@@ -1172,6 +1204,9 @@ void __init pnv_pci_init_ioda_phb(struct device_node *np, int ioda_type)
                phb->ioda.io_size, phb->ioda.io_segsize);
 
        phb->hose->ops = &pnv_pci_ops;
+#ifdef CONFIG_EEH
+       phb->eeh_ops = &ioda_eeh_ops;
+#endif
 
        /* Setup RID -> PE mapping function */
        phb->bdfn_to_pe = pnv_ioda_bdfn_to_pe;
@@ -1212,7 +1247,7 @@ void __init pnv_pci_init_ioda_phb(struct device_node *np, int ioda_type)
 
 void pnv_pci_init_ioda2_phb(struct device_node *np)
 {
-       pnv_pci_init_ioda_phb(np, PNV_PHB_IODA2);
+       pnv_pci_init_ioda_phb(np, 0, PNV_PHB_IODA2);
 }
 
 void __init pnv_pci_init_ioda_hub(struct device_node *np)
@@ -1235,6 +1270,6 @@ void __init pnv_pci_init_ioda_hub(struct device_node *np)
        for_each_child_of_node(np, phbn) {
                /* Look for IODA1 PHBs */
                if (of_device_is_compatible(phbn, "ibm,ioda-phb"))
-                       pnv_pci_init_ioda_phb(phbn, PNV_PHB_IODA1);
+                       pnv_pci_init_ioda_phb(phbn, hub_id, PNV_PHB_IODA1);
        }
 }
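
The per-PHB debugfs directory created by pnv_pci_ioda_create_dbgfs() above
gives later code a place to hang PHB-specific files. A hypothetical example,
with the file name and counter invented here:

	/*
	 * Hypothetical (CONFIG_DEBUG_FS only): expose a counter under
	 * the new per-PHB directory.
	 */
	static u64 phb_freeze_count;

	if (phb->dbgfs)
		debugfs_create_u64("freeze-count", 0444,
				   phb->dbgfs, &phb_freeze_count);
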
index 92b37a0186c93c277443a92e082b3736ceeee1bc..b68db6325c1b2a222f6045d5d7d5813c1c329a1a 100644 (file)
@@ -86,13 +86,16 @@ static void pnv_pci_init_p5ioc2_msis(struct pnv_phb *phb) { }
 static void pnv_pci_p5ioc2_dma_dev_setup(struct pnv_phb *phb,
                                         struct pci_dev *pdev)
 {
-       if (phb->p5ioc2.iommu_table.it_map == NULL)
+       if (phb->p5ioc2.iommu_table.it_map == NULL) {
                iommu_init_table(&phb->p5ioc2.iommu_table, phb->hose->node);
+               iommu_register_group(&phb->p5ioc2.iommu_table,
+                               pci_domain_nr(phb->hose->bus), phb->opal_id);
+       }
 
        set_iommu_table_base(&pdev->dev, &phb->p5ioc2.iommu_table);
 }
 
-static void __init pnv_pci_init_p5ioc2_phb(struct device_node *np,
+static void __init pnv_pci_init_p5ioc2_phb(struct device_node *np, u64 hub_id,
                                           void *tce_mem, u64 tce_size)
 {
        struct pnv_phb *phb;
@@ -133,6 +136,7 @@ static void __init pnv_pci_init_p5ioc2_phb(struct device_node *np,
        phb->hose->first_busno = 0;
        phb->hose->last_busno = 0xff;
        phb->hose->private_data = phb;
+       phb->hub_id = hub_id;
        phb->opal_id = phb_id;
        phb->type = PNV_PHB_P5IOC2;
        phb->model = PNV_PHB_MODEL_P5IOC2;
@@ -226,7 +230,8 @@ void __init pnv_pci_init_p5ioc2_hub(struct device_node *np)
        for_each_child_of_node(np, phbn) {
                if (of_device_is_compatible(phbn, "ibm,p5ioc2-pcix") ||
                    of_device_is_compatible(phbn, "ibm,p5ioc2-pciex")) {
-                       pnv_pci_init_p5ioc2_phb(phbn, tce_mem, tce_per_phb);
+                       pnv_pci_init_p5ioc2_phb(phbn, hub_id,
+                                       tce_mem, tce_per_phb);
                        tce_mem += tce_per_phb;
                }
        }
index 277343cc6a3d7f87966408f088a33b61f502ecda..a28d3b5e6393fa8b9d9d11bc81210ab9db13f38b 100644 (file)
@@ -20,6 +20,7 @@
 #include <linux/irq.h>
 #include <linux/io.h>
 #include <linux/msi.h>
+#include <linux/iommu.h>
 
 #include <asm/sections.h>
 #include <asm/io.h>
@@ -32,6 +33,8 @@
 #include <asm/iommu.h>
 #include <asm/tce.h>
 #include <asm/firmware.h>
+#include <asm/eeh_event.h>
+#include <asm/eeh.h>
 
 #include "powernv.h"
 #include "pci.h"
@@ -202,7 +205,8 @@ static void pnv_pci_handle_eeh_config(struct pnv_phb *phb, u32 pe_no)
 
        spin_lock_irqsave(&phb->lock, flags);
 
-       rc = opal_pci_get_phb_diag_data(phb->opal_id, phb->diag.blob, PNV_PCI_DIAG_BUF_SIZE);
+       rc = opal_pci_get_phb_diag_data2(phb->opal_id, phb->diag.blob,
+                                        PNV_PCI_DIAG_BUF_SIZE);
        has_diag = (rc == OPAL_SUCCESS);
 
        rc = opal_pci_eeh_freeze_clear(phb->opal_id, pe_no,
@@ -227,43 +231,50 @@ static void pnv_pci_handle_eeh_config(struct pnv_phb *phb, u32 pe_no)
        spin_unlock_irqrestore(&phb->lock, flags);
 }
 
-static void pnv_pci_config_check_eeh(struct pnv_phb *phb, struct pci_bus *bus,
-                                    u32 bdfn)
+static void pnv_pci_config_check_eeh(struct pnv_phb *phb,
+                                    struct device_node *dn)
 {
        s64     rc;
        u8      fstate;
        u16     pcierr;
        u32     pe_no;
 
-       /* Get PE# if we support IODA */
-       pe_no = phb->bdfn_to_pe ? phb->bdfn_to_pe(phb, bus, bdfn & 0xff) : 0;
+       /*
+        * Get the PE#. During the PCI probe stage, it might not
+        * have been set up yet, so all ER errors are mapped to
+        * PE#0.
+        */
+       pe_no = PCI_DN(dn)->pe_number;
+       if (pe_no == IODA_INVALID_PE)
+               pe_no = 0;
 
        /* Read freeze status */
        rc = opal_pci_eeh_freeze_status(phb->opal_id, pe_no, &fstate, &pcierr,
                                        NULL);
        if (rc) {
-               pr_warning("PCI %d: Failed to read EEH status for PE#%d,"
-                          " err %lld\n", phb->hose->global_number, pe_no, rc);
+               pr_warning("%s: Can't read EEH status (PE#%d) for "
+                          "%s, err %lld\n",
+                          __func__, pe_no, dn->full_name, rc);
                return;
        }
-       cfg_dbg(" -> EEH check, bdfn=%04x PE%d fstate=%x\n",
-               bdfn, pe_no, fstate);
+       cfg_dbg(" -> EEH check, bdfn=%04x PE#%d fstate=%x\n",
+               (PCI_DN(dn)->busno << 8) | (PCI_DN(dn)->devfn),
+               pe_no, fstate);
        if (fstate != 0)
                pnv_pci_handle_eeh_config(phb, pe_no);
 }
 
-static int pnv_pci_read_config(struct pci_bus *bus,
-                              unsigned int devfn,
-                              int where, int size, u32 *val)
+int pnv_pci_cfg_read(struct device_node *dn,
+                    int where, int size, u32 *val)
 {
-       struct pci_controller *hose = pci_bus_to_host(bus);
-       struct pnv_phb *phb = hose->private_data;
-       u32 bdfn = (((uint64_t)bus->number) << 8) | devfn;
+       struct pci_dn *pdn = PCI_DN(dn);
+       struct pnv_phb *phb = pdn->phb->private_data;
+       u32 bdfn = (pdn->busno << 8) | pdn->devfn;
+#ifdef CONFIG_EEH
+       struct eeh_pe *phb_pe = NULL;
+#endif
        s64 rc;
 
-       if (hose == NULL)
-               return PCIBIOS_DEVICE_NOT_FOUND;
-
        switch (size) {
        case 1: {
                u8 v8;
@@ -287,28 +298,43 @@ static int pnv_pci_read_config(struct pci_bus *bus,
        default:
                return PCIBIOS_FUNC_NOT_SUPPORTED;
        }
-       cfg_dbg("pnv_pci_read_config bus: %x devfn: %x +%x/%x -> %08x\n",
-               bus->number, devfn, where, size, *val);
-
-       /* Check if the PHB got frozen due to an error (no response) */
-       pnv_pci_config_check_eeh(phb, bus, bdfn);
+       cfg_dbg("%s: bus: %x devfn: %x +%x/%x -> %08x\n",
+               __func__, pdn->busno, pdn->devfn, where, size, *val);
+
+       /*
+        * Check whether the specified PE has been put into frozen
+        * state. We needn't bother, though, while the PHB itself
+        * has been put into frozen state because of PHB-fatal
+        * errors.
+        */
+#ifdef CONFIG_EEH
+       phb_pe = eeh_phb_pe_get(pdn->phb);
+       if (phb_pe && (phb_pe->state & EEH_PE_ISOLATED))
+               return PCIBIOS_SUCCESSFUL;
+
+       if (phb->eeh_state & PNV_EEH_STATE_ENABLED) {
+               if (*val == EEH_IO_ERROR_VALUE(size) &&
+                   eeh_dev_check_failure(of_node_to_eeh_dev(dn)))
+                       return PCIBIOS_DEVICE_NOT_FOUND;
+       } else {
+               pnv_pci_config_check_eeh(phb, dn);
+       }
+#else
+       pnv_pci_config_check_eeh(phb, dn);
+#endif
 
        return PCIBIOS_SUCCESSFUL;
 }
 
-static int pnv_pci_write_config(struct pci_bus *bus,
-                               unsigned int devfn,
-                               int where, int size, u32 val)
+int pnv_pci_cfg_write(struct device_node *dn,
+                     int where, int size, u32 val)
 {
-       struct pci_controller *hose = pci_bus_to_host(bus);
-       struct pnv_phb *phb = hose->private_data;
-       u32 bdfn = (((uint64_t)bus->number) << 8) | devfn;
-
-       if (hose == NULL)
-               return PCIBIOS_DEVICE_NOT_FOUND;
+       struct pci_dn *pdn = PCI_DN(dn);
+       struct pnv_phb *phb = pdn->phb->private_data;
+       u32 bdfn = (pdn->busno << 8) | pdn->devfn;
 
-       cfg_dbg("pnv_pci_write_config bus: %x devfn: %x +%x/%x -> %08x\n",
-               bus->number, devfn, where, size, val);
+       cfg_dbg("%s: bus: %x devfn: %x +%x/%x -> %08x\n",
+               pdn->busno, pdn->devfn, where, size, val);
        switch (size) {
        case 1:
                opal_pci_config_write_byte(phb->opal_id, bdfn, where, val);
@@ -322,14 +348,54 @@ static int pnv_pci_write_config(struct pci_bus *bus,
        default:
                return PCIBIOS_FUNC_NOT_SUPPORTED;
        }
+
        /* Check if the PHB got frozen due to an error (no response) */
-       pnv_pci_config_check_eeh(phb, bus, bdfn);
+#ifdef CONFIG_EEH
+       if (!(phb->eeh_state & PNV_EEH_STATE_ENABLED))
+               pnv_pci_config_check_eeh(phb, dn);
+#else
+       pnv_pci_config_check_eeh(phb, dn);
+#endif
 
        return PCIBIOS_SUCCESSFUL;
 }
 
+static int pnv_pci_read_config(struct pci_bus *bus,
+                              unsigned int devfn,
+                              int where, int size, u32 *val)
+{
+       struct device_node *dn, *busdn = pci_bus_to_OF_node(bus);
+       struct pci_dn *pdn;
+
+       for (dn = busdn->child; dn; dn = dn->sibling) {
+               pdn = PCI_DN(dn);
+               if (pdn && pdn->devfn == devfn)
+                       return pnv_pci_cfg_read(dn, where, size, val);
+       }
+
+       *val = 0xFFFFFFFF;
+       return PCIBIOS_DEVICE_NOT_FOUND;
+
+}
+
+static int pnv_pci_write_config(struct pci_bus *bus,
+                               unsigned int devfn,
+                               int where, int size, u32 val)
+{
+       struct device_node *dn, *busdn = pci_bus_to_OF_node(bus);
+       struct pci_dn *pdn;
+
+       for (dn = busdn->child; dn; dn = dn->sibling) {
+               pdn = PCI_DN(dn);
+               if (pdn && pdn->devfn == devfn)
+                       return pnv_pci_cfg_write(dn, where, size, val);
+       }
+
+       return PCIBIOS_DEVICE_NOT_FOUND;
+}
+
 struct pci_ops pnv_pci_ops = {
-       .read = pnv_pci_read_config,
+       .read  = pnv_pci_read_config,
        .write = pnv_pci_write_config,
 };
 
@@ -412,6 +478,7 @@ static struct iommu_table *pnv_pci_setup_bml_iommu(struct pci_controller *hose)
        pnv_pci_setup_iommu_table(tbl, __va(be64_to_cpup(basep)),
                                  be32_to_cpup(sizep), 0);
        iommu_init_table(tbl, hose->node);
+       iommu_register_group(tbl, pci_domain_nr(hose->bus), 0);
 
        /* Deal with SW invalidated TCEs when needed (BML way) */
        swinvp = of_get_property(hose->dn, "linux,tce-sw-invalidate-info",
index 25d76c4df50b27711c6bd1bb40481216a7009eb6..d633c64e05a1ef9204924b48a132bb505cd5e0a8 100644 (file)
@@ -66,15 +66,43 @@ struct pnv_ioda_pe {
        struct list_head        list;
 };
 
+/* IOC dependent EEH operations */
+#ifdef CONFIG_EEH
+struct pnv_eeh_ops {
+       int (*post_init)(struct pci_controller *hose);
+       int (*set_option)(struct eeh_pe *pe, int option);
+       int (*get_state)(struct eeh_pe *pe);
+       int (*reset)(struct eeh_pe *pe, int option);
+       int (*get_log)(struct eeh_pe *pe, int severity,
+                      char *drv_log, unsigned long len);
+       int (*configure_bridge)(struct eeh_pe *pe);
+       int (*next_error)(struct eeh_pe **pe);
+};
+
+#define PNV_EEH_STATE_ENABLED  (1 << 0)        /* EEH enabled  */
+#define PNV_EEH_STATE_REMOVED  (1 << 1)        /* PHB removed  */
+
+#endif /* CONFIG_EEH */
+
 struct pnv_phb {
        struct pci_controller   *hose;
        enum pnv_phb_type       type;
        enum pnv_phb_model      model;
+       u64                     hub_id;
        u64                     opal_id;
        void __iomem            *regs;
        int                     initialized;
        spinlock_t              lock;
 
+#ifdef CONFIG_EEH
+       struct pnv_eeh_ops      *eeh_ops;
+       int                     eeh_state;
+#endif
+
+#ifdef CONFIG_DEBUG_FS
+       struct dentry           *dbgfs;
+#endif
+
 #ifdef CONFIG_PCI_MSI
        unsigned int            msi_base;
        unsigned int            msi32_support;
@@ -150,7 +178,14 @@ struct pnv_phb {
 };
 
 extern struct pci_ops pnv_pci_ops;
+#ifdef CONFIG_EEH
+extern struct pnv_eeh_ops ioda_eeh_ops;
+#endif
 
+int pnv_pci_cfg_read(struct device_node *dn,
+                    int where, int size, u32 *val);
+int pnv_pci_cfg_write(struct device_node *dn,
+                     int where, int size, u32 val);
 extern void pnv_pci_setup_iommu_table(struct iommu_table *tbl,
                                      void *tce_mem, u64 tce_size,
                                      u64 dma_offset);
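
The pnv_eeh_ops table declared above is what lets each I/O chip supply its
own EEH backend. As a sketch with invented names, not part of this patch,
another chip would provide its own table and have the PHB bring-up code point
phb->eeh_ops at it, just as pnv_pci_init_ioda_phb() does for ioda_eeh_ops:

	/* Hypothetical backend for another I/O chip, illustration only */
	static int foo_eeh_get_state(struct eeh_pe *pe)
	{
		return EEH_STATE_NOT_SUPPORT;	/* placeholder */
	}

	struct pnv_eeh_ops foo_eeh_ops = {
		.get_state	= foo_eeh_get_state,
		/* hooks left NULL are skipped by the powernv dispatchers */
	};

	/* In the PHB bring-up path:  phb->eeh_ops = &foo_eeh_ops; */
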
index d4459bfc92f76a7bd45c6714395023786f200cc9..84438af96c052b7e465d6b6a03d333ba9e7b37ba 100644 (file)
@@ -93,6 +93,8 @@ static void  __noreturn pnv_restart(char *cmd)
 {
        long rc = OPAL_BUSY;
 
+       opal_notifier_disable();
+
        while (rc == OPAL_BUSY || rc == OPAL_BUSY_EVENT) {
                rc = opal_cec_reboot();
                if (rc == OPAL_BUSY_EVENT)
@@ -108,6 +110,8 @@ static void __noreturn pnv_power_off(void)
 {
        long rc = OPAL_BUSY;
 
+       opal_notifier_disable();
+
        while (rc == OPAL_BUSY || rc == OPAL_BUSY_EVENT) {
                rc = opal_cec_power_down(0);
                if (rc == OPAL_BUSY_EVENT)
index 88c9459c3e07121a64765e4304bcbb4dd7a1b2a4..89e3857af4e0913679448c5cc78c6549d17b29e9 100644 (file)
@@ -40,7 +40,7 @@
 #define DBG(fmt...)
 #endif
 
-static void __cpuinit pnv_smp_setup_cpu(int cpu)
+static void pnv_smp_setup_cpu(int cpu)
 {
        if (cpu != boot_cpuid)
                xics_setup_cpu();
@@ -51,7 +51,7 @@ static int pnv_smp_cpu_bootable(unsigned int nr)
        /* Special case - we inhibit secondary thread startup
         * during boot if the user requests it.
         */
-       if (system_state < SYSTEM_RUNNING && cpu_has_feature(CPU_FTR_SMT)) {
+       if (system_state == SYSTEM_BOOTING && cpu_has_feature(CPU_FTR_SMT)) {
                if (!smt_enabled_at_boot && cpu_thread_in_core(nr) != 0)
                        return 0;
                if (smt_enabled_at_boot
index 177a2f70700c626c07f6cd7b365d2845728d5957..3e270e3412ae69fed007e68cd9fd008b20eb24b9 100644 (file)
@@ -109,7 +109,8 @@ static long ps3_hpte_remove(unsigned long hpte_group)
 }
 
 static long ps3_hpte_updatepp(unsigned long slot, unsigned long newpp,
-       unsigned long vpn, int psize, int ssize, int local)
+                             unsigned long vpn, int psize, int apsize,
+                             int ssize, int local)
 {
        int result;
        u64 hpte_v, want_v, hpte_rs;
@@ -162,7 +163,7 @@ static void ps3_hpte_updateboltedpp(unsigned long newpp, unsigned long ea,
 }
 
 static void ps3_hpte_invalidate(unsigned long slot, unsigned long vpn,
-       int psize, int ssize, int local)
+                               int psize, int apsize, int ssize, int local)
 {
        unsigned long flags;
        int result;
index 4459eff7a75ad6b3f4135f015591e2b9575dcec4..1bd3399146ed6f945f24308ea95883bffdce8d28 100644 (file)
@@ -33,11 +33,6 @@ config PPC_SPLPAR
          processors, that is, which share physical processors between
          two or more partitions.
 
-config EEH
-       bool
-       depends on PPC_PSERIES && PCI
-       default y
-
 config PSERIES_MSI
        bool
        depends on PCI_MSI && EEH
index 53866e537a92d360994c382fa59f32ea18fd87a4..8ae010381316d961ba8bb836b54cfb650c9367f3 100644 (file)
@@ -6,9 +6,7 @@ obj-y                   := lpar.o hvCall.o nvram.o reconfig.o \
                           firmware.o power.o dlpar.o mobility.o
 obj-$(CONFIG_SMP)      += smp.o
 obj-$(CONFIG_SCANLOG)  += scanlog.o
-obj-$(CONFIG_EEH)      += eeh.o eeh_pe.o eeh_dev.o eeh_cache.o \
-                          eeh_driver.o eeh_event.o eeh_sysfs.o \
-                          eeh_pseries.o
+obj-$(CONFIG_EEH)      += eeh_pseries.o
 obj-$(CONFIG_KEXEC)    += kexec.o
 obj-$(CONFIG_PCI)      += pci.o pci_dlpar.o
 obj-$(CONFIG_PSERIES_MSI)      += msi.o
index ef9d9d84c7d5d7ee3b1c1220c1d2d080e2dd1f98..5ea88d1541f74c1aaf6c1b01cd00045ebd61badf 100644 (file)
@@ -115,7 +115,7 @@ static struct pseries_io_event * ioei_find_event(struct rtas_error_log *elog)
  *   by scope or event type alone. For example, Torrent ISR route change
  *   event is reported with scope 0x00 (Not Applicatable) rather than
  *   0x3B (Torrent-hub). It is better to let the clients to identify
- *   who owns the the event.
+ *   who owns the event.
  */
 
 static irqreturn_t ioei_interrupt(int irq, void *dev_id)
index 86ae364900d60cbde3000d44481e248e9895609a..23fc1dcf44344543a9e3336b61e5f0d9f2bf6201 100644 (file)
@@ -614,6 +614,7 @@ static void pci_dma_bus_setup_pSeries(struct pci_bus *bus)
 
        iommu_table_setparms(pci->phb, dn, tbl);
        pci->iommu_table = iommu_init_table(tbl, pci->phb->node);
+       iommu_register_group(tbl, pci_domain_nr(bus), 0);
 
        /* Divide the rest (1.75GB) among the children */
        pci->phb->dma_window_size = 0x80000000ul;
@@ -658,6 +659,7 @@ static void pci_dma_bus_setup_pSeriesLP(struct pci_bus *bus)
                                   ppci->phb->node);
                iommu_table_setparms_lpar(ppci->phb, pdn, tbl, dma_window);
                ppci->iommu_table = iommu_init_table(tbl, ppci->phb->node);
+               iommu_register_group(tbl, pci_domain_nr(bus), 0);
                pr_debug("  created table: %p\n", ppci->iommu_table);
        }
 }
@@ -684,6 +686,7 @@ static void pci_dma_dev_setup_pSeries(struct pci_dev *dev)
                                   phb->node);
                iommu_table_setparms(phb, dn, tbl);
                PCI_DN(dn)->iommu_table = iommu_init_table(tbl, phb->node);
+               iommu_register_group(tbl, pci_domain_nr(phb->bus), 0);
                set_iommu_table_base(&dev->dev, PCI_DN(dn)->iommu_table);
                return;
        }
@@ -1184,6 +1187,7 @@ static void pci_dma_dev_setup_pSeriesLP(struct pci_dev *dev)
                                   pci->phb->node);
                iommu_table_setparms_lpar(pci->phb, pdn, tbl, dma_window);
                pci->iommu_table = iommu_init_table(tbl, pci->phb->node);
+               iommu_register_group(tbl, pci_domain_nr(pci->phb->bus), 0);
                pr_debug("  created table: %p\n", pci->iommu_table);
        } else {
                pr_debug("  found DMA window, table: %p\n", pci->iommu_table);
index 6d62072a7d5a27162c695a9efe5a3d12c11213c1..02d6e21619bb6c677771c0dc6ecaabb5e1d417d5 100644 (file)
 #include "plpar_wrappers.h"
 #include "pseries.h"
 
+/* Flag bits for H_BULK_REMOVE */
+#define HBR_REQUEST    0x4000000000000000UL
+#define HBR_RESPONSE   0x8000000000000000UL
+#define HBR_END                0xc000000000000000UL
+#define HBR_AVPN       0x0200000000000000UL
+#define HBR_ANDCOND    0x0100000000000000UL
+
 
 /* in hvCall.S */
 EXPORT_SYMBOL(plpar_hcall);
@@ -64,6 +71,9 @@ void vpa_init(int cpu)
        if (cpu_has_feature(CPU_FTR_ALTIVEC))
                lppaca_of(cpu).vmxregs_in_use = 1;
 
+       if (cpu_has_feature(CPU_FTR_ARCH_207S))
+               lppaca_of(cpu).ebb_regs_in_use = 1;
+
        addr = __pa(&lppaca_of(cpu));
        ret = register_vpa(hwcpu, addr);
 
@@ -240,7 +250,8 @@ static void pSeries_lpar_hptab_clear(void)
 static long pSeries_lpar_hpte_updatepp(unsigned long slot,
                                       unsigned long newpp,
                                       unsigned long vpn,
-                                      int psize, int ssize, int local)
+                                      int psize, int apsize,
+                                      int ssize, int local)
 {
        unsigned long lpar_rc;
        unsigned long flags = (newpp & 7) | H_AVPN;
@@ -328,7 +339,8 @@ static void pSeries_lpar_hpte_updateboltedpp(unsigned long newpp,
 }
 
 static void pSeries_lpar_hpte_invalidate(unsigned long slot, unsigned long vpn,
-                                        int psize, int ssize, int local)
+                                        int psize, int apsize,
+                                        int ssize, int local)
 {
        unsigned long want_v;
        unsigned long lpar_rc;
@@ -345,6 +357,113 @@ static void pSeries_lpar_hpte_invalidate(unsigned long slot, unsigned long vpn,
        BUG_ON(lpar_rc != H_SUCCESS);
 }
 
+/*
+ * Limit iterations holding pSeries_lpar_tlbie_lock to 3. We also need
+ * to make sure that we avoid bouncing the hypervisor tlbie lock.
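+ * With two H_BULK_REMOVE parameter slots used per HPTE and eight slots
+ * per hcall, a batch of 12 entries is flushed in exactly three hcalls.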
+ */
+#define PPC64_HUGE_HPTE_BATCH 12
+
+static void __pSeries_lpar_hugepage_invalidate(unsigned long *slot,
+                                            unsigned long *vpn, int count,
+                                            int psize, int ssize)
+{
+       unsigned long param[8];
+       int i = 0, pix = 0, rc;
+       unsigned long flags = 0;
+       int lock_tlbie = !mmu_has_feature(MMU_FTR_LOCKLESS_TLBIE);
+
+       if (lock_tlbie)
+               spin_lock_irqsave(&pSeries_lpar_tlbie_lock, flags);
+
+       for (i = 0; i < count; i++) {
+
+               if (!firmware_has_feature(FW_FEATURE_BULK_REMOVE)) {
+                       pSeries_lpar_hpte_invalidate(slot[i], vpn[i], psize, 0,
+                                                    ssize, 0);
+               } else {
+                       param[pix] = HBR_REQUEST | HBR_AVPN | slot[i];
+                       param[pix+1] = hpte_encode_avpn(vpn[i], psize, ssize);
+                       pix += 2;
+                       if (pix == 8) {
+                               rc = plpar_hcall9(H_BULK_REMOVE, param,
+                                                 param[0], param[1], param[2],
+                                                 param[3], param[4], param[5],
+                                                 param[6], param[7]);
+                               BUG_ON(rc != H_SUCCESS);
+                               pix = 0;
+                       }
+               }
+       }
+       if (pix) {
+               param[pix] = HBR_END;
+               rc = plpar_hcall9(H_BULK_REMOVE, param, param[0], param[1],
+                                 param[2], param[3], param[4], param[5],
+                                 param[6], param[7]);
+               BUG_ON(rc != H_SUCCESS);
+       }
+
+       if (lock_tlbie)
+               spin_unlock_irqrestore(&pSeries_lpar_tlbie_lock, flags);
+}
+
+static void pSeries_lpar_hugepage_invalidate(struct mm_struct *mm,
+                                      unsigned char *hpte_slot_array,
+                                      unsigned long addr, int psize)
+{
+       int ssize = 0, i, index = 0;
+       unsigned long s_addr = addr;
+       unsigned int max_hpte_count, valid;
+       unsigned long vpn_array[PPC64_HUGE_HPTE_BATCH];
+       unsigned long slot_array[PPC64_HUGE_HPTE_BATCH];
+       unsigned long shift, hidx, vpn = 0, vsid, hash, slot;
+
+       shift = mmu_psize_defs[psize].shift;
+       max_hpte_count = 1U << (PMD_SHIFT - shift);
+
+       for (i = 0; i < max_hpte_count; i++) {
+               valid = hpte_valid(hpte_slot_array, i);
+               if (!valid)
+                       continue;
+               hidx =  hpte_hash_index(hpte_slot_array, i);
+
+               /* get the vpn */
+               addr = s_addr + (i * (1ul << shift));
+               if (!is_kernel_addr(addr)) {
+                       ssize = user_segment_size(addr);
+                       vsid = get_vsid(mm->context.id, addr, ssize);
+                       WARN_ON(vsid == 0);
+               } else {
+                       vsid = get_kernel_vsid(addr, mmu_kernel_ssize);
+                       ssize = mmu_kernel_ssize;
+               }
+
+               vpn = hpt_vpn(addr, vsid, ssize);
+               hash = hpt_hash(vpn, shift, ssize);
+               if (hidx & _PTEIDX_SECONDARY)
+                       hash = ~hash;
+
+               slot = (hash & htab_hash_mask) * HPTES_PER_GROUP;
+               slot += hidx & _PTEIDX_GROUP_IX;
+
+               slot_array[index] = slot;
+               vpn_array[index] = vpn;
+               if (index == PPC64_HUGE_HPTE_BATCH - 1) {
+                       /*
+                        * Now do a bulk invalidate
+                        */
+                       __pSeries_lpar_hugepage_invalidate(slot_array,
+                                                          vpn_array,
+                                                          PPC64_HUGE_HPTE_BATCH,
+                                                          psize, ssize);
+                       index = 0;
+               } else
+                       index++;
+       }
+       if (index)
+               __pSeries_lpar_hugepage_invalidate(slot_array, vpn_array,
+                                                  index, psize, ssize);
+}
+
 static void pSeries_lpar_hpte_removebolted(unsigned long ea,
                                           int psize, int ssize)
 {
@@ -356,17 +475,12 @@ static void pSeries_lpar_hpte_removebolted(unsigned long ea,
 
        slot = pSeries_lpar_hpte_find(vpn, psize, ssize);
        BUG_ON(slot == -1);
-
-       pSeries_lpar_hpte_invalidate(slot, vpn, psize, ssize, 0);
+       /*
+        * lpar doesn't use the passed actual page size
+        */
+       pSeries_lpar_hpte_invalidate(slot, vpn, psize, 0, ssize, 0);
 }
 
-/* Flag bits for H_BULK_REMOVE */
-#define HBR_REQUEST    0x4000000000000000UL
-#define HBR_RESPONSE   0x8000000000000000UL
-#define HBR_END                0xc000000000000000UL
-#define HBR_AVPN       0x0200000000000000UL
-#define HBR_ANDCOND    0x0100000000000000UL
-
 /*
  * Take a spinlock around flushes to avoid bouncing the hypervisor tlbie
  * lock.
@@ -400,8 +514,11 @@ static void pSeries_lpar_flush_hash_range(unsigned long number, int local)
                        slot = (hash & htab_hash_mask) * HPTES_PER_GROUP;
                        slot += hidx & _PTEIDX_GROUP_IX;
                        if (!firmware_has_feature(FW_FEATURE_BULK_REMOVE)) {
+                               /*
+                                * lpar doesn't use the passed actual page size
+                                */
                                pSeries_lpar_hpte_invalidate(slot, vpn, psize,
-                                                            ssize, local);
+                                                            0, ssize, local);
                        } else {
                                param[pix] = HBR_REQUEST | HBR_AVPN | slot;
                                param[pix+1] = hpte_encode_avpn(vpn, psize,
@@ -452,6 +569,7 @@ void __init hpte_init_lpar(void)
        ppc_md.hpte_removebolted = pSeries_lpar_hpte_removebolted;
        ppc_md.flush_hash_range = pSeries_lpar_flush_hash_range;
        ppc_md.hpte_clear_all   = pSeries_lpar_hptab_clear;
+       ppc_md.hugepage_invalidate = pSeries_lpar_hugepage_invalidate;
 }
 
 #ifdef CONFIG_PPC_SMLPAR
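
As a rough illustration of the H_BULK_REMOVE batching used by the new hugepage invalidate path above (a standalone sketch, not part of the patch): each plpar_hcall9() carries eight parameter words, i.e. four (slot, AVPN) pairs, so a PPC64_HUGE_HPTE_BATCH of 12 HPTEs is flushed in at most three hcalls while the tlbie lock is held. A simplified userspace model of the packing loop, with a stand-in for the hypervisor call and made-up slot/AVPN values:

/* Sketch: pack (slot, avpn) pairs into an 8-word parameter block and
 * "flush" every four pairs, mirroring __pSeries_lpar_hugepage_invalidate.
 * The slot/AVPN values below are invented for illustration only. */
#include <stdio.h>

#define HBR_REQUEST 0x4000000000000000UL
#define HBR_AVPN    0x0200000000000000UL
#define HBR_END     0xc000000000000000UL
#define BATCH       12	/* PPC64_HUGE_HPTE_BATCH */

static void bulk_remove(unsigned long *param, int pix)
{
	/* stand-in for plpar_hcall9(H_BULK_REMOVE, param, ...) */
	printf("hcall: %d words (%d HPTEs), first=0x%lx\n", pix, pix / 2, param[0]);
}

int main(void)
{
	unsigned long param[8];
	int i, pix = 0, hcalls = 0;

	for (i = 0; i < BATCH; i++) {
		param[pix]     = HBR_REQUEST | HBR_AVPN | (unsigned long)i;	/* slot */
		param[pix + 1] = 0x1234 + i;					/* fake AVPN */
		pix += 2;
		if (pix == 8) {		/* four pairs per H_BULK_REMOVE */
			bulk_remove(param, pix);
			hcalls++;
			pix = 0;
		}
	}
	if (pix) {			/* partial batch is terminated with HBR_END */
		param[pix] = HBR_END;
		bulk_remove(param, pix);
		hcalls++;
	}
	printf("%d HPTEs flushed in %d hcalls\n", BATCH, hcalls);
	return 0;
}
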
index 8733a86ad52ed6dbb2b0fc9030b08852a3ca2b62..14cc486709f6caea23497f0309eb70f1e90e0b6f 100644 (file)
@@ -18,6 +18,7 @@
 #include <linux/spinlock.h>
 #include <linux/slab.h>
 #include <linux/kmsg_dump.h>
+#include <linux/pstore.h>
 #include <linux/ctype.h>
 #include <linux/zlib.h>
 #include <asm/uaccess.h>
 /* Max bytes to read/write in one go */
 #define NVRW_CNT 0x20
 
+/*
+ * Set the oops header version to distinguish between the old and new format
+ * headers. The lnx,oops-log partition max size is 4000, so a header version
+ * greater than 4000 helps identify the new header.
+ */
+#define OOPS_HDR_VERSION 5000
+
 static unsigned int nvram_size;
 static int nvram_fetch, nvram_store;
 static char nvram_buf[NVRW_CNT];       /* assume this is in the first 4GB */
@@ -45,20 +53,23 @@ struct nvram_os_partition {
        int min_size;   /* minimum acceptable size (0 means req_size) */
        long size;      /* size of data portion (excluding err_log_info) */
        long index;     /* offset of data portion of partition */
+       bool os_partition; /* partition initialized by OS, not FW */
 };
 
 static struct nvram_os_partition rtas_log_partition = {
        .name = "ibm,rtas-log",
        .req_size = 2079,
        .min_size = 1055,
-       .index = -1
+       .index = -1,
+       .os_partition = true
 };
 
 static struct nvram_os_partition oops_log_partition = {
        .name = "lnx,oops-log",
        .req_size = 4000,
        .min_size = 2000,
-       .index = -1
+       .index = -1,
+       .os_partition = true
 };
 
 static const char *pseries_nvram_os_partitions[] = {
@@ -67,6 +78,12 @@ static const char *pseries_nvram_os_partitions[] = {
        NULL
 };
 
+struct oops_log_info {
+       u16 version;
+       u16 report_length;
+       u64 timestamp;
+} __attribute__((packed));
+
 static void oops_to_nvram(struct kmsg_dumper *dumper,
                          enum kmsg_dump_reason reason);
 
@@ -83,28 +100,28 @@ static unsigned long last_unread_rtas_event;       /* timestamp */
 
  * big_oops_buf[] holds the uncompressed text we're capturing.
  *
- * oops_buf[] holds the compressed text, preceded by a prefix.
- * The prefix is just a u16 holding the length of the compressed* text.
- * (*Or uncompressed, if compression fails.)  oops_buf[] gets written
- * to NVRAM.
+ * oops_buf[] holds the compressed text, preceded by an oops header.
+ * The oops header has a u16 holding the header version (to differentiate
+ * between the old and new format headers), followed by a u16 holding the
+ * length of the compressed* text (*or uncompressed, if compression fails)
+ * and a u64 holding the timestamp. oops_buf[] gets written to NVRAM.
  *
- * oops_len points to the prefix.  oops_data points to the compressed text.
+ * oops_log_info points to the header. oops_data points to the compressed text.
  *
  * +- oops_buf
- * |           +- oops_data
- * v           v
- * +------------+-----------------------------------------------+
- * | length    | text                                          |
- * | (2 bytes) | (oops_data_sz bytes)                          |
- * +------------+-----------------------------------------------+
+ * |                                   +- oops_data
+ * v                                   v
+ * +-----------+-----------+-----------+------------------------+
+ * | version   | length    | timestamp | text                   |
+ * | (2 bytes) | (2 bytes) | (8 bytes) | (oops_data_sz bytes)   |
+ * +-----------+-----------+-----------+------------------------+
  * ^
- * +- oops_len
+ * +- oops_log_info
  *
  * We preallocate these buffers during init to avoid kmalloc during oops/panic.
  */
 static size_t big_oops_buf_sz;
 static char *big_oops_buf, *oops_buf;
-static u16 *oops_len;
 static char *oops_data;
 static size_t oops_data_sz;
 
@@ -114,6 +131,30 @@ static size_t oops_data_sz;
 #define MEM_LEVEL 4
 static struct z_stream_s stream;
 
+#ifdef CONFIG_PSTORE
+static struct nvram_os_partition of_config_partition = {
+       .name = "of-config",
+       .index = -1,
+       .os_partition = false
+};
+
+static struct nvram_os_partition common_partition = {
+       .name = "common",
+       .index = -1,
+       .os_partition = false
+};
+
+static enum pstore_type_id nvram_type_ids[] = {
+       PSTORE_TYPE_DMESG,
+       PSTORE_TYPE_PPC_RTAS,
+       PSTORE_TYPE_PPC_OF,
+       PSTORE_TYPE_PPC_COMMON,
+       -1
+};
+static int read_type;
+static unsigned long last_rtas_event;
+#endif
+
 static ssize_t pSeries_nvram_read(char *buf, size_t count, loff_t *index)
 {
        unsigned int i;
@@ -275,48 +316,72 @@ int nvram_write_error_log(char * buff, int length,
 {
        int rc = nvram_write_os_partition(&rtas_log_partition, buff, length,
                                                err_type, error_log_cnt);
-       if (!rc)
+       if (!rc) {
                last_unread_rtas_event = get_seconds();
+#ifdef CONFIG_PSTORE
+               last_rtas_event = get_seconds();
+#endif
+       }
+
        return rc;
 }
 
-/* nvram_read_error_log
+/* nvram_read_partition
  *
- * Reads nvram for error log for at most 'length'
+ * Reads nvram partition for at most 'length'
  */
-int nvram_read_error_log(char * buff, int length,
-                         unsigned int * err_type, unsigned int * error_log_cnt)
+int nvram_read_partition(struct nvram_os_partition *part, char *buff,
+                       int length, unsigned int *err_type,
+                       unsigned int *error_log_cnt)
 {
        int rc;
        loff_t tmp_index;
        struct err_log_info info;
        
-       if (rtas_log_partition.index == -1)
+       if (part->index == -1)
                return -1;
 
-       if (length > rtas_log_partition.size)
-               length = rtas_log_partition.size;
+       if (length > part->size)
+               length = part->size;
 
-       tmp_index = rtas_log_partition.index;
+       tmp_index = part->index;
 
-       rc = ppc_md.nvram_read((char *)&info, sizeof(struct err_log_info), &tmp_index);
-       if (rc <= 0) {
-               printk(KERN_ERR "nvram_read_error_log: Failed nvram_read (%d)\n", rc);
-               return rc;
+       if (part->os_partition) {
+               rc = ppc_md.nvram_read((char *)&info,
+                                       sizeof(struct err_log_info),
+                                       &tmp_index);
+               if (rc <= 0) {
+                       pr_err("%s: Failed nvram_read (%d)\n", __FUNCTION__,
+                                                                       rc);
+                       return rc;
+               }
        }
 
        rc = ppc_md.nvram_read(buff, length, &tmp_index);
        if (rc <= 0) {
-               printk(KERN_ERR "nvram_read_error_log: Failed nvram_read (%d)\n", rc);
+               pr_err("%s: Failed nvram_read (%d)\n", __FUNCTION__, rc);
                return rc;
        }
 
-       *error_log_cnt = info.seq_num;
-       *err_type = info.error_type;
+       if (part->os_partition) {
+               *error_log_cnt = info.seq_num;
+               *err_type = info.error_type;
+       }
 
        return 0;
 }
 
+/* nvram_read_error_log
+ *
+ * Reads nvram for error log for at most 'length'
+ */
+int nvram_read_error_log(char *buff, int length,
+                       unsigned int *err_type, unsigned int *error_log_cnt)
+{
+       return nvram_read_partition(&rtas_log_partition, buff, length,
+                                               err_type, error_log_cnt);
+}
+
 /* This doesn't actually zero anything, but it sets the event_logged
  * word to tell that this event is safely in syslog.
  */
@@ -405,6 +470,192 @@ static int __init pseries_nvram_init_os_partition(struct nvram_os_partition
        return 0;
 }
 
+/*
+ * Are we using the ibm,rtas-log for oops/panic reports?  And if so,
+ * would logging this oops/panic overwrite an RTAS event that rtas_errd
+ * hasn't had a chance to read and process?  Return 1 if so, else 0.
+ *
+ * We assume that if rtas_errd hasn't read the RTAS event in
+ * NVRAM_RTAS_READ_TIMEOUT seconds, it's probably not going to.
+ */
+static int clobbering_unread_rtas_event(void)
+{
+       return (oops_log_partition.index == rtas_log_partition.index
+               && last_unread_rtas_event
+               && get_seconds() - last_unread_rtas_event <=
+                                               NVRAM_RTAS_READ_TIMEOUT);
+}
+
+#ifdef CONFIG_PSTORE
+static int nvram_pstore_open(struct pstore_info *psi)
+{
+       /* Reset the iterator to start reading partitions again */
+       read_type = -1;
+       return 0;
+}
+
+/**
+ * nvram_pstore_write - pstore write callback for nvram
+ * @type:               Type of message logged
+ * @reason:             reason behind dump (oops/panic)
+ * @id:                 identifier to indicate the write performed
+ * @part:               pstore writes data to registered buffer in parts,
+ *                      part number will indicate the same.
+ * @count:              Indicates oops count
+ * @size:               number of bytes written to the registered buffer
+ * @psi:                registered pstore_info structure
+ *
+ * Called by pstore_dump() when an oops or panic report is logged in the
+ * printk buffer.
+ * Returns 0 on successful write.
+ */
+static int nvram_pstore_write(enum pstore_type_id type,
+                               enum kmsg_dump_reason reason,
+                               u64 *id, unsigned int part, int count,
+                               size_t size, struct pstore_info *psi)
+{
+       int rc;
+       struct oops_log_info *oops_hdr = (struct oops_log_info *) oops_buf;
+
+       /* part 1 has the recent messages from printk buffer */
+       if (part > 1 || type != PSTORE_TYPE_DMESG ||
+                               clobbering_unread_rtas_event())
+               return -1;
+
+       oops_hdr->version = OOPS_HDR_VERSION;
+       oops_hdr->report_length = (u16) size;
+       oops_hdr->timestamp = get_seconds();
+       rc = nvram_write_os_partition(&oops_log_partition, oops_buf,
+               (int) (sizeof(*oops_hdr) + size), ERR_TYPE_KERNEL_PANIC,
+               count);
+
+       if (rc != 0)
+               return rc;
+
+       *id = part;
+       return 0;
+}
+
+/*
+ * Reads the oops/panic report, rtas, of-config and common partitions.
+ * Returns the length of the data we read from each partition.
+ * Returns 0 if we've been called before.
+ */
+static ssize_t nvram_pstore_read(u64 *id, enum pstore_type_id *type,
+                               int *count, struct timespec *time, char **buf,
+                               struct pstore_info *psi)
+{
+       struct oops_log_info *oops_hdr;
+       unsigned int err_type, id_no, size = 0;
+       struct nvram_os_partition *part = NULL;
+       char *buff = NULL;
+       int sig = 0;
+       loff_t p;
+
+       read_type++;
+
+       switch (nvram_type_ids[read_type]) {
+       case PSTORE_TYPE_DMESG:
+               part = &oops_log_partition;
+               *type = PSTORE_TYPE_DMESG;
+               break;
+       case PSTORE_TYPE_PPC_RTAS:
+               part = &rtas_log_partition;
+               *type = PSTORE_TYPE_PPC_RTAS;
+               time->tv_sec = last_rtas_event;
+               time->tv_nsec = 0;
+               break;
+       case PSTORE_TYPE_PPC_OF:
+               sig = NVRAM_SIG_OF;
+               part = &of_config_partition;
+               *type = PSTORE_TYPE_PPC_OF;
+               *id = PSTORE_TYPE_PPC_OF;
+               time->tv_sec = 0;
+               time->tv_nsec = 0;
+               break;
+       case PSTORE_TYPE_PPC_COMMON:
+               sig = NVRAM_SIG_SYS;
+               part = &common_partition;
+               *type = PSTORE_TYPE_PPC_COMMON;
+               *id = PSTORE_TYPE_PPC_COMMON;
+               time->tv_sec = 0;
+               time->tv_nsec = 0;
+               break;
+       default:
+               return 0;
+       }
+
+       if (!part->os_partition) {
+               p = nvram_find_partition(part->name, sig, &size);
+               if (p <= 0) {
+                       pr_err("nvram: Failed to find partition %s, "
+                               "err %d\n", part->name, (int)p);
+                       return 0;
+               }
+               part->index = p;
+               part->size = size;
+       }
+
+       buff = kmalloc(part->size, GFP_KERNEL);
+
+       if (!buff)
+               return -ENOMEM;
+
+       if (nvram_read_partition(part, buff, part->size, &err_type, &id_no)) {
+               kfree(buff);
+               return 0;
+       }
+
+       *count = 0;
+
+       if (part->os_partition)
+               *id = id_no;
+
+       if (nvram_type_ids[read_type] == PSTORE_TYPE_DMESG) {
+               oops_hdr = (struct oops_log_info *)buff;
+               *buf = buff + sizeof(*oops_hdr);
+               time->tv_sec = oops_hdr->timestamp;
+               time->tv_nsec = 0;
+               return oops_hdr->report_length;
+       }
+
+       *buf = buff;
+       return part->size;
+}
+
+static struct pstore_info nvram_pstore_info = {
+       .owner = THIS_MODULE,
+       .name = "nvram",
+       .open = nvram_pstore_open,
+       .read = nvram_pstore_read,
+       .write = nvram_pstore_write,
+};
+
+static int nvram_pstore_init(void)
+{
+       int rc = 0;
+
+       nvram_pstore_info.buf = oops_data;
+       nvram_pstore_info.bufsize = oops_data_sz;
+
+       rc = pstore_register(&nvram_pstore_info);
+       if (rc != 0)
+               pr_err("nvram: pstore_register() failed, defaults to "
+                               "kmsg_dump; returned %d\n", rc);
+       else
+               /*TODO: Support compression when pstore is configured */
+               pr_info("nvram: Compression of oops text supported only when "
+                               "pstore is not configured");
+
+       return rc;
+}
+#else
+static int nvram_pstore_init(void)
+{
+       return -1;
+}
+#endif
+
 static void __init nvram_init_oops_partition(int rtas_partition_exists)
 {
        int rc;
@@ -425,9 +676,13 @@ static void __init nvram_init_oops_partition(int rtas_partition_exists)
                                                oops_log_partition.name);
                return;
        }
-       oops_len = (u16*) oops_buf;
-       oops_data = oops_buf + sizeof(u16);
-       oops_data_sz = oops_log_partition.size - sizeof(u16);
+       oops_data = oops_buf + sizeof(struct oops_log_info);
+       oops_data_sz = oops_log_partition.size - sizeof(struct oops_log_info);
+
+       rc = nvram_pstore_init();
+
+       if (!rc)
+               return;
 
        /*
         * Figure compression (preceded by elimination of each line's <n>
@@ -501,21 +756,6 @@ int __init pSeries_nvram_init(void)
        return 0;
 }
 
-/*
- * Are we using the ibm,rtas-log for oops/panic reports?  And if so,
- * would logging this oops/panic overwrite an RTAS event that rtas_errd
- * hasn't had a chance to read and process?  Return 1 if so, else 0.
- *
- * We assume that if rtas_errd hasn't read the RTAS event in
- * NVRAM_RTAS_READ_TIMEOUT seconds, it's probably not going to.
- */
-static int clobbering_unread_rtas_event(void)
-{
-       return (oops_log_partition.index == rtas_log_partition.index
-               && last_unread_rtas_event
-               && get_seconds() - last_unread_rtas_event <=
-                                               NVRAM_RTAS_READ_TIMEOUT);
-}
 
 /* Derived from logfs_compress() */
 static int nvram_compress(const void *in, void *out, size_t inlen,
@@ -555,6 +795,7 @@ error:
 /* Compress the text from big_oops_buf into oops_buf. */
 static int zip_oops(size_t text_len)
 {
+       struct oops_log_info *oops_hdr = (struct oops_log_info *)oops_buf;
        int zipped_len = nvram_compress(big_oops_buf, oops_data, text_len,
                                                                oops_data_sz);
        if (zipped_len < 0) {
@@ -562,7 +803,9 @@ static int zip_oops(size_t text_len)
                pr_err("nvram: logging uncompressed oops/panic report\n");
                return -1;
        }
-       *oops_len = (u16) zipped_len;
+       oops_hdr->version = OOPS_HDR_VERSION;
+       oops_hdr->report_length = (u16) zipped_len;
+       oops_hdr->timestamp = get_seconds();
        return 0;
 }
 
@@ -576,6 +819,7 @@ static int zip_oops(size_t text_len)
 static void oops_to_nvram(struct kmsg_dumper *dumper,
                          enum kmsg_dump_reason reason)
 {
+       struct oops_log_info *oops_hdr = (struct oops_log_info *)oops_buf;
        static unsigned int oops_count = 0;
        static bool panicking = false;
        static DEFINE_SPINLOCK(lock);
@@ -619,14 +863,17 @@ static void oops_to_nvram(struct kmsg_dumper *dumper,
        }
        if (rc != 0) {
                kmsg_dump_rewind(dumper);
-               kmsg_dump_get_buffer(dumper, true,
+               kmsg_dump_get_buffer(dumper, false,
                                     oops_data, oops_data_sz, &text_len);
                err_type = ERR_TYPE_KERNEL_PANIC;
-               *oops_len = (u16) text_len;
+               oops_hdr->version = OOPS_HDR_VERSION;
+               oops_hdr->report_length = (u16) text_len;
+               oops_hdr->timestamp = get_seconds();
        }
 
        (void) nvram_write_os_partition(&oops_log_partition, oops_buf,
-               (int) (sizeof(*oops_len) + *oops_len), err_type, ++oops_count);
+               (int) (sizeof(*oops_hdr) + oops_hdr->report_length), err_type,
+               ++oops_count);
 
        spin_unlock_irqrestore(&lock, flags);
 }
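
For reference, the on-NVRAM layout implied by the hunks above can be checked with a standalone sketch (not part of the patch): the packed oops_log_info header occupies 12 bytes, so oops_data now starts 12 bytes into oops_buf instead of after the old 2-byte length prefix.

/* Sketch: sizes and offsets of the new oops header; mirrors the struct
 * added to nvram_64.c above, and only prints what that layout implies. */
#include <stdio.h>
#include <stddef.h>
#include <stdint.h>

struct oops_log_info {
	uint16_t version;
	uint16_t report_length;
	uint64_t timestamp;
} __attribute__((packed));

int main(void)
{
	printf("header size      : %zu bytes\n", sizeof(struct oops_log_info));		/* 12 */
	printf("version offset   : %zu\n", offsetof(struct oops_log_info, version));		/* 0 */
	printf("length offset    : %zu\n", offsetof(struct oops_log_info, report_length));	/* 2 */
	printf("timestamp offset : %zu\n", offsetof(struct oops_log_info, timestamp));		/* 4 */
	/* oops_data = oops_buf + sizeof(struct oops_log_info): text starts at byte 12 */
	return 0;
}
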
index c91b22be92889bb742569d500db73373eb5331a3..efe61374f6eae3eda2447a351787958a2f562cf8 100644 (file)
@@ -64,91 +64,6 @@ pcibios_find_pci_bus(struct device_node *dn)
 }
 EXPORT_SYMBOL_GPL(pcibios_find_pci_bus);
 
-/**
- * __pcibios_remove_pci_devices - remove all devices under this bus
- * @bus: the indicated PCI bus
- * @purge_pe: destroy the PE on removal of PCI devices
- *
- * Remove all of the PCI devices under this bus both from the
- * linux pci device tree, and from the powerpc EEH address cache.
- * By default, the corresponding PE will be destroied during the
- * normal PCI hotplug path. For PCI hotplug during EEH recovery,
- * the corresponding PE won't be destroied and deallocated.
- */
-void __pcibios_remove_pci_devices(struct pci_bus *bus, int purge_pe)
-{
-       struct pci_dev *dev, *tmp;
-       struct pci_bus *child_bus;
-
-       /* First go down child busses */
-       list_for_each_entry(child_bus, &bus->children, node)
-               __pcibios_remove_pci_devices(child_bus, purge_pe);
-
-       pr_debug("PCI: Removing devices on bus %04x:%02x\n",
-               pci_domain_nr(bus),  bus->number);
-       list_for_each_entry_safe(dev, tmp, &bus->devices, bus_list) {
-               pr_debug("     * Removing %s...\n", pci_name(dev));
-               eeh_remove_bus_device(dev, purge_pe);
-               pci_stop_and_remove_bus_device(dev);
-       }
-}
-
-/**
- * pcibios_remove_pci_devices - remove all devices under this bus
- *
- * Remove all of the PCI devices under this bus both from the
- * linux pci device tree, and from the powerpc EEH address cache.
- */
-void pcibios_remove_pci_devices(struct pci_bus *bus)
-{
-       __pcibios_remove_pci_devices(bus, 1);
-}
-EXPORT_SYMBOL_GPL(pcibios_remove_pci_devices);
-
-/**
- * pcibios_add_pci_devices - adds new pci devices to bus
- *
- * This routine will find and fixup new pci devices under
- * the indicated bus. This routine presumes that there
- * might already be some devices under this bridge, so
- * it carefully tries to add only new devices.  (And that
- * is how this routine differs from other, similar pcibios
- * routines.)
- */
-void pcibios_add_pci_devices(struct pci_bus * bus)
-{
-       int slotno, num, mode, pass, max;
-       struct pci_dev *dev;
-       struct device_node *dn = pci_bus_to_OF_node(bus);
-
-       eeh_add_device_tree_early(dn);
-
-       mode = PCI_PROBE_NORMAL;
-       if (ppc_md.pci_probe_mode)
-               mode = ppc_md.pci_probe_mode(bus);
-
-       if (mode == PCI_PROBE_DEVTREE) {
-               /* use ofdt-based probe */
-               of_rescan_bus(dn, bus);
-       } else if (mode == PCI_PROBE_NORMAL) {
-               /* use legacy probe */
-               slotno = PCI_SLOT(PCI_DN(dn->child)->devfn);
-               num = pci_scan_slot(bus, PCI_DEVFN(slotno, 0));
-               if (!num)
-                       return;
-               pcibios_setup_bus_devices(bus);
-               max = bus->busn_res.start;
-               for (pass=0; pass < 2; pass++)
-                       list_for_each_entry(dev, &bus->devices, bus_list) {
-                       if (dev->hdr_type == PCI_HEADER_TYPE_BRIDGE ||
-                           dev->hdr_type == PCI_HEADER_TYPE_CARDBUS)
-                               max = pci_scan_bridge(bus, dev, max, pass);
-               }
-       }
-       pcibios_finish_adding_to_bus(bus);
-}
-EXPORT_SYMBOL_GPL(pcibios_add_pci_devices);
-
 struct pci_controller *init_phb_dynamic(struct device_node *dn)
 {
        struct pci_controller *phb;
index c4dfccd3a3d90bf6f4a24e0377f2a890eacb57db..7b3cbde8c78378e25fb0d82ca60b58e20ffe51f5 100644 (file)
@@ -83,7 +83,7 @@ static void handle_system_shutdown(char event_modifier)
        switch (event_modifier) {
        case EPOW_SHUTDOWN_NORMAL:
                pr_emerg("Firmware initiated power off");
-               orderly_poweroff(1);
+               orderly_poweroff(true);
                break;
 
        case EPOW_SHUTDOWN_ON_UPS:
@@ -95,13 +95,13 @@ static void handle_system_shutdown(char event_modifier)
                pr_emerg("Loss of system critical functions reported by "
                        "firmware");
                pr_emerg("Check RTAS error log for details");
-               orderly_poweroff(1);
+               orderly_poweroff(true);
                break;
 
        case EPOW_SHUTDOWN_AMBIENT_TEMPERATURE_TOO_HIGH:
                pr_emerg("Ambient temperature too high reported by firmware");
                pr_emerg("Check RTAS error log for details");
-               orderly_poweroff(1);
+               orderly_poweroff(true);
                break;
 
        default:
@@ -162,7 +162,7 @@ void rtas_parse_epow_errlog(struct rtas_error_log *log)
 
        case EPOW_SYSTEM_HALT:
                pr_emerg("Firmware initiated power off");
-               orderly_poweroff(1);
+               orderly_poweroff(true);
                break;
 
        case EPOW_MAIN_ENCLOSURE:
index 12bc8c3663add73a4402074d2fa2919e01d3aa09..306643cc9dbcc6cf8d1a317a84fc98c2c4af0c10 100644 (file)
@@ -192,7 +192,7 @@ static int smp_pSeries_cpu_bootable(unsigned int nr)
        /* Special case - we inhibit secondary thread startup
         * during boot if the user requests it.
         */
-       if (system_state < SYSTEM_RUNNING && cpu_has_feature(CPU_FTR_SMT)) {
+       if (system_state == SYSTEM_BOOTING && cpu_has_feature(CPU_FTR_SMT)) {
                if (!smt_enabled_at_boot && cpu_thread_in_core(nr) != 0)
                        return 0;
                if (smt_enabled_at_boot
index d4fa03f2b6acc93c20c25e4aad898145866fbf4b..5e6ff38ea69f197a873e9f47923d9f3434107221 100644 (file)
@@ -120,6 +120,7 @@ static irqreturn_t cpm_error_interrupt(int irq, void *dev)
 
 static struct irqaction cpm_error_irqaction = {
        .handler = cpm_error_interrupt,
+       .flags = IRQF_NO_THREAD,
        .name = "error",
 };
 
index e8b6e5b8932c39fa7ff0d824d5c60f4ebe5e5fcf..2080dfeba64b3d6524c737e18ba4aa1ed48afd3e 100644 (file)
@@ -1370,10 +1370,11 @@ static inline pmd_t pmd_mkwrite(pmd_t pmd)
 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
 
 #define __HAVE_ARCH_PGTABLE_DEPOSIT
-extern void pgtable_trans_huge_deposit(struct mm_struct *mm, pgtable_t pgtable);
+extern void pgtable_trans_huge_deposit(struct mm_struct *mm, pmd_t *pmdp,
+                                      pgtable_t pgtable);
 
 #define __HAVE_ARCH_PGTABLE_WITHDRAW
-extern pgtable_t pgtable_trans_huge_withdraw(struct mm_struct *mm);
+extern pgtable_t pgtable_trans_huge_withdraw(struct mm_struct *mm, pmd_t *pmdp);
 
 static inline int pmd_trans_splitting(pmd_t pmd)
 {
index a938b548f07e2d18c5510dbe99e47cdf231265a8..1ccbffecc4d5db2c32ea5583884ec44c71571bab 100644 (file)
@@ -1117,7 +1117,8 @@ void pmdp_splitting_flush(struct vm_area_struct *vma, unsigned long address,
        }
 }
 
-void pgtable_trans_huge_deposit(struct mm_struct *mm, pgtable_t pgtable)
+void pgtable_trans_huge_deposit(struct mm_struct *mm, pmd_t *pmdp,
+                               pgtable_t pgtable)
 {
        struct list_head *lh = (struct list_head *) pgtable;
 
@@ -1131,7 +1132,7 @@ void pgtable_trans_huge_deposit(struct mm_struct *mm, pgtable_t pgtable)
        mm->pmd_huge_pte = pgtable;
 }
 
-pgtable_t pgtable_trans_huge_withdraw(struct mm_struct *mm)
+pgtable_t pgtable_trans_huge_withdraw(struct mm_struct *mm, pmd_t *pmdp)
 {
        struct list_head *lh;
        pgtable_t pgtable;
index 7619f2f792aff549905ca49d1a70b3cd7514979c..d22b92d67844e808f8fc9d1e101dffe2a7bea61b 100644 (file)
@@ -853,10 +853,11 @@ extern void update_mmu_cache_pmd(struct vm_area_struct *vma, unsigned long addr,
                                 pmd_t *pmd);
 
 #define __HAVE_ARCH_PGTABLE_DEPOSIT
-extern void pgtable_trans_huge_deposit(struct mm_struct *mm, pgtable_t pgtable);
+extern void pgtable_trans_huge_deposit(struct mm_struct *mm, pmd_t *pmdp,
+                                      pgtable_t pgtable);
 
 #define __HAVE_ARCH_PGTABLE_WITHDRAW
-extern pgtable_t pgtable_trans_huge_withdraw(struct mm_struct *mm);
+extern pgtable_t pgtable_trans_huge_withdraw(struct mm_struct *mm, pmd_t *pmdp);
 #endif
 
 /* Encode and de-code a swap entry */
index 37e7bc4c95b373aad3c9dbac1f16fe172b205890..7a91f288c7081229a6c2d2a4a1385a9673c58fc5 100644 (file)
@@ -188,7 +188,8 @@ void set_pmd_at(struct mm_struct *mm, unsigned long addr,
        }
 }
 
-void pgtable_trans_huge_deposit(struct mm_struct *mm, pgtable_t pgtable)
+void pgtable_trans_huge_deposit(struct mm_struct *mm, pmd_t *pmdp,
+                               pgtable_t pgtable)
 {
        struct list_head *lh = (struct list_head *) pgtable;
 
@@ -202,7 +203,7 @@ void pgtable_trans_huge_deposit(struct mm_struct *mm, pgtable_t pgtable)
        mm->pmd_huge_pte = pgtable;
 }
 
-pgtable_t pgtable_trans_huge_withdraw(struct mm_struct *mm)
+pgtable_t pgtable_trans_huge_withdraw(struct mm_struct *mm, pmd_t *pmdp)
 {
        struct list_head *lh;
        pgtable_t pgtable;
index c332fb98480d28e42739e9b674b7b99ce8be427d..01730b2b9954e21bd69208f51847b11f78246f5c 100644 (file)
@@ -261,4 +261,12 @@ config SHMOBILE_IOMMU_L1SIZE
        default 256 if SHMOBILE_IOMMU_ADDRSIZE_64MB
        default 128 if SHMOBILE_IOMMU_ADDRSIZE_32MB
 
+config SPAPR_TCE_IOMMU
+       bool "sPAPR TCE IOMMU Support"
+       depends on PPC_POWERNV || PPC_PSERIES
+       select IOMMU_API
+       help
+         Enables bits of IOMMU API required by VFIO. The iommu_ops
+         is not implemented as it is not necessary for VFIO.
+
 endif # IOMMU_SUPPORT
index b026896206ca4fb1ee66df1df93036dcc7b72f9b..04a50498f2576f8019488e118c49d4b7092f0fc4 100644 (file)
@@ -697,7 +697,7 @@ static ssize_t adb_read(struct file *file, char __user *buf,
        int ret = 0;
        struct adbdev_state *state = file->private_data;
        struct adb_request *req;
-       wait_queue_t wait = __WAITQUEUE_INITIALIZER(wait,current);
+       DECLARE_WAITQUEUE(wait,current);
        unsigned long flags;
 
        if (count < 2)
index 6a82388505f0a83a6fb218d14edf039595a0a92b..80d30e8e33891fded712ff97a9240e8c68c618e4 100644 (file)
@@ -181,7 +181,7 @@ static void mac_hid_stop_emulation(void)
        mac_hid_destroy_emumouse();
 }
 
-static int mac_hid_toggle_emumouse(ctl_table *table, int write,
+static int mac_hid_toggle_emumouse(struct ctl_table *table, int write,
                                   void __user *buffer, size_t *lenp,
                                   loff_t *ppos)
 {
@@ -214,7 +214,7 @@ static int mac_hid_toggle_emumouse(ctl_table *table, int write,
 }
 
 /* file(s) in /proc/sys/dev/mac_hid */
-static ctl_table mac_hid_files[] = {
+static struct ctl_table mac_hid_files[] = {
        {
                .procname       = "mouse_button_emulation",
                .data           = &mouse_emulate_buttons,
@@ -240,7 +240,7 @@ static ctl_table mac_hid_files[] = {
 };
 
 /* dir in /proc/sys/dev */
-static ctl_table mac_hid_dir[] = {
+static struct ctl_table mac_hid_dir[] = {
        {
                .procname       = "mac_hid",
                .maxlen         = 0,
@@ -251,7 +251,7 @@ static ctl_table mac_hid_dir[] = {
 };
 
 /* /proc/sys/dev itself, in case that is not there yet */
-static ctl_table mac_hid_root_dir[] = {
+static struct ctl_table mac_hid_root_dir[] = {
        {
                .procname       = "dev",
                .maxlen         = 0,
index 86511c570dd8c16cade3c153c3f1a1d497fdc573..d61f271d22078ac13166e49a79d9c03c94d5f71c 100644 (file)
@@ -259,7 +259,7 @@ cuda_probe(void)
     } while (0)
 
 static int
-cuda_init_via(void)
+__init cuda_init_via(void)
 {
     out_8(&via[DIRB], (in_8(&via[DIRB]) | TACK | TIP) & ~TREQ);        /* TACK & TIP out */
     out_8(&via[B], in_8(&via[B]) | TACK | TIP);                        /* negate them */
index af605e915d4196422681b6cd809d8aee3eadf600..7fe58b0ae8b4713a5cc135cf914160fc0e9e9883 100644 (file)
@@ -276,6 +276,7 @@ static const char *loop_names[N_LOOPS] = {
 
 static unsigned int pm121_failure_state;
 static int pm121_readjust, pm121_skipping;
+static bool pm121_overtemp;
 static s32 average_power;
 
 struct pm121_correction {
@@ -847,6 +848,7 @@ static void pm121_tick(void)
        if (new_failure & FAILURE_OVERTEMP) {
                wf_set_overtemp();
                pm121_skipping = 2;
+               pm121_overtemp = true;
        }
 
        /* We only clear the overtemp condition if overtemp is cleared
@@ -855,8 +857,10 @@ static void pm121_tick(void)
         * the control loop levels, but we don't want to keep it clear
         * here in this case
         */
-       if (new_failure == 0 && last_failure & FAILURE_OVERTEMP)
+       if (!pm121_failure_state && pm121_overtemp) {
                wf_clear_overtemp();
+               pm121_overtemp = false;
+       }
 }
 
 
index f84933ff32988abac0ca8511b997f188dc9f226e..2a5e1b15b1d2e17bd466f98edadc9b7e54ddf621 100644 (file)
@@ -149,6 +149,7 @@ static int wf_smu_all_controls_ok, wf_smu_all_sensors_ok, wf_smu_started;
 
 static unsigned int wf_smu_failure_state;
 static int wf_smu_readjust, wf_smu_skipping;
+static bool wf_smu_overtemp;
 
 /*
  * ****** System Fans Control Loop ******
@@ -593,6 +594,7 @@ static void wf_smu_tick(void)
        if (new_failure & FAILURE_OVERTEMP) {
                wf_set_overtemp();
                wf_smu_skipping = 2;
+               wf_smu_overtemp = true;
        }
 
        /* We only clear the overtemp condition if overtemp is cleared
@@ -601,8 +603,10 @@ static void wf_smu_tick(void)
         * the control loop levels, but we don't want to keep it clear
         * here in this case
         */
-       if (new_failure == 0 && last_failure & FAILURE_OVERTEMP)
+       if (!wf_smu_failure_state && wf_smu_overtemp) {
                wf_clear_overtemp();
+               wf_smu_overtemp = false;
+       }
 }
 
 static void wf_smu_new_control(struct wf_control *ct)
index 2eb484f213c84086655aae5adbedf9cebbb98cf0..a8ac66cd3b13b3b9e92f514e1ee5d70221dcd331 100644 (file)
@@ -76,6 +76,7 @@ static struct wf_control *cpufreq_clamp;
 
 /* Set to kick the control loop into life */
 static int wf_smu_all_controls_ok, wf_smu_all_sensors_ok, wf_smu_started;
+static bool wf_smu_overtemp;
 
 /* Failure handling.. could be nicer */
 #define FAILURE_FAN            0x01
@@ -517,6 +518,7 @@ static void wf_smu_tick(void)
        if (new_failure & FAILURE_OVERTEMP) {
                wf_set_overtemp();
                wf_smu_skipping = 2;
+               wf_smu_overtemp = true;
        }
 
        /* We only clear the overtemp condition if overtemp is cleared
@@ -525,8 +527,10 @@ static void wf_smu_tick(void)
         * the control loop levels, but we don't want to keep it clear
         * here in this case
         */
-       if (new_failure == 0 && last_failure & FAILURE_OVERTEMP)
+       if (!wf_smu_failure_state && wf_smu_overtemp) {
                wf_clear_overtemp();
+               wf_smu_overtemp = false;
+       }
 }
 
 
index d87f5ee04ca9b3c1638e99c7a0655256d7fc8e66..ad6223e8834043ba710c9d1bd5c41ae2a156dfbe 100644 (file)
@@ -343,7 +343,6 @@ static int wf_sat_remove(struct i2c_client *client)
                wf_unregister_sensor(&sens->sens);
        }
        sat->i2c = NULL;
-       i2c_set_clientdata(client, NULL);
        kref_put(&sat->ref, wf_sat_release);
 
        return 0;
index 7cd5dec0abd1e44a3b8eabf091b6cd42204d6877..26b3d9d1409f9e349309c5cce7dda183bea268fa 100644 (file)
@@ -3,10 +3,16 @@ config VFIO_IOMMU_TYPE1
        depends on VFIO
        default n
 
+config VFIO_IOMMU_SPAPR_TCE
+       tristate
+       depends on VFIO && SPAPR_TCE_IOMMU
+       default n
+
 menuconfig VFIO
        tristate "VFIO Non-Privileged userspace driver framework"
        depends on IOMMU_API
        select VFIO_IOMMU_TYPE1 if X86
+       select VFIO_IOMMU_SPAPR_TCE if (PPC_POWERNV || PPC_PSERIES)
        help
          VFIO provides a framework for secure userspace device drivers.
          See Documentation/vfio.txt for more details.
index 2398d4a0e38b9d93f2bb7dae81d713b50b273751..72bfabc8629e9cc5cc09276d40de96f7991459a3 100644 (file)
@@ -1,3 +1,4 @@
 obj-$(CONFIG_VFIO) += vfio.o
 obj-$(CONFIG_VFIO_IOMMU_TYPE1) += vfio_iommu_type1.o
+obj-$(CONFIG_VFIO_IOMMU_SPAPR_TCE) += vfio_iommu_spapr_tce.o
 obj-$(CONFIG_VFIO_PCI) += pci/
index 6d78736563de748c319ade5c0ff5cf210028fc23..259ad282ae5dc129ba88882493e444d517a1f582 100644 (file)
@@ -1415,6 +1415,7 @@ static int __init vfio_init(void)
         * drivers.
         */
        request_module_nowait("vfio_iommu_type1");
+       request_module_nowait("vfio_iommu_spapr_tce");
 
        return 0;
 
diff --git a/drivers/vfio/vfio_iommu_spapr_tce.c b/drivers/vfio/vfio_iommu_spapr_tce.c
new file mode 100644 (file)
index 0000000..bdae7a0
--- /dev/null
@@ -0,0 +1,377 @@
+/*
+ * VFIO: IOMMU DMA mapping support for TCE on POWER
+ *
+ * Copyright (C) 2013 IBM Corp.  All rights reserved.
+ *     Author: Alexey Kardashevskiy <aik@ozlabs.ru>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * Derived from original vfio_iommu_type1.c:
+ * Copyright (C) 2012 Red Hat, Inc.  All rights reserved.
+ *     Author: Alex Williamson <alex.williamson@redhat.com>
+ */
+
+#include <linux/module.h>
+#include <linux/pci.h>
+#include <linux/slab.h>
+#include <linux/uaccess.h>
+#include <linux/err.h>
+#include <linux/vfio.h>
+#include <asm/iommu.h>
+#include <asm/tce.h>
+
+#define DRIVER_VERSION  "0.1"
+#define DRIVER_AUTHOR   "aik@ozlabs.ru"
+#define DRIVER_DESC     "VFIO IOMMU SPAPR TCE"
+
+static void tce_iommu_detach_group(void *iommu_data,
+               struct iommu_group *iommu_group);
+
+/*
+ * VFIO IOMMU fd for SPAPR_TCE IOMMU implementation
+ *
+ * This code handles mapping and unmapping of user data buffers
+ * into DMA'ble space using the IOMMU
+ */
+
+/*
+ * The container descriptor supports only a single group per container.
+ * Required by the API as the container is not supplied with the IOMMU group
+ * at the moment of initialization.
+ */
+struct tce_container {
+       struct mutex lock;
+       struct iommu_table *tbl;
+       bool enabled;
+};
+
+static int tce_iommu_enable(struct tce_container *container)
+{
+       int ret = 0;
+       unsigned long locked, lock_limit, npages;
+       struct iommu_table *tbl = container->tbl;
+
+       if (!container->tbl)
+               return -ENXIO;
+
+       if (!current->mm)
+               return -ESRCH; /* process exited */
+
+       if (container->enabled)
+               return -EBUSY;
+
+       /*
+        * When userspace pages are mapped into the IOMMU, they are effectively
+        * locked memory, so, theoretically, we need to update the accounting
+        * of locked pages on each map and unmap.  For powerpc, the map/unmap
+        * paths can be very hot, though, and the accounting would kill
+        * performance, especially since it would be difficult, if not
+        * impossible, to handle the accounting in real mode only.
+        *
+        * To address that, rather than precisely accounting every page, we
+        * instead account for a worst case on locked memory when the iommu is
+        * enabled and disabled.  The worst case upper bound on locked memory
+        * is the size of the whole iommu window, which is usually relatively
+        * small (compared to total memory sizes) on POWER hardware.
+        *
+        * Also, we don't have a nice way to fail on H_PUT_TCE due to ulimits;
+        * that would effectively kill the guest at random points, so it is much
+        * better to enforce the limit based on the maximum the guest can map.
+        */
+       down_write(&current->mm->mmap_sem);
+       npages = (tbl->it_size << IOMMU_PAGE_SHIFT) >> PAGE_SHIFT;
+       locked = current->mm->locked_vm + npages;
+       lock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
+       if (locked > lock_limit && !capable(CAP_IPC_LOCK)) {
+               pr_warn("RLIMIT_MEMLOCK (%ld) exceeded\n",
+                               rlimit(RLIMIT_MEMLOCK));
+               ret = -ENOMEM;
+       } else {
+
+               current->mm->locked_vm += npages;
+               container->enabled = true;
+       }
+       up_write(&current->mm->mmap_sem);
+
+       return ret;
+}
+
+static void tce_iommu_disable(struct tce_container *container)
+{
+       if (!container->enabled)
+               return;
+
+       container->enabled = false;
+
+       if (!container->tbl || !current->mm)
+               return;
+
+       down_write(&current->mm->mmap_sem);
+       current->mm->locked_vm -= (container->tbl->it_size <<
+                       IOMMU_PAGE_SHIFT) >> PAGE_SHIFT;
+       up_write(&current->mm->mmap_sem);
+}
+
+static void *tce_iommu_open(unsigned long arg)
+{
+       struct tce_container *container;
+
+       if (arg != VFIO_SPAPR_TCE_IOMMU) {
+               pr_err("tce_vfio: Wrong IOMMU type\n");
+               return ERR_PTR(-EINVAL);
+       }
+
+       container = kzalloc(sizeof(*container), GFP_KERNEL);
+       if (!container)
+               return ERR_PTR(-ENOMEM);
+
+       mutex_init(&container->lock);
+
+       return container;
+}
+
+static void tce_iommu_release(void *iommu_data)
+{
+       struct tce_container *container = iommu_data;
+
+       WARN_ON(container->tbl && !container->tbl->it_group);
+       tce_iommu_disable(container);
+
+       if (container->tbl && container->tbl->it_group)
+               tce_iommu_detach_group(iommu_data, container->tbl->it_group);
+
+       mutex_destroy(&container->lock);
+
+       kfree(container);
+}
+
+static long tce_iommu_ioctl(void *iommu_data,
+                                unsigned int cmd, unsigned long arg)
+{
+       struct tce_container *container = iommu_data;
+       unsigned long minsz;
+       long ret;
+
+       switch (cmd) {
+       case VFIO_CHECK_EXTENSION:
+               return (arg == VFIO_SPAPR_TCE_IOMMU) ? 1 : 0;
+
+       case VFIO_IOMMU_SPAPR_TCE_GET_INFO: {
+               struct vfio_iommu_spapr_tce_info info;
+               struct iommu_table *tbl = container->tbl;
+
+               if (WARN_ON(!tbl))
+                       return -ENXIO;
+
+               minsz = offsetofend(struct vfio_iommu_spapr_tce_info,
+                               dma32_window_size);
+
+               if (copy_from_user(&info, (void __user *)arg, minsz))
+                       return -EFAULT;
+
+               if (info.argsz < minsz)
+                       return -EINVAL;
+
+               info.dma32_window_start = tbl->it_offset << IOMMU_PAGE_SHIFT;
+               info.dma32_window_size = tbl->it_size << IOMMU_PAGE_SHIFT;
+               info.flags = 0;
+
+               if (copy_to_user((void __user *)arg, &info, minsz))
+                       return -EFAULT;
+
+               return 0;
+       }
+       case VFIO_IOMMU_MAP_DMA: {
+               struct vfio_iommu_type1_dma_map param;
+               struct iommu_table *tbl = container->tbl;
+               unsigned long tce, i;
+
+               if (!tbl)
+                       return -ENXIO;
+
+               BUG_ON(!tbl->it_group);
+
+               minsz = offsetofend(struct vfio_iommu_type1_dma_map, size);
+
+               if (copy_from_user(&param, (void __user *)arg, minsz))
+                       return -EFAULT;
+
+               if (param.argsz < minsz)
+                       return -EINVAL;
+
+               if (param.flags & ~(VFIO_DMA_MAP_FLAG_READ |
+                               VFIO_DMA_MAP_FLAG_WRITE))
+                       return -EINVAL;
+
+               if ((param.size & ~IOMMU_PAGE_MASK) ||
+                               (param.vaddr & ~IOMMU_PAGE_MASK))
+                       return -EINVAL;
+
+               /* iova is checked by the IOMMU API */
+               tce = param.vaddr;
+               if (param.flags & VFIO_DMA_MAP_FLAG_READ)
+                       tce |= TCE_PCI_READ;
+               if (param.flags & VFIO_DMA_MAP_FLAG_WRITE)
+                       tce |= TCE_PCI_WRITE;
+
+               ret = iommu_tce_put_param_check(tbl, param.iova, tce);
+               if (ret)
+                       return ret;
+
+               for (i = 0; i < (param.size >> IOMMU_PAGE_SHIFT); ++i) {
+                       ret = iommu_put_tce_user_mode(tbl,
+                                       (param.iova >> IOMMU_PAGE_SHIFT) + i,
+                                       tce);
+                       if (ret)
+                               break;
+                       tce += IOMMU_PAGE_SIZE;
+               }
+               if (ret)
+                       iommu_clear_tces_and_put_pages(tbl,
+                                       param.iova >> IOMMU_PAGE_SHIFT, i);
+
+               iommu_flush_tce(tbl);
+
+               return ret;
+       }
+       case VFIO_IOMMU_UNMAP_DMA: {
+               struct vfio_iommu_type1_dma_unmap param;
+               struct iommu_table *tbl = container->tbl;
+
+               if (WARN_ON(!tbl))
+                       return -ENXIO;
+
+               minsz = offsetofend(struct vfio_iommu_type1_dma_unmap,
+                               size);
+
+               if (copy_from_user(&param, (void __user *)arg, minsz))
+                       return -EFAULT;
+
+               if (param.argsz < minsz)
+                       return -EINVAL;
+
+               /* No flag is supported now */
+               if (param.flags)
+                       return -EINVAL;
+
+               if (param.size & ~IOMMU_PAGE_MASK)
+                       return -EINVAL;
+
+               ret = iommu_tce_clear_param_check(tbl, param.iova, 0,
+                               param.size >> IOMMU_PAGE_SHIFT);
+               if (ret)
+                       return ret;
+
+               ret = iommu_clear_tces_and_put_pages(tbl,
+                               param.iova >> IOMMU_PAGE_SHIFT,
+                               param.size >> IOMMU_PAGE_SHIFT);
+               iommu_flush_tce(tbl);
+
+               return ret;
+       }
+       case VFIO_IOMMU_ENABLE:
+               mutex_lock(&container->lock);
+               ret = tce_iommu_enable(container);
+               mutex_unlock(&container->lock);
+               return ret;
+
+
+       case VFIO_IOMMU_DISABLE:
+               mutex_lock(&container->lock);
+               tce_iommu_disable(container);
+               mutex_unlock(&container->lock);
+               return 0;
+       }
+
+       return -ENOTTY;
+}
+
+static int tce_iommu_attach_group(void *iommu_data,
+               struct iommu_group *iommu_group)
+{
+       int ret;
+       struct tce_container *container = iommu_data;
+       struct iommu_table *tbl = iommu_group_get_iommudata(iommu_group);
+
+       BUG_ON(!tbl);
+       mutex_lock(&container->lock);
+
+       /* pr_debug("tce_vfio: Attaching group #%u to iommu %p\n",
+                       iommu_group_id(iommu_group), iommu_group); */
+       if (container->tbl) {
+               pr_warn("tce_vfio: Only one group per IOMMU container is allowed, existing id=%d, attaching id=%d\n",
+                               iommu_group_id(container->tbl->it_group),
+                               iommu_group_id(iommu_group));
+               ret = -EBUSY;
+       } else if (container->enabled) {
+               pr_err("tce_vfio: attaching group #%u to enabled container\n",
+                               iommu_group_id(iommu_group));
+               ret = -EBUSY;
+       } else {
+               ret = iommu_take_ownership(tbl);
+               if (!ret)
+                       container->tbl = tbl;
+       }
+
+       mutex_unlock(&container->lock);
+
+       return ret;
+}
+
+static void tce_iommu_detach_group(void *iommu_data,
+               struct iommu_group *iommu_group)
+{
+       struct tce_container *container = iommu_data;
+       struct iommu_table *tbl = iommu_group_get_iommudata(iommu_group);
+
+       BUG_ON(!tbl);
+       mutex_lock(&container->lock);
+       if (tbl != container->tbl) {
+               pr_warn("tce_vfio: detaching group #%u, expected group is #%u\n",
+                               iommu_group_id(iommu_group),
+                               iommu_group_id(tbl->it_group));
+       } else {
+               if (container->enabled) {
+                       pr_warn("tce_vfio: detaching group #%u from enabled container, forcing disable\n",
+                                       iommu_group_id(tbl->it_group));
+                       tce_iommu_disable(container);
+               }
+
+               /* pr_debug("tce_vfio: detaching group #%u from iommu %p\n",
+                               iommu_group_id(iommu_group), iommu_group); */
+               container->tbl = NULL;
+               iommu_release_ownership(tbl);
+       }
+       mutex_unlock(&container->lock);
+}
+
+const struct vfio_iommu_driver_ops tce_iommu_driver_ops = {
+       .name           = "iommu-vfio-powerpc",
+       .owner          = THIS_MODULE,
+       .open           = tce_iommu_open,
+       .release        = tce_iommu_release,
+       .ioctl          = tce_iommu_ioctl,
+       .attach_group   = tce_iommu_attach_group,
+       .detach_group   = tce_iommu_detach_group,
+};
+
+static int __init tce_iommu_init(void)
+{
+       return vfio_register_iommu_driver(&tce_iommu_driver_ops);
+}
+
+static void __exit tce_iommu_cleanup(void)
+{
+       vfio_unregister_iommu_driver(&tce_iommu_driver_ops);
+}
+
+module_init(tce_iommu_init);
+module_exit(tce_iommu_cleanup);
+
+MODULE_VERSION(DRIVER_VERSION);
+MODULE_LICENSE("GPL v2");
+MODULE_AUTHOR(DRIVER_AUTHOR);
+MODULE_DESCRIPTION(DRIVER_DESC);
+
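
A worked example of the worst-case locked-memory accounting done in tce_iommu_enable() above (a sketch with assumed window and page sizes, not taken from the patch): a 1 GB DMA window of 4 KB IOMMU pages is charged as 262144 pages against RLIMIT_MEMLOCK when the container is enabled, regardless of how much is actually mapped.

/* Sketch of npages = (it_size << IOMMU_PAGE_SHIFT) >> PAGE_SHIFT.
 * The 1 GB window and the shift values are assumptions for illustration. */
#include <stdio.h>

#define IOMMU_PAGE_SHIFT 12	/* 4 KB TCE pages */
#define PAGE_SHIFT       12	/* 4 KB system pages; 16 on 64 KB-page kernels */

int main(void)
{
	unsigned long it_size = (1UL << 30) >> IOMMU_PAGE_SHIFT;	/* TCE entries in a 1 GB window */
	unsigned long npages  = (it_size << IOMMU_PAGE_SHIFT) >> PAGE_SHIFT;

	printf("window entries            : %lu\n", it_size);	/* 262144 */
	printf("pages charged to locked_vm: %lu\n", npages);	/* 262144 (16384 with 64 KB pages) */
	return 0;
}
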
index e4bcb2cf055a1dc8f81df7cbb38e8cdab4add0b0..08c3d76b24ca8aa021b3440c89f999f37bb9a156 100644 (file)
@@ -324,6 +324,15 @@ int pstore_mkfile(enum pstore_type_id type, char *psname, u64 id, int count,
        case PSTORE_TYPE_MCE:
                sprintf(name, "mce-%s-%lld", psname, id);
                break;
+       case PSTORE_TYPE_PPC_RTAS:
+               sprintf(name, "rtas-%s-%lld", psname, id);
+               break;
+       case PSTORE_TYPE_PPC_OF:
+               sprintf(name, "powerpc-ofw-%s-%lld", psname, id);
+               break;
+       case PSTORE_TYPE_PPC_COMMON:
+               sprintf(name, "powerpc-common-%s-%lld", psname, id);
+               break;
        case PSTORE_TYPE_UNKNOWN:
                sprintf(name, "unknown-%s-%lld", psname, id);
                break;
index a59ff51b016695f54095e753cbfc2a5a6b684684..18e27c210716870ad86477b9dd05cfb098e99eb6 100644 (file)
@@ -173,11 +173,12 @@ extern void pmdp_splitting_flush(struct vm_area_struct *vma,
 #endif
 
 #ifndef __HAVE_ARCH_PGTABLE_DEPOSIT
-extern void pgtable_trans_huge_deposit(struct mm_struct *mm, pgtable_t pgtable);
+extern void pgtable_trans_huge_deposit(struct mm_struct *mm, pmd_t *pmdp,
+                                      pgtable_t pgtable);
 #endif
 
 #ifndef __HAVE_ARCH_PGTABLE_WITHDRAW
-extern pgtable_t pgtable_trans_huge_withdraw(struct mm_struct *mm);
+extern pgtable_t pgtable_trans_huge_withdraw(struct mm_struct *mm, pmd_t *pmdp);
 #endif
 
 #ifndef __HAVE_ARCH_PMDP_INVALIDATE
index 528454c2caa91b17cbb5761de006418909de2046..e2dbefb38e3bbc8bec6f2f36bc50521e5e8b8c44 100644 (file)
@@ -60,9 +60,9 @@ extern pmd_t *page_check_address_pmd(struct page *page,
 #define HPAGE_PMD_NR (1<<HPAGE_PMD_ORDER)
 
 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
-#define HPAGE_PMD_SHIFT HPAGE_SHIFT
-#define HPAGE_PMD_MASK HPAGE_MASK
-#define HPAGE_PMD_SIZE HPAGE_SIZE
+#define HPAGE_PMD_SHIFT PMD_SHIFT
+#define HPAGE_PMD_SIZE ((1UL) << HPAGE_PMD_SHIFT)
+#define HPAGE_PMD_MASK (~(HPAGE_PMD_SIZE - 1))
 
 extern bool is_vma_temporary_stack(struct vm_area_struct *vma);
 
index 75d01760c91197e0a44066773b52af5bc063349b..656699fcc7d7fca0e9c8641466c87f1f9d16cfc5 100644 (file)
@@ -35,6 +35,10 @@ enum pstore_type_id {
        PSTORE_TYPE_MCE         = 1,
        PSTORE_TYPE_CONSOLE     = 2,
        PSTORE_TYPE_FTRACE      = 3,
+       /* PPC64 partition types */
+       PSTORE_TYPE_PPC_RTAS    = 4,
+       PSTORE_TYPE_PPC_OF      = 5,
+       PSTORE_TYPE_PPC_COMMON  = 6,
        PSTORE_TYPE_UNKNOWN     = 255
 };
 
index 284ff243682976e6b9fcd27be9d44d9721748092..87ee4f4cff250af6aff36e502cb484a2ce8a9bba 100644 (file)
@@ -22,6 +22,7 @@
 /* Extensions */
 
 #define VFIO_TYPE1_IOMMU               1
+#define VFIO_SPAPR_TCE_IOMMU           2
 
 /*
  * The IOCTL interface is designed for extensibility by embedding the
@@ -375,4 +376,37 @@ struct vfio_iommu_type1_dma_unmap {
 
 #define VFIO_IOMMU_UNMAP_DMA _IO(VFIO_TYPE, VFIO_BASE + 14)
 
+/*
+ * IOCTLs to enable/disable IOMMU container usage.
+ * No parameters are supported.
+ */
+#define VFIO_IOMMU_ENABLE      _IO(VFIO_TYPE, VFIO_BASE + 15)
+#define VFIO_IOMMU_DISABLE     _IO(VFIO_TYPE, VFIO_BASE + 16)
+
+/* -------- Additional API for SPAPR TCE (Server POWERPC) IOMMU -------- */
+
+/*
+ * The SPAPR TCE info struct provides the information about the PCI bus
+ * address ranges available for DMA, these values are programmed into
+ * the hardware so the guest has to know that information.
+ *
+ * The DMA 32 bit window start is an absolute PCI bus address.
+ * The IOVA address passed via map/unmap ioctls are absolute PCI bus
+ * addresses too so the window works as a filter rather than an offset
+ * for IOVA addresses.
+ *
+ * A flag will need to be added if other page sizes are supported,
+ * so as defined here, it is always 4k.
+ */
+struct vfio_iommu_spapr_tce_info {
+       __u32 argsz;
+       __u32 flags;                    /* reserved for future use */
+       __u32 dma32_window_start;       /* 32 bit window start (bytes) */
+       __u32 dma32_window_size;        /* 32 bit window size (bytes) */
+};
+
+#define VFIO_IOMMU_SPAPR_TCE_GET_INFO  _IO(VFIO_TYPE, VFIO_BASE + 12)
+
+/* ***************************************************************** */
+
 #endif /* _UAPIVFIO_H */
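A hedged userspace sketch (not part of this diff) of probing the new SPAPR TCE extension and reading the 32-bit DMA window through the ioctls defined above; the group number 26 and the minimal error handling are placeholders:

#include <fcntl.h>
#include <stdio.h>
#include <sys/ioctl.h>
#include <linux/vfio.h>

int main(void)
{
        int container, group;
        struct vfio_group_status gstatus = { .argsz = sizeof(gstatus) };
        struct vfio_iommu_spapr_tce_info info = { .argsz = sizeof(info) };

        container = open("/dev/vfio/vfio", O_RDWR);
        if (container < 0)
                return 1;

        /* The new extension bit advertises SPAPR TCE support. */
        if (!ioctl(container, VFIO_CHECK_EXTENSION, VFIO_SPAPR_TCE_IOMMU)) {
                fprintf(stderr, "SPAPR TCE IOMMU not supported\n");
                return 1;
        }

        group = open("/dev/vfio/26", O_RDWR);   /* placeholder group number */
        if (group < 0)
                return 1;

        ioctl(group, VFIO_GROUP_GET_STATUS, &gstatus);
        if (!(gstatus.flags & VFIO_GROUP_FLAGS_VIABLE))
                return 1;

        ioctl(group, VFIO_GROUP_SET_CONTAINER, &container);
        ioctl(container, VFIO_SET_IOMMU, VFIO_SPAPR_TCE_IOMMU);

        /* SPAPR containers must be explicitly enabled before use. */
        ioctl(container, VFIO_IOMMU_ENABLE);

        ioctl(container, VFIO_IOMMU_SPAPR_TCE_GET_INFO, &info);
        printf("DMA32 window: start 0x%x size 0x%x\n",
               info.dma32_window_start, info.dma32_window_size);

        ioctl(container, VFIO_IOMMU_DISABLE);
        return 0;
}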
index 362c329b83fe7441b4d2119c1e164a54c58fc860..59d9384b6bbfe3392b9c7b7b9f901a8fc89a46c8 100644 (file)
@@ -729,8 +729,8 @@ static int __do_huge_pmd_anonymous_page(struct mm_struct *mm,
                pmd_t entry;
                entry = mk_huge_pmd(page, vma);
                page_add_new_anon_rmap(page, vma, haddr);
+               pgtable_trans_huge_deposit(mm, pmd, pgtable);
                set_pmd_at(mm, haddr, pmd, entry);
-               pgtable_trans_huge_deposit(mm, pgtable);
                add_mm_counter(mm, MM_ANONPAGES, HPAGE_PMD_NR);
                mm->nr_ptes++;
                spin_unlock(&mm->page_table_lock);
@@ -771,8 +771,8 @@ static bool set_huge_zero_page(pgtable_t pgtable, struct mm_struct *mm,
        entry = mk_pmd(zero_page, vma->vm_page_prot);
        entry = pmd_wrprotect(entry);
        entry = pmd_mkhuge(entry);
+       pgtable_trans_huge_deposit(mm, pmd, pgtable);
        set_pmd_at(mm, haddr, pmd, entry);
-       pgtable_trans_huge_deposit(mm, pgtable);
        mm->nr_ptes++;
        return true;
 }
@@ -916,8 +916,8 @@ int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm,
 
        pmdp_set_wrprotect(src_mm, addr, src_pmd);
        pmd = pmd_mkold(pmd_wrprotect(pmd));
+       pgtable_trans_huge_deposit(dst_mm, dst_pmd, pgtable);
        set_pmd_at(dst_mm, addr, dst_pmd, pmd);
-       pgtable_trans_huge_deposit(dst_mm, pgtable);
        dst_mm->nr_ptes++;
 
        ret = 0;
@@ -987,7 +987,7 @@ static int do_huge_pmd_wp_zero_page_fallback(struct mm_struct *mm,
        pmdp_clear_flush(vma, haddr, pmd);
        /* leave pmd empty until pte is filled */
 
-       pgtable = pgtable_trans_huge_withdraw(mm);
+       pgtable = pgtable_trans_huge_withdraw(mm, pmd);
        pmd_populate(mm, &_pmd, pgtable);
 
        for (i = 0; i < HPAGE_PMD_NR; i++, haddr += PAGE_SIZE) {
@@ -1085,7 +1085,7 @@ static int do_huge_pmd_wp_page_fallback(struct mm_struct *mm,
        pmdp_clear_flush(vma, haddr, pmd);
        /* leave pmd empty until pte is filled */
 
-       pgtable = pgtable_trans_huge_withdraw(mm);
+       pgtable = pgtable_trans_huge_withdraw(mm, pmd);
        pmd_populate(mm, &_pmd, pgtable);
 
        for (i = 0; i < HPAGE_PMD_NR; i++, haddr += PAGE_SIZE) {
@@ -1265,7 +1265,9 @@ struct page *follow_trans_huge_pmd(struct vm_area_struct *vma,
                 * young bit, instead of the current set_pmd_at.
                 */
                _pmd = pmd_mkyoung(pmd_mkdirty(*pmd));
-               set_pmd_at(mm, addr & HPAGE_PMD_MASK, pmd, _pmd);
+               if (pmdp_set_access_flags(vma, addr & HPAGE_PMD_MASK,
+                                         pmd, _pmd,  1))
+                       update_mmu_cache_pmd(vma, addr, pmd);
        }
        if ((flags & FOLL_MLOCK) && (vma->vm_flags & VM_LOCKED)) {
                if (page->mapping && trylock_page(page)) {
@@ -1358,9 +1360,15 @@ int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
                struct page *page;
                pgtable_t pgtable;
                pmd_t orig_pmd;
-               pgtable = pgtable_trans_huge_withdraw(tlb->mm);
+               /*
+                * For architectures like ppc64 we look at the deposited pgtable
+                * when calling pmdp_get_and_clear, so do the
+                * pgtable_trans_huge_withdraw after finishing pmdp-related
+                * operations.
+                */
                orig_pmd = pmdp_get_and_clear(tlb->mm, addr, pmd);
                tlb_remove_pmd_tlb_entry(tlb, pmd, addr);
+               pgtable = pgtable_trans_huge_withdraw(tlb->mm, pmd);
                if (is_huge_zero_pmd(orig_pmd)) {
                        tlb->mm->nr_ptes--;
                        spin_unlock(&tlb->mm->page_table_lock);
@@ -1691,7 +1699,7 @@ static int __split_huge_page_map(struct page *page,
        pmd = page_check_address_pmd(page, mm, address,
                                     PAGE_CHECK_ADDRESS_PMD_SPLITTING_FLAG);
        if (pmd) {
-               pgtable = pgtable_trans_huge_withdraw(mm);
+               pgtable = pgtable_trans_huge_withdraw(mm, pmd);
                pmd_populate(mm, &_pmd, pgtable);
 
                haddr = address;
@@ -2359,9 +2367,9 @@ static void collapse_huge_page(struct mm_struct *mm,
        spin_lock(&mm->page_table_lock);
        BUG_ON(!pmd_none(*pmd));
        page_add_new_anon_rmap(new_page, vma, address);
+       pgtable_trans_huge_deposit(mm, pmd, pgtable);
        set_pmd_at(mm, address, pmd, _pmd);
        update_mmu_cache_pmd(vma, address, pmd);
-       pgtable_trans_huge_deposit(mm, pgtable);
        spin_unlock(&mm->page_table_lock);
 
        *hpage = NULL;
@@ -2667,7 +2675,7 @@ static void __split_huge_zero_page_pmd(struct vm_area_struct *vma,
        pmdp_clear_flush(vma, haddr, pmd);
        /* leave pmd empty until pte is filled */
 
-       pgtable = pgtable_trans_huge_withdraw(mm);
+       pgtable = pgtable_trans_huge_withdraw(mm, pmd);
        pmd_populate(mm, &_pmd, pgtable);
 
        for (i = 0; i < HPAGE_PMD_NR; i++, haddr += PAGE_SIZE) {
index 0c8323fe6c8f610b4068a2232eb4459c23f765f6..e1a6e4fab016200e94ec2c9e96b1c7f88281ed22 100644 (file)
@@ -124,7 +124,8 @@ void pmdp_splitting_flush(struct vm_area_struct *vma, unsigned long address,
 
 #ifndef __HAVE_ARCH_PGTABLE_DEPOSIT
 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
-void pgtable_trans_huge_deposit(struct mm_struct *mm, pgtable_t pgtable)
+void pgtable_trans_huge_deposit(struct mm_struct *mm, pmd_t *pmdp,
+                               pgtable_t pgtable)
 {
        assert_spin_locked(&mm->page_table_lock);
 
@@ -141,7 +142,7 @@ void pgtable_trans_huge_deposit(struct mm_struct *mm, pgtable_t pgtable)
 #ifndef __HAVE_ARCH_PGTABLE_WITHDRAW
 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
 /* no "address" argument so destroys page coloring of some arch */
-pgtable_t pgtable_trans_huge_withdraw(struct mm_struct *mm)
+pgtable_t pgtable_trans_huge_withdraw(struct mm_struct *mm, pmd_t *pmdp)
 {
        pgtable_t pgtable;