git.karo-electronics.de Git - karo-tx-linux.git / commitdiff
Merge remote-tracking branch 'tip/auto-latest'
author    Stephen Rothwell <sfr@canb.auug.org.au>
          Wed, 5 Sep 2012 03:16:58 +0000 (13:16 +1000)
committer Stephen Rothwell <sfr@canb.auug.org.au>
          Wed, 5 Sep 2012 03:16:58 +0000 (13:16 +1000)
Conflicts:
arch/Kconfig

412 files changed:
Documentation/kernel-parameters.txt
MAINTAINERS
Makefile
arch/Kconfig
arch/ia64/Kconfig
arch/ia64/include/asm/switch_to.h
arch/ia64/kernel/time.c
arch/powerpc/include/asm/time.h
arch/powerpc/kernel/process.c
arch/powerpc/kernel/time.c
arch/powerpc/platforms/Kconfig.cputype
arch/s390/Kconfig
arch/s390/include/asm/switch_to.h
arch/s390/kernel/vtime.c
arch/tile/include/asm/topology.h
arch/x86/Kconfig
arch/x86/boot/header.S
arch/x86/include/asm/alternative.h
arch/x86/include/asm/ftrace.h
arch/x86/include/asm/kprobes.h
arch/x86/include/asm/microcode.h
arch/x86/include/asm/perf_event.h
arch/x86/include/asm/perf_regs.h [new file with mode: 0644]
arch/x86/kernel/Makefile
arch/x86/kernel/alternative.c
arch/x86/kernel/cpu/amd.c
arch/x86/kernel/cpu/common.c
arch/x86/kernel/cpu/intel.c
arch/x86/kernel/cpu/mcheck/mce-inject.c
arch/x86/kernel/cpu/mcheck/mce-internal.h
arch/x86/kernel/cpu/mcheck/mce.c
arch/x86/kernel/cpu/mcheck/mce_intel.c
arch/x86/kernel/cpu/perf_event_intel.c
arch/x86/kernel/cpu/perf_event_intel_lbr.c
arch/x86/kernel/devicetree.c
arch/x86/kernel/entry_32.S
arch/x86/kernel/entry_64.S
arch/x86/kernel/ftrace.c
arch/x86/kernel/i8259.c
arch/x86/kernel/kprobes.c
arch/x86/kernel/microcode_amd.c
arch/x86/kernel/microcode_core.c
arch/x86/kernel/microcode_intel.c
arch/x86/kernel/perf_regs.c [new file with mode: 0644]
arch/x86/kernel/smpboot.c
arch/x86/kernel/x8664_ksyms_64.c
arch/x86/xen/smp.c
drivers/infiniband/hw/ehca/ehca_irq.c
drivers/infiniband/hw/ehca/ehca_irq.h
include/linux/ftrace.h
include/linux/kernel_stat.h
include/linux/kprobes.h
include/linux/kthread.h
include/linux/perf_event.h
include/linux/perf_regs.h [new file with mode: 0644]
include/linux/sched.h
include/linux/smpboot.h [new file with mode: 0644]
include/linux/timer.h
include/linux/topology.h
include/linux/uprobes.h
init/Kconfig
kernel/Makefile
kernel/cpu.c
kernel/events/callchain.c
kernel/events/core.c
kernel/events/hw_breakpoint.c
kernel/events/internal.h
kernel/events/ring_buffer.c
kernel/events/uprobes.c
kernel/fork.c
kernel/irq/chip.c
kernel/irq/dummychip.c
kernel/kprobes.c
kernel/kthread.c
kernel/rcutree.c
kernel/rcutree.h
kernel/rcutree_plugin.h
kernel/rcutree_trace.c
kernel/sched/Makefile
kernel/sched/core.c
kernel/sched/cputime.c [new file with mode: 0644]
kernel/sched/fair.c
kernel/sched/features.h
kernel/sched/rt.c
kernel/sched/sched.h
kernel/smpboot.c
kernel/smpboot.h
kernel/softirq.c
kernel/sysctl.c
kernel/time/tick-sched.c
kernel/timer.c
kernel/trace/Kconfig
kernel/trace/Makefile
kernel/trace/ftrace.c
kernel/trace/ring_buffer.c
kernel/trace/trace.c
kernel/trace/trace.h
kernel/trace/trace_event_perf.c
kernel/trace/trace_events.c
kernel/trace/trace_events_filter.c
kernel/trace/trace_functions.c
kernel/trace/trace_functions_graph.c
kernel/trace/trace_irqsoff.c
kernel/trace/trace_sched_wakeup.c
kernel/trace/trace_selftest.c
kernel/trace/trace_stack.c
kernel/watchdog.c
scripts/kconfig/Makefile
scripts/recordmcount.h
tools/kvm/.gitignore [new file with mode: 0644]
tools/kvm/CREDITS-Git [new file with mode: 0644]
tools/kvm/Documentation/kernel-debugging.txt [new file with mode: 0644]
tools/kvm/Documentation/kvm-balloon.txt [new file with mode: 0644]
tools/kvm/Documentation/kvm-debug.txt [new file with mode: 0644]
tools/kvm/Documentation/kvm-list.txt [new file with mode: 0644]
tools/kvm/Documentation/kvm-pause.txt [new file with mode: 0644]
tools/kvm/Documentation/kvm-resume.txt [new file with mode: 0644]
tools/kvm/Documentation/kvm-run.txt [new file with mode: 0644]
tools/kvm/Documentation/kvm-sandbox.txt [new file with mode: 0644]
tools/kvm/Documentation/kvm-setup.txt [new file with mode: 0644]
tools/kvm/Documentation/kvm-stat.txt [new file with mode: 0644]
tools/kvm/Documentation/kvm-stop.txt [new file with mode: 0644]
tools/kvm/Documentation/kvm-version.txt [new file with mode: 0644]
tools/kvm/Documentation/virtio-console.txt [new file with mode: 0644]
tools/kvm/Makefile [new file with mode: 0644]
tools/kvm/README [new file with mode: 0644]
tools/kvm/builtin-balloon.c [new file with mode: 0644]
tools/kvm/builtin-debug.c [new file with mode: 0644]
tools/kvm/builtin-help.c [new file with mode: 0644]
tools/kvm/builtin-list.c [new file with mode: 0644]
tools/kvm/builtin-pause.c [new file with mode: 0644]
tools/kvm/builtin-resume.c [new file with mode: 0644]
tools/kvm/builtin-run.c [new file with mode: 0644]
tools/kvm/builtin-sandbox.c [new file with mode: 0644]
tools/kvm/builtin-setup.c [new file with mode: 0644]
tools/kvm/builtin-stat.c [new file with mode: 0644]
tools/kvm/builtin-stop.c [new file with mode: 0644]
tools/kvm/builtin-version.c [new file with mode: 0644]
tools/kvm/code16gcc.h [new file with mode: 0644]
tools/kvm/command-list.txt [new file with mode: 0644]
tools/kvm/config/feature-tests.mak [new file with mode: 0644]
tools/kvm/config/utilities.mak [new file with mode: 0644]
tools/kvm/disk/blk.c [new file with mode: 0644]
tools/kvm/disk/core.c [new file with mode: 0644]
tools/kvm/disk/qcow.c [new file with mode: 0644]
tools/kvm/disk/raw.c [new file with mode: 0644]
tools/kvm/framebuffer.c [new file with mode: 0644]
tools/kvm/guest/init.c [new file with mode: 0644]
tools/kvm/guest_compat.c [new file with mode: 0644]
tools/kvm/hw/i8042.c [new file with mode: 0644]
tools/kvm/hw/pci-shmem.c [new file with mode: 0644]
tools/kvm/hw/rtc.c [new file with mode: 0644]
tools/kvm/hw/serial.c [new file with mode: 0644]
tools/kvm/hw/vesa.c [new file with mode: 0644]
tools/kvm/include/asm/hweight.h [new file with mode: 0644]
tools/kvm/include/bios/memcpy.h [new file with mode: 0644]
tools/kvm/include/kvm/8250-serial.h [new file with mode: 0644]
tools/kvm/include/kvm/apic.h [new file with mode: 0644]
tools/kvm/include/kvm/brlock.h [new file with mode: 0644]
tools/kvm/include/kvm/builtin-balloon.h [new file with mode: 0644]
tools/kvm/include/kvm/builtin-debug.h [new file with mode: 0644]
tools/kvm/include/kvm/builtin-help.h [new file with mode: 0644]
tools/kvm/include/kvm/builtin-list.h [new file with mode: 0644]
tools/kvm/include/kvm/builtin-pause.h [new file with mode: 0644]
tools/kvm/include/kvm/builtin-resume.h [new file with mode: 0644]
tools/kvm/include/kvm/builtin-run.h [new file with mode: 0644]
tools/kvm/include/kvm/builtin-sandbox.h [new file with mode: 0644]
tools/kvm/include/kvm/builtin-setup.h [new file with mode: 0644]
tools/kvm/include/kvm/builtin-stat.h [new file with mode: 0644]
tools/kvm/include/kvm/builtin-stop.h [new file with mode: 0644]
tools/kvm/include/kvm/builtin-version.h [new file with mode: 0644]
tools/kvm/include/kvm/compiler.h [new file with mode: 0644]
tools/kvm/include/kvm/disk-image.h [new file with mode: 0644]
tools/kvm/include/kvm/e820.h [new file with mode: 0644]
tools/kvm/include/kvm/framebuffer.h [new file with mode: 0644]
tools/kvm/include/kvm/guest_compat.h [new file with mode: 0644]
tools/kvm/include/kvm/i8042.h [new file with mode: 0644]
tools/kvm/include/kvm/ioeventfd.h [new file with mode: 0644]
tools/kvm/include/kvm/ioport.h [new file with mode: 0644]
tools/kvm/include/kvm/irq.h [new file with mode: 0644]
tools/kvm/include/kvm/kvm-cmd.h [new file with mode: 0644]
tools/kvm/include/kvm/kvm-cpu.h [new file with mode: 0644]
tools/kvm/include/kvm/kvm-ipc.h [new file with mode: 0644]
tools/kvm/include/kvm/kvm.h [new file with mode: 0644]
tools/kvm/include/kvm/msi.h [new file with mode: 0644]
tools/kvm/include/kvm/mutex.h [new file with mode: 0644]
tools/kvm/include/kvm/parse-options.h [new file with mode: 0644]
tools/kvm/include/kvm/pci-shmem.h [new file with mode: 0644]
tools/kvm/include/kvm/pci.h [new file with mode: 0644]
tools/kvm/include/kvm/qcow.h [new file with mode: 0644]
tools/kvm/include/kvm/rbtree-interval.h [new file with mode: 0644]
tools/kvm/include/kvm/read-write.h [new file with mode: 0644]
tools/kvm/include/kvm/rtc.h [new file with mode: 0644]
tools/kvm/include/kvm/rwsem.h [new file with mode: 0644]
tools/kvm/include/kvm/sdl.h [new file with mode: 0644]
tools/kvm/include/kvm/segment.h [new file with mode: 0644]
tools/kvm/include/kvm/strbuf.h [new file with mode: 0644]
tools/kvm/include/kvm/symbol.h [new file with mode: 0644]
tools/kvm/include/kvm/term.h [new file with mode: 0644]
tools/kvm/include/kvm/threadpool.h [new file with mode: 0644]
tools/kvm/include/kvm/types.h [new file with mode: 0644]
tools/kvm/include/kvm/uip.h [new file with mode: 0644]
tools/kvm/include/kvm/util.h [new file with mode: 0644]
tools/kvm/include/kvm/vesa.h [new file with mode: 0644]
tools/kvm/include/kvm/virtio-9p.h [new file with mode: 0644]
tools/kvm/include/kvm/virtio-balloon.h [new file with mode: 0644]
tools/kvm/include/kvm/virtio-blk.h [new file with mode: 0644]
tools/kvm/include/kvm/virtio-console.h [new file with mode: 0644]
tools/kvm/include/kvm/virtio-mmio.h [new file with mode: 0644]
tools/kvm/include/kvm/virtio-net.h [new file with mode: 0644]
tools/kvm/include/kvm/virtio-pci-dev.h [new file with mode: 0644]
tools/kvm/include/kvm/virtio-pci.h [new file with mode: 0644]
tools/kvm/include/kvm/virtio-rng.h [new file with mode: 0644]
tools/kvm/include/kvm/virtio-scsi.h [new file with mode: 0644]
tools/kvm/include/kvm/virtio.h [new file with mode: 0644]
tools/kvm/include/kvm/vnc.h [new file with mode: 0644]
tools/kvm/include/linux/bitops.h [new file with mode: 0644]
tools/kvm/include/linux/byteorder.h [new file with mode: 0644]
tools/kvm/include/linux/compiler.h [new file with mode: 0644]
tools/kvm/include/linux/kernel.h [new file with mode: 0644]
tools/kvm/include/linux/module.h [new file with mode: 0644]
tools/kvm/include/linux/prefetch.h [new file with mode: 0644]
tools/kvm/include/linux/stddef.h [new file with mode: 0644]
tools/kvm/include/linux/types.h [new file with mode: 0644]
tools/kvm/ioeventfd.c [new file with mode: 0644]
tools/kvm/ioport.c [new file with mode: 0644]
tools/kvm/kvm-cmd.c [new file with mode: 0644]
tools/kvm/kvm-cpu.c [new file with mode: 0644]
tools/kvm/kvm-ipc.c [new file with mode: 0644]
tools/kvm/kvm.c [new file with mode: 0644]
tools/kvm/main.c [new file with mode: 0644]
tools/kvm/mmio.c [new file with mode: 0644]
tools/kvm/net/uip/arp.c [new file with mode: 0644]
tools/kvm/net/uip/buf.c [new file with mode: 0644]
tools/kvm/net/uip/core.c [new file with mode: 0644]
tools/kvm/net/uip/csum.c [new file with mode: 0644]
tools/kvm/net/uip/dhcp.c [new file with mode: 0644]
tools/kvm/net/uip/icmp.c [new file with mode: 0644]
tools/kvm/net/uip/ipv4.c [new file with mode: 0644]
tools/kvm/net/uip/tcp.c [new file with mode: 0644]
tools/kvm/net/uip/udp.c [new file with mode: 0644]
tools/kvm/pci.c [new file with mode: 0644]
tools/kvm/powerpc/boot.c [new file with mode: 0644]
tools/kvm/powerpc/cpu_info.c [new file with mode: 0644]
tools/kvm/powerpc/cpu_info.h [new file with mode: 0644]
tools/kvm/powerpc/include/kvm/barrier.h [new file with mode: 0644]
tools/kvm/powerpc/include/kvm/kvm-arch.h [new file with mode: 0644]
tools/kvm/powerpc/include/kvm/kvm-cpu-arch.h [new file with mode: 0644]
tools/kvm/powerpc/ioport.c [new file with mode: 0644]
tools/kvm/powerpc/irq.c [new file with mode: 0644]
tools/kvm/powerpc/kvm-cpu.c [new file with mode: 0644]
tools/kvm/powerpc/kvm.c [new file with mode: 0644]
tools/kvm/powerpc/spapr.h [new file with mode: 0644]
tools/kvm/powerpc/spapr_hcall.c [new file with mode: 0644]
tools/kvm/powerpc/spapr_hvcons.c [new file with mode: 0644]
tools/kvm/powerpc/spapr_hvcons.h [new file with mode: 0644]
tools/kvm/powerpc/spapr_pci.c [new file with mode: 0644]
tools/kvm/powerpc/spapr_pci.h [new file with mode: 0644]
tools/kvm/powerpc/spapr_rtas.c [new file with mode: 0644]
tools/kvm/powerpc/xics.c [new file with mode: 0644]
tools/kvm/powerpc/xics.h [new file with mode: 0644]
tools/kvm/symbol.c [new file with mode: 0644]
tools/kvm/term.c [new file with mode: 0644]
tools/kvm/tests/Makefile [new file with mode: 0644]
tools/kvm/tests/boot/Makefile [new file with mode: 0644]
tools/kvm/tests/boot/init.c [new file with mode: 0644]
tools/kvm/tests/kernel/.gitignore [new file with mode: 0644]
tools/kvm/tests/kernel/Makefile [new file with mode: 0644]
tools/kvm/tests/kernel/README [new file with mode: 0644]
tools/kvm/tests/kernel/kernel.S [new file with mode: 0644]
tools/kvm/tests/pit/.gitignore [new file with mode: 0644]
tools/kvm/tests/pit/Makefile [new file with mode: 0644]
tools/kvm/tests/pit/README [new file with mode: 0644]
tools/kvm/tests/pit/tick.S [new file with mode: 0644]
tools/kvm/ui/sdl.c [new file with mode: 0644]
tools/kvm/ui/vnc.c [new file with mode: 0644]
tools/kvm/util/KVMTOOLS-VERSION-GEN [new file with mode: 0755]
tools/kvm/util/generate-cmdlist.sh [new file with mode: 0755]
tools/kvm/util/kvm-ifup-vbr0 [new file with mode: 0755]
tools/kvm/util/parse-options.c [new file with mode: 0644]
tools/kvm/util/rbtree-interval.c [new file with mode: 0644]
tools/kvm/util/read-write.c [new file with mode: 0644]
tools/kvm/util/set_private_br.sh [new file with mode: 0755]
tools/kvm/util/strbuf.c [new file with mode: 0644]
tools/kvm/util/threadpool.c [new file with mode: 0644]
tools/kvm/util/util.c [new file with mode: 0644]
tools/kvm/virtio/9p-pdu.c [new file with mode: 0644]
tools/kvm/virtio/9p.c [new file with mode: 0644]
tools/kvm/virtio/balloon.c [new file with mode: 0644]
tools/kvm/virtio/blk.c [new file with mode: 0644]
tools/kvm/virtio/console.c [new file with mode: 0644]
tools/kvm/virtio/core.c [new file with mode: 0644]
tools/kvm/virtio/mmio.c [new file with mode: 0644]
tools/kvm/virtio/net.c [new file with mode: 0644]
tools/kvm/virtio/pci.c [new file with mode: 0644]
tools/kvm/virtio/rng.c [new file with mode: 0644]
tools/kvm/virtio/scsi.c [new file with mode: 0644]
tools/kvm/x86/bios.c [new file with mode: 0644]
tools/kvm/x86/bios/.gitignore [new file with mode: 0644]
tools/kvm/x86/bios/bios-rom.S [new file with mode: 0644]
tools/kvm/x86/bios/e820.c [new file with mode: 0644]
tools/kvm/x86/bios/entry.S [new file with mode: 0644]
tools/kvm/x86/bios/gen-offsets.sh [new file with mode: 0644]
tools/kvm/x86/bios/int10.c [new file with mode: 0644]
tools/kvm/x86/bios/int15.c [new file with mode: 0644]
tools/kvm/x86/bios/local.S [new file with mode: 0644]
tools/kvm/x86/bios/macro.S [new file with mode: 0644]
tools/kvm/x86/bios/memcpy.c [new file with mode: 0644]
tools/kvm/x86/bios/rom.ld.S [new file with mode: 0644]
tools/kvm/x86/boot.c [new file with mode: 0644]
tools/kvm/x86/cpuid.c [new file with mode: 0644]
tools/kvm/x86/include/kvm/assembly.h [new file with mode: 0644]
tools/kvm/x86/include/kvm/barrier.h [new file with mode: 0644]
tools/kvm/x86/include/kvm/bios-export.h [new file with mode: 0644]
tools/kvm/x86/include/kvm/bios.h [new file with mode: 0644]
tools/kvm/x86/include/kvm/boot-protocol.h [new file with mode: 0644]
tools/kvm/x86/include/kvm/cpufeature.h [new file with mode: 0644]
tools/kvm/x86/include/kvm/interrupt.h [new file with mode: 0644]
tools/kvm/x86/include/kvm/kvm-arch.h [new file with mode: 0644]
tools/kvm/x86/include/kvm/kvm-cpu-arch.h [new file with mode: 0644]
tools/kvm/x86/include/kvm/mptable.h [new file with mode: 0644]
tools/kvm/x86/interrupt.c [new file with mode: 0644]
tools/kvm/x86/ioport.c [new file with mode: 0644]
tools/kvm/x86/irq.c [new file with mode: 0644]
tools/kvm/x86/kvm-cpu.c [new file with mode: 0644]
tools/kvm/x86/kvm.c [new file with mode: 0644]
tools/kvm/x86/mptable.c [new file with mode: 0644]
tools/lib/traceevent/event-parse.c
tools/lib/traceevent/event-parse.h
tools/lib/traceevent/event-utils.h
tools/perf/Documentation/Makefile
tools/perf/Documentation/jit-interface.txt [new file with mode: 0644]
tools/perf/Documentation/perf-list.txt
tools/perf/Documentation/perf-script-perl.txt
tools/perf/Documentation/perf-script-python.txt
tools/perf/Makefile
tools/perf/arch/x86/Makefile
tools/perf/arch/x86/include/perf_regs.h [new file with mode: 0644]
tools/perf/arch/x86/util/unwind.c [new file with mode: 0644]
tools/perf/bash_completion [new file with mode: 0644]
tools/perf/builtin-buildid-list.c
tools/perf/builtin-inject.c
tools/perf/builtin-kmem.c
tools/perf/builtin-list.c
tools/perf/builtin-lock.c
tools/perf/builtin-record.c
tools/perf/builtin-report.c
tools/perf/builtin-sched.c
tools/perf/builtin-script.c
tools/perf/builtin-stat.c
tools/perf/builtin-test.c
tools/perf/builtin-top.c
tools/perf/command-list.txt
tools/perf/config/feature-tests.mak
tools/perf/perf.c
tools/perf/perf.h
tools/perf/scripts/python/Perf-Trace-Util/lib/Perf/Trace/EventClass.py [new file with mode: 0755]
tools/perf/scripts/python/event_analyzing_sample.py [new file with mode: 0644]
tools/perf/ui/browsers/hists.c
tools/perf/ui/gtk/browser.c
tools/perf/ui/gtk/gtk.h
tools/perf/ui/gtk/helpline.c [new file with mode: 0644]
tools/perf/ui/gtk/setup.c
tools/perf/ui/gtk/util.c
tools/perf/ui/helpline.c
tools/perf/ui/helpline.h
tools/perf/ui/setup.c
tools/perf/ui/stdio/hist.c [new file with mode: 0644]
tools/perf/ui/tui/helpline.c [new file with mode: 0644]
tools/perf/ui/tui/setup.c
tools/perf/util/debug.c
tools/perf/util/debug.h
tools/perf/util/event.c
tools/perf/util/event.h
tools/perf/util/evlist.c
tools/perf/util/evlist.h
tools/perf/util/evsel.c
tools/perf/util/evsel.h
tools/perf/util/generate-cmdlist.sh
tools/perf/util/header.c
tools/perf/util/header.h
tools/perf/util/hist.c
tools/perf/util/hist.h
tools/perf/util/include/linux/compiler.h
tools/perf/util/map.c
tools/perf/util/map.h
tools/perf/util/parse-events-test.c
tools/perf/util/parse-events.c
tools/perf/util/parse-events.h
tools/perf/util/parse-events.l
tools/perf/util/parse-events.y
tools/perf/util/perf_regs.h [new file with mode: 0644]
tools/perf/util/pmu.c
tools/perf/util/pmu.h
tools/perf/util/python.c
tools/perf/util/scripting-engines/trace-event-perl.c
tools/perf/util/scripting-engines/trace-event-python.c
tools/perf/util/session.c
tools/perf/util/session.h
tools/perf/util/symbol-elf.c [new file with mode: 0644]
tools/perf/util/symbol-minimal.c [new file with mode: 0644]
tools/perf/util/symbol.c
tools/perf/util/symbol.h
tools/perf/util/top.c
tools/perf/util/trace-event-parse.c
tools/perf/util/trace-event-scripting.c
tools/perf/util/trace-event.h
tools/perf/util/unwind.c [new file with mode: 0644]
tools/perf/util/unwind.h [new file with mode: 0644]
tools/perf/util/util.c
tools/perf/util/util.h
tools/scripts/Makefile.include

diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt
index 2cb2d81a51abe2fdc6114005a07b37398f02f9bd..75da0297330a2a2debb9d380fba4cbe8844ed02b 100644 (file)
@@ -2641,9 +2641,6 @@ bytes respectively. Such letter suffixes can also be entirely omitted.
        smart2=         [HW]
                        Format: <io1>[,<io2>[,...,<io8>]]
 
-       smp-alt-once    [X86-32,SMP] On a hotplug CPU system, only
-                       attempt to substitute SMP alternatives once at boot.
-
        smsc-ircc2.nopnp        [HW] Don't use PNP to discover SMC devices
        smsc-ircc2.ircc_cfg=    [HW] Device configuration I/O port
        smsc-ircc2.ircc_sir=    [HW] SIR base I/O port
diff --git a/MAINTAINERS b/MAINTAINERS
index caec25c4aebd9e5b7c9885d53368f52609e49b07..078d7814ab0d677ea946e98829e7a6e57b9260e1 100644 (file)
@@ -4643,6 +4643,14 @@ L:       alsa-devel@alsa-project.org
 W:     http://www.native-instruments.com
 F:     sound/usb/caiaq/
 
+NATIVE LINUX KVM TOOL
+M:     Pekka Enberg <penberg@kernel.org>
+M:     Sasha Levin <levinsasha928@gmail.com>
+M:     Asias He <asias.hejun@gmail.com>
+L:     kvm@vger.kernel.org
+S:     Maintained
+F:     tools/kvm/
+
 NCP FILESYSTEM
 M:     Petr Vandrovec <petr@vandrovec.name>
 S:     Odd Fixes
diff --git a/Makefile b/Makefile
index fa9cfab148cdf1178f7b719818c99451f75b1407..d1423e7b23afe0958e377bb4542b4b623ffc69e3 100644 (file)
--- a/Makefile
+++ b/Makefile
@@ -609,7 +609,11 @@ KBUILD_CFLAGS      += $(call cc-option, -femit-struct-debug-baseonly)
 endif
 
 ifdef CONFIG_FUNCTION_TRACER
-KBUILD_CFLAGS  += -pg
+ifdef CONFIG_HAVE_FENTRY
+CC_USING_FENTRY        := $(call cc-option, -mfentry -DCC_USING_FENTRY)
+endif
+KBUILD_CFLAGS  += -pg $(CC_USING_FENTRY)
+KBUILD_AFLAGS  += $(CC_USING_FENTRY)
 ifdef CONFIG_DYNAMIC_FTRACE
        ifdef CONFIG_HAVE_C_RECORDMCOUNT
                BUILD_C_RECORDMCOUNT := y
diff --git a/arch/Kconfig b/arch/Kconfig
index 3450115c64373da128ba7be90de61b8fd7d8f160..07db9299ea86e802374eb18176622d630f14fcb8 100644 (file)
@@ -222,6 +222,19 @@ config HAVE_PERF_EVENTS_NMI
          subsystem.  Also has support for calculating CPU cycle events
          to determine how many clock cycles in a given period.
 
+config HAVE_PERF_REGS
+       bool
+       help
+         Support selective register dumps for perf events. This includes
+         bit-mapping of each registers and a unique architecture id.
+
+config HAVE_PERF_USER_STACK_DUMP
+       bool
+       help
+         Support user stack dumps for perf event samples. This needs
+         access to the user stack pointer which is not unified across
+         architectures.
+
 config HAVE_ARCH_JUMP_LABEL
        bool
 
@@ -300,4 +313,7 @@ config MODULES_USE_ELF_REL
          Modules only use ELF REL relocations.  Modules with ELF RELA
          relocations will give an error.
 
+config HAVE_VIRT_CPU_ACCOUNTING
+       bool
+
 source "kernel/gcov/Kconfig"
diff --git a/arch/ia64/Kconfig b/arch/ia64/Kconfig
index 688146466d0dc54117e9aa809c713adcff3926c1..ac5964cf2739762305e549bd68bf2cd770c241ea 100644 (file)
@@ -25,6 +25,7 @@ config IA64
        select HAVE_GENERIC_HARDIRQS
        select HAVE_MEMBLOCK
        select HAVE_MEMBLOCK_NODE_MAP
+       select HAVE_VIRT_CPU_ACCOUNTING
        select ARCH_DISCARD_MEMBLOCK
        select GENERIC_IRQ_PROBE
        select GENERIC_PENDING_IRQ if SMP
@@ -342,17 +343,6 @@ config FORCE_MAX_ZONEORDER
        default "17" if HUGETLB_PAGE
        default "11"
 
-config VIRT_CPU_ACCOUNTING
-       bool "Deterministic task and CPU time accounting"
-       default n
-       help
-         Select this option to enable more accurate task and CPU time
-         accounting.  This is done by reading a CPU counter on each
-         kernel entry and exit and on transitions within the kernel
-         between system, softirq and hardirq state, so there is a
-         small performance impact.
-         If in doubt, say N here.
-
 config SMP
        bool "Symmetric multi-processing support"
        select USE_GENERIC_SMP_HELPERS
diff --git a/arch/ia64/include/asm/switch_to.h b/arch/ia64/include/asm/switch_to.h
index cb2412fcd17f234af87600094507fa230cba51b4..d38c7ea5eea5d129a32132583c2bbcb8a319bcb9 100644 (file)
@@ -30,13 +30,6 @@ extern struct task_struct *ia64_switch_to (void *next_task);
 extern void ia64_save_extra (struct task_struct *task);
 extern void ia64_load_extra (struct task_struct *task);
 
-#ifdef CONFIG_VIRT_CPU_ACCOUNTING
-extern void ia64_account_on_switch (struct task_struct *prev, struct task_struct *next);
-# define IA64_ACCOUNT_ON_SWITCH(p,n) ia64_account_on_switch(p,n)
-#else
-# define IA64_ACCOUNT_ON_SWITCH(p,n)
-#endif
-
 #ifdef CONFIG_PERFMON
   DECLARE_PER_CPU(unsigned long, pfm_syst_info);
 # define PERFMON_IS_SYSWIDE() (__get_cpu_var(pfm_syst_info) & 0x1)
@@ -49,7 +42,6 @@ extern void ia64_account_on_switch (struct task_struct *prev, struct task_struct
         || PERFMON_IS_SYSWIDE())
 
 #define __switch_to(prev,next,last) do {                                                        \
-       IA64_ACCOUNT_ON_SWITCH(prev, next);                                                      \
        if (IA64_HAS_EXTRA_STATE(prev))                                                          \
                ia64_save_extra(prev);                                                           \
        if (IA64_HAS_EXTRA_STATE(next))                                                          \
diff --git a/arch/ia64/kernel/time.c b/arch/ia64/kernel/time.c
index ecc904b33c5f2935fa1285d8f4cdac9515a3a84d..6247197b987782341bdc70465418094019dacb93 100644 (file)
@@ -88,10 +88,10 @@ extern cputime_t cycle_to_cputime(u64 cyc);
  * accumulated times to the current process, and to prepare accounting on
  * the next process.
  */
-void ia64_account_on_switch(struct task_struct *prev, struct task_struct *next)
+void account_switch_vtime(struct task_struct *prev)
 {
        struct thread_info *pi = task_thread_info(prev);
-       struct thread_info *ni = task_thread_info(next);
+       struct thread_info *ni = task_thread_info(current);
        cputime_t delta_stime, delta_utime;
        __u64 now;
 
diff --git a/arch/powerpc/include/asm/time.h b/arch/powerpc/include/asm/time.h
index 3b4b4a8da922fc4801d51bae86fc8e2b8e810108..c1f267694acbecd7f072245f57199faa0594e3c6 100644 (file)
@@ -197,12 +197,6 @@ struct cpu_usage {
 
 DECLARE_PER_CPU(struct cpu_usage, cpu_usage_array);
 
-#if defined(CONFIG_VIRT_CPU_ACCOUNTING)
-#define account_process_vtime(tsk)             account_process_tick(tsk, 0)
-#else
-#define account_process_vtime(tsk)             do { } while (0)
-#endif
-
 extern void secondary_cpu_time_init(void);
 
 DECLARE_PER_CPU(u64, decrementers_next_tb);
diff --git a/arch/powerpc/kernel/process.c b/arch/powerpc/kernel/process.c
index 710f400476deb8461a98a1342c6ee1b669fbe686..d73fa999b47bdc775a1848e5e25012c0316955d0 100644 (file)
@@ -514,9 +514,6 @@ struct task_struct *__switch_to(struct task_struct *prev,
 
        local_irq_save(flags);
 
-       account_system_vtime(current);
-       account_process_vtime(current);
-
        /*
         * We can't take a PMU exception inside _switch() since there is a
         * window where the kernel stack SLB and the kernel stack are out
diff --git a/arch/powerpc/kernel/time.c b/arch/powerpc/kernel/time.c
index be171ee73bf8cd3bef83630fbc031ad4f29410e7..49da7f06e643d6474691764003e4ea0ac2cd6f43 100644 (file)
@@ -366,6 +366,12 @@ void account_process_tick(struct task_struct *tsk, int user_tick)
        account_user_time(tsk, utime, utimescaled);
 }
 
+void account_switch_vtime(struct task_struct *prev)
+{
+       account_system_vtime(prev);
+       account_process_tick(prev, 0);
+}
+
 #else /* ! CONFIG_VIRT_CPU_ACCOUNTING */
 #define calc_cputime_factors()
 #endif
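
Context for the accounting hunks in this merge: the ia64, powerpc and s390 changes all collapse a two-task hook into the single-argument account_switch_vtime(prev). The "next" task can be dropped because the hook now runs after the context switch, when the incoming task is simply `current`, exactly what the ia64 and s390 hunks read via task_thread_info(current). Below is a hedged sketch of the assumed generic call site; the kernel/sched/core.c hunk itself is not shown in this excerpt.

/*
 * Sketch (assumption, not the verbatim patch): finish_task_switch()
 * already runs on the new task's stack, so "current" is the task being
 * switched in and only "prev" must be passed to the arch hook.
 */
static void finish_task_switch(struct rq *rq, struct task_struct *prev)
{
	account_switch_vtime(prev);	/* no-op unless CONFIG_VIRT_CPU_ACCOUNTING */
	finish_arch_switch(prev);
	/* remaining post-switch teardown unchanged */
}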
diff --git a/arch/powerpc/platforms/Kconfig.cputype b/arch/powerpc/platforms/Kconfig.cputype
index 30fd01de6bed5d93cb83fb380787c2a3586ccdd6..72afd2888cad7c462d2d9a8a3ac2fb30ee6fd7bf 100644 (file)
@@ -1,6 +1,7 @@
 config PPC64
        bool "64-bit kernel"
        default n
+       select HAVE_VIRT_CPU_ACCOUNTING
        help
          This option selects whether a 32-bit or a 64-bit kernel
          will be built.
@@ -337,21 +338,6 @@ config PPC_MM_SLICES
        default y if (!PPC_FSL_BOOK3E && PPC64 && HUGETLB_PAGE) || (PPC_STD_MMU_64 && PPC_64K_PAGES)
        default n
 
-config VIRT_CPU_ACCOUNTING
-       bool "Deterministic task and CPU time accounting"
-       depends on PPC64
-       default y
-       help
-         Select this option to enable more accurate task and CPU time
-         accounting.  This is done by reading a CPU counter on each
-         kernel entry and exit and on transitions within the kernel
-         between system, softirq and hardirq state, so there is a
-         small performance impact.  This also enables accounting of
-         stolen time on logically-partitioned systems running on
-         IBM POWER5-based machines.
-
-         If in doubt, say Y here.
-
 config PPC_HAVE_PMU_SUPPORT
        bool
 
diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig
index 99949f6405425f7cb8f06a653fe6133a88dec661..5b29f7ebdd32e4053ebe59a29a16067d8dfdd389 100644 (file)
@@ -49,9 +49,6 @@ config GENERIC_LOCKBREAK
 config PGSTE
        def_bool y if KVM
 
-config VIRT_CPU_ACCOUNTING
-       def_bool y
-
 config ARCH_SUPPORTS_DEBUG_PAGEALLOC
        def_bool y
 
@@ -96,6 +93,8 @@ config S390
        select HAVE_MEMBLOCK
        select HAVE_MEMBLOCK_NODE_MAP
        select HAVE_CMPXCHG_LOCAL
+       select HAVE_VIRT_CPU_ACCOUNTING
+       select VIRT_CPU_ACCOUNTING
        select ARCH_DISCARD_MEMBLOCK
        select BUILDTIME_EXTABLE_SORT
        select ARCH_INLINE_SPIN_TRYLOCK
diff --git a/arch/s390/include/asm/switch_to.h b/arch/s390/include/asm/switch_to.h
index dc4967b0e0560f072fe78ba433fcae8b7b0239b3..f3a9e0f9270451056c674f854d49bbcd0817ea25 100644 (file)
@@ -91,12 +91,8 @@ static inline void restore_access_regs(unsigned int *acrs)
        prev = __switch_to(prev,next);                                  \
 } while (0)
 
-extern void account_vtime(struct task_struct *, struct task_struct *);
-extern void account_tick_vtime(struct task_struct *);
-
 #define finish_arch_switch(prev) do {                                       \
        set_fs(current->thread.mm_segment);                                  \
-       account_vtime(prev, current);                                        \
 } while (0)
 
 #endif /* __ASM_SWITCH_TO_H */
diff --git a/arch/s390/kernel/vtime.c b/arch/s390/kernel/vtime.c
index d100c92cfcf824c434e419ee0d37ab640b810831..cfb21fdca71c70117921175c1302b83fe8cc682a 100644 (file)
@@ -99,7 +99,7 @@ static int do_account_vtime(struct task_struct *tsk, int hardirq_offset)
        return virt_timer_forward(user + system);
 }
 
-void account_vtime(struct task_struct *prev, struct task_struct *next)
+void account_switch_vtime(struct task_struct *prev)
 {
        struct thread_info *ti;
 
@@ -107,7 +107,7 @@ void account_vtime(struct task_struct *prev, struct task_struct *next)
        ti = task_thread_info(prev);
        ti->user_timer = S390_lowcore.user_timer;
        ti->system_timer = S390_lowcore.system_timer;
-       ti = task_thread_info(next);
+       ti = task_thread_info(current);
        S390_lowcore.user_timer = ti->user_timer;
        S390_lowcore.system_timer = ti->system_timer;
 }
diff --git a/arch/tile/include/asm/topology.h b/arch/tile/include/asm/topology.h
index 7a7ce390534f2c65df3c24bdb57c9c7a530e9244..d5e86c9f74fd243b2db944dcd7d2d959eb7c8858 100644 (file)
@@ -69,7 +69,6 @@ static inline const struct cpumask *cpumask_of_node(int node)
                                | 1*SD_BALANCE_FORK                     \
                                | 0*SD_BALANCE_WAKE                     \
                                | 0*SD_WAKE_AFFINE                      \
-                               | 0*SD_PREFER_LOCAL                     \
                                | 0*SD_SHARE_CPUPOWER                   \
                                | 0*SD_SHARE_PKG_RESOURCES              \
                                | 0*SD_SERIALIZE                        \
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 01726cbcc73bd23a7d318201247e2b1b8c909648..3edb9442df40f72ab8ac2dcb520b1c256a3ab0fd 100644 (file)
@@ -36,6 +36,7 @@ config X86
        select HAVE_KRETPROBES
        select HAVE_OPTPROBES
        select HAVE_FTRACE_MCOUNT_RECORD
+       select HAVE_FENTRY if X86_64
        select HAVE_C_RECORDMCOUNT
        select HAVE_DYNAMIC_FTRACE
        select HAVE_FUNCTION_TRACER
@@ -60,6 +61,8 @@ config X86
        select HAVE_MIXED_BREAKPOINTS_REGS
        select PERF_EVENTS
        select HAVE_PERF_EVENTS_NMI
+       select HAVE_PERF_REGS
+       select HAVE_PERF_USER_STACK_DUMP
        select ANON_INODES
        select HAVE_ALIGNED_STRUCT_PAGE if SLUB && !M386
        select HAVE_CMPXCHG_LOCAL if !M386
@@ -551,6 +554,37 @@ config SCHED_OMIT_FRAME_POINTER
 
          If in doubt, say "Y".
 
+config KVMTOOL_TEST_ENABLE
+       bool "Enable options to create a bootable tools/kvm/ kernel"
+       select NET
+       select NETDEVICES
+       select PCI
+       select BLOCK
+       select BLK_DEV
+       select NETWORK_FILESYSTEMS
+       select INET
+       select EXPERIMENTAL
+       select SERIAL_8250
+       select SERIAL_8250_CONSOLE
+       select IP_PNP
+       select IP_PNP_DHCP
+       select BINFMT_ELF
+       select PCI_MSI
+       select HAVE_ARCH_KGDB
+       select DEBUG_KERNEL
+       select KGDB
+       select KGDB_SERIAL_CONSOLE
+       select VIRTUALIZATION
+       select VIRTIO
+       select VIRTIO_RING
+       select VIRTIO_PCI
+       select VIRTIO_BLK
+       select VIRTIO_CONSOLE
+       select VIRTIO_NET
+       select 9P_FS
+       select NET_9P
+       select NET_9P_VIRTIO
+
 menuconfig PARAVIRT_GUEST
        bool "Paravirtualized guest support"
        ---help---
@@ -984,25 +1018,25 @@ config X86_REBOOTFIXUPS
          Say N otherwise.
 
 config MICROCODE
-       tristate "/dev/cpu/microcode - microcode support"
+       tristate "CPU microcode loading support"
        select FW_LOADER
        ---help---
+
          If you say Y here, you will be able to update the microcode on
          certain Intel and AMD processors. The Intel support is for the
-         IA32 family, e.g. Pentium Pro, Pentium II, Pentium III,
-         Pentium 4, Xeon etc. The AMD support is for family 0x10 and
-         0x11 processors, e.g. Opteron, Phenom and Turion 64 Ultra.
-         You will obviously need the actual microcode binary data itself
-         which is not shipped with the Linux kernel.
+         IA32 family, e.g. Pentium Pro, Pentium II, Pentium III, Pentium 4,
+         Xeon etc. The AMD support is for families 0x10 and later. You will
+         obviously need the actual microcode binary data itself which is not
+         shipped with the Linux kernel.
 
          This option selects the general module only, you need to select
          at least one vendor specific module as well.
 
-         To compile this driver as a module, choose M here: the
-         module will be called microcode.
+         To compile this driver as a module, choose M here: the module
+         will be called microcode.
 
 config MICROCODE_INTEL
-       bool "Intel microcode patch loading support"
+       bool "Intel microcode loading support"
        depends on MICROCODE
        default MICROCODE
        select FW_LOADER
@@ -1015,7 +1049,7 @@ config MICROCODE_INTEL
          <http://www.urbanmyth.org/microcode/>.
 
 config MICROCODE_AMD
-       bool "AMD microcode patch loading support"
+       bool "AMD microcode loading support"
        depends on MICROCODE
        select FW_LOADER
        ---help---
diff --git a/arch/x86/boot/header.S b/arch/x86/boot/header.S
index b4e15dd6786a166b1fc16b0d24a1bb4f78a7295e..2a017441b8b2ebc665a24a001f5d635f7dbc9955 100644 (file)
@@ -32,10 +32,6 @@ SYSSEG               = 0x1000                /* historical load address >> 4 */
 #define SVGA_MODE ASK_VGA
 #endif
 
-#ifndef RAMDISK
-#define RAMDISK 0
-#endif
-
 #ifndef ROOT_RDONLY
 #define ROOT_RDONLY 1
 #endif
index 70780689599acf830b322be2cad0f835ac2bac03..444704c8e186b54472e67665b13d7490c749d886 100644 (file)
@@ -60,7 +60,7 @@ extern void alternatives_smp_module_add(struct module *mod, char *name,
                                        void *locks, void *locks_end,
                                        void *text, void *text_end);
 extern void alternatives_smp_module_del(struct module *mod);
-extern void alternatives_smp_switch(int smp);
+extern void alternatives_enable_smp(void);
 extern int alternatives_text_reserved(void *start, void *end);
 extern bool skip_smp_alternatives;
 #else
@@ -68,7 +68,7 @@ static inline void alternatives_smp_module_add(struct module *mod, char *name,
                                               void *locks, void *locks_end,
                                               void *text, void *text_end) {}
 static inline void alternatives_smp_module_del(struct module *mod) {}
-static inline void alternatives_smp_switch(int smp) {}
+static inline void alternatives_enable_smp(void) {}
 static inline int alternatives_text_reserved(void *start, void *end)
 {
        return 0;
diff --git a/arch/x86/include/asm/ftrace.h b/arch/x86/include/asm/ftrace.h
index b0767bc08740594380b6bbc8d734984b54522be4..9a25b522d37799adf0b7a00d898ba30cb70b08b7 100644 (file)
@@ -3,38 +3,54 @@
 
 #ifdef __ASSEMBLY__
 
-       .macro MCOUNT_SAVE_FRAME
-       /* taken from glibc */
-       subq $0x38, %rsp
-       movq %rax, (%rsp)
-       movq %rcx, 8(%rsp)
-       movq %rdx, 16(%rsp)
-       movq %rsi, 24(%rsp)
-       movq %rdi, 32(%rsp)
-       movq %r8, 40(%rsp)
-       movq %r9, 48(%rsp)
+       /* skip is set if the stack was already partially adjusted */
+       .macro MCOUNT_SAVE_FRAME skip=0
+        /*
+         * We add enough stack to save all regs.
+         */
+       subq $(SS+8-\skip), %rsp
+       movq %rax, RAX(%rsp)
+       movq %rcx, RCX(%rsp)
+       movq %rdx, RDX(%rsp)
+       movq %rsi, RSI(%rsp)
+       movq %rdi, RDI(%rsp)
+       movq %r8, R8(%rsp)
+       movq %r9, R9(%rsp)
+        /* Move RIP to its proper location */
+       movq SS+8(%rsp), %rdx
+       movq %rdx, RIP(%rsp)
        .endm
 
-       .macro MCOUNT_RESTORE_FRAME
-       movq 48(%rsp), %r9
-       movq 40(%rsp), %r8
-       movq 32(%rsp), %rdi
-       movq 24(%rsp), %rsi
-       movq 16(%rsp), %rdx
-       movq 8(%rsp), %rcx
-       movq (%rsp), %rax
-       addq $0x38, %rsp
+       .macro MCOUNT_RESTORE_FRAME skip=0
+       movq R9(%rsp), %r9
+       movq R8(%rsp), %r8
+       movq RDI(%rsp), %rdi
+       movq RSI(%rsp), %rsi
+       movq RDX(%rsp), %rdx
+       movq RCX(%rsp), %rcx
+       movq RAX(%rsp), %rax
+       addq $(SS+8-\skip), %rsp
        .endm
 
 #endif
 
 #ifdef CONFIG_FUNCTION_TRACER
-#define MCOUNT_ADDR            ((long)(mcount))
+#ifdef CC_USING_FENTRY
+# define MCOUNT_ADDR           ((long)(__fentry__))
+#else
+# define MCOUNT_ADDR           ((long)(mcount))
+#endif
 #define MCOUNT_INSN_SIZE       5 /* sizeof mcount call */
 
+#ifdef CONFIG_DYNAMIC_FTRACE
+#define ARCH_SUPPORTS_FTRACE_OPS 1
+#define ARCH_SUPPORTS_FTRACE_SAVE_REGS
+#endif
+
 #ifndef __ASSEMBLY__
 extern void mcount(void);
 extern atomic_t modifying_ftrace_code;
+extern void __fentry__(void);
 
 static inline unsigned long ftrace_call_adjust(unsigned long addr)
 {
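
The practical difference between the two entry hooks: with plain -pg, gcc places the call to mcount after the function prologue, whereas -pg -mfentry (gcc 4.6+ on x86-64, which is what HAVE_FENTRY gates) emits "call __fentry__" as the very first instruction. The return address is therefore still on top of the stack, which is what the reworked MCOUNT_SAVE_FRAME above exploits when it copies SS+8(%rsp) into RIP(%rsp). Below is a toy userspace demonstration of the hook mechanism; note that real hooks live in assembly (entry_64.S) because a C handler may clobber the instrumented function's argument registers.

/* Build: gcc -O1 -pg -mfentry demo.c -o demo   (gcc >= 4.6, x86-64;
 * the profiled link may drop an empty gmon.out, harmless here). */
#include <stdio.h>

/* Keep the hooks themselves uninstrumented, like the kernel's "notrace". */
#define notrace __attribute__((no_instrument_function))

static unsigned long entries;
static volatile int sink;

notrace void __fentry__(void) { entries++; }	/* called with -mfentry */
notrace void mcount(void)     { entries++; }	/* called with plain -pg */

static __attribute__((noinline)) void work(void) { sink++; }

int main(void)
{
	work();
	/* main() and work() are both instrumented, so expect 2 */
	printf("instrumented entries: %lu\n", entries);
	return 0;
}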
diff --git a/arch/x86/include/asm/kprobes.h b/arch/x86/include/asm/kprobes.h
index 54788253915739a2a07faa87f8186ab261e0a065..d3ddd17405d07b1e288828a949ac9ecb363c3c2e 100644 (file)
@@ -27,6 +27,7 @@
 #include <asm/insn.h>
 
 #define  __ARCH_WANT_KPROBES_INSN_SLOT
+#define  ARCH_SUPPORTS_KPROBES_ON_FTRACE
 
 struct pt_regs;
 struct kprobe;
diff --git a/arch/x86/include/asm/microcode.h b/arch/x86/include/asm/microcode.h
index 4ebe157bf73db184b4e73df8f391db399fd874a7..43d921b4752c839c50e86e9ca2e17b2cd10866ac 100644 (file)
@@ -15,8 +15,8 @@ struct microcode_ops {
        enum ucode_state (*request_microcode_user) (int cpu,
                                const void __user *buf, size_t size);
 
-       enum ucode_state (*request_microcode_fw) (int cpu,
-                               struct device *device);
+       enum ucode_state (*request_microcode_fw) (int cpu, struct device *,
+                                                 bool refresh_fw);
 
        void (*microcode_fini_cpu) (int cpu);
 
@@ -49,12 +49,6 @@ static inline struct microcode_ops * __init init_intel_microcode(void)
 #ifdef CONFIG_MICROCODE_AMD
 extern struct microcode_ops * __init init_amd_microcode(void);
 extern void __exit exit_amd_microcode(void);
-
-static inline void get_ucode_data(void *to, const u8 *from, size_t n)
-{
-       memcpy(to, from, n);
-}
-
 #else
 static inline struct microcode_ops * __init init_amd_microcode(void)
 {
diff --git a/arch/x86/include/asm/perf_event.h b/arch/x86/include/asm/perf_event.h
index cb4e43bce98ab46e262f3b235765e5ebd6270ddd..4fabcdf1cfa74b6f6c7c04327e43fb5c7c1466ac 100644 (file)
@@ -262,4 +262,6 @@ static inline void perf_check_microcode(void) { }
  static inline void amd_pmu_disable_virt(void) { }
 #endif
 
+#define arch_perf_out_copy_user copy_from_user_nmi
+
 #endif /* _ASM_X86_PERF_EVENT_H */
diff --git a/arch/x86/include/asm/perf_regs.h b/arch/x86/include/asm/perf_regs.h
new file mode 100644 (file)
index 0000000..3f2207b
--- /dev/null
@@ -0,0 +1,33 @@
+#ifndef _ASM_X86_PERF_REGS_H
+#define _ASM_X86_PERF_REGS_H
+
+enum perf_event_x86_regs {
+       PERF_REG_X86_AX,
+       PERF_REG_X86_BX,
+       PERF_REG_X86_CX,
+       PERF_REG_X86_DX,
+       PERF_REG_X86_SI,
+       PERF_REG_X86_DI,
+       PERF_REG_X86_BP,
+       PERF_REG_X86_SP,
+       PERF_REG_X86_IP,
+       PERF_REG_X86_FLAGS,
+       PERF_REG_X86_CS,
+       PERF_REG_X86_SS,
+       PERF_REG_X86_DS,
+       PERF_REG_X86_ES,
+       PERF_REG_X86_FS,
+       PERF_REG_X86_GS,
+       PERF_REG_X86_R8,
+       PERF_REG_X86_R9,
+       PERF_REG_X86_R10,
+       PERF_REG_X86_R11,
+       PERF_REG_X86_R12,
+       PERF_REG_X86_R13,
+       PERF_REG_X86_R14,
+       PERF_REG_X86_R15,
+
+       PERF_REG_X86_32_MAX = PERF_REG_X86_GS + 1,
+       PERF_REG_X86_64_MAX = PERF_REG_X86_R15 + 1,
+};
+#endif /* _ASM_X86_PERF_REGS_H */
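
These indices define the bit positions userspace passes in the new sample_regs_user mask when requesting the PERF_SAMPLE_REGS_USER sample type added by this series. A hedged sketch of the consumer side follows; it assumes headers from a kernel with this series applied, and mirrors the enum values locally because this header was not yet exported to userspace at this point.

/* Open a perf event that samples user-space IP/SP/BP on each overflow. */
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <sys/syscall.h>
#include <linux/perf_event.h>

/* Bit positions mirror enum perf_event_x86_regs above. */
#define REG_BP	6
#define REG_SP	7
#define REG_IP	8

int main(void)
{
	struct perf_event_attr attr;
	int fd;

	memset(&attr, 0, sizeof(attr));
	attr.size = sizeof(attr);
	attr.type = PERF_TYPE_HARDWARE;
	attr.config = PERF_COUNT_HW_CPU_CYCLES;
	attr.sample_period = 100000;
	attr.sample_type = PERF_SAMPLE_IP | PERF_SAMPLE_REGS_USER;
	attr.sample_regs_user = (1ULL << REG_IP) | (1ULL << REG_SP) |
				(1ULL << REG_BP);

	fd = syscall(__NR_perf_event_open, &attr, 0 /* self */,
		     -1 /* any cpu */, -1 /* no group */, 0);
	if (fd < 0) {
		perror("perf_event_open");
		return 1;
	}
	/* samples in the mmap'ed ring buffer now carry the three registers */
	close(fd);
	return 0;
}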
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
index 8215e5652d9747b6a7eb5302abca77446933acf3..8d7a619718b5fac1b245985cbc185c108c5a4a94 100644 (file)
@@ -100,6 +100,8 @@ obj-$(CONFIG_SWIOTLB)                       += pci-swiotlb.o
 obj-$(CONFIG_OF)                       += devicetree.o
 obj-$(CONFIG_UPROBES)                  += uprobes.o
 
+obj-$(CONFIG_PERF_EVENTS)              += perf_regs.o
+
 ###
 # 64 bit specific files
 ifeq ($(CONFIG_X86_64),y)
diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c
index ced4534baed574f7596b014f979748b1eedc2653..357475a87b52edb961af50198bb327d59b3b5a5f 100644 (file)
 
 #define MAX_PATCH_LEN (255-1)
 
-#ifdef CONFIG_HOTPLUG_CPU
-static int smp_alt_once;
-
-static int __init bootonly(char *str)
-{
-       smp_alt_once = 1;
-       return 1;
-}
-__setup("smp-alt-boot", bootonly);
-#else
-#define smp_alt_once 1
-#endif
-
 static int __initdata_or_module debug_alternative;
 
 static int __init debug_alt(char *str)
@@ -326,9 +313,6 @@ static void alternatives_smp_unlock(const s32 *start, const s32 *end,
 {
        const s32 *poff;
 
-       if (noreplace_smp)
-               return;
-
        mutex_lock(&text_mutex);
        for (poff = start; poff < end; poff++) {
                u8 *ptr = (u8 *)poff + *poff;
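
For context on this hunk: each s32 in the .smp_locks section is a self-relative offset to a "lock" prefix byte, recorded at build time by the LOCK_PREFIX macro, quoted roughly from the arch/x86/include/asm/alternative.h of this era:

#ifdef CONFIG_SMP
#define LOCK_PREFIX_HERE \
		".pushsection .smp_locks,\"a\"\n"	\
		".balign 4\n"				\
		".long 671f - .\n" /* offset to the lock prefix */ \
		".popsection\n"				\
		"671:"
#define LOCK_PREFIX LOCK_PREFIX_HERE "\n\tlock; "
#else
#define LOCK_PREFIX ""
#endif

alternatives_smp_unlock() walks those offsets and overwrites each lock byte with a NOP when only one CPU can ever run; dropping the noreplace_smp early-out here is safe because the rework gates its remaining callers on that flag instead.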
@@ -359,7 +343,7 @@ struct smp_alt_module {
 };
 static LIST_HEAD(smp_alt_modules);
 static DEFINE_MUTEX(smp_alt);
-static int smp_mode = 1;       /* protected by smp_alt */
+static bool uniproc_patched = false;   /* protected by smp_alt */
 
 void __init_or_module alternatives_smp_module_add(struct module *mod,
                                                  char *name,
@@ -368,19 +352,18 @@ void __init_or_module alternatives_smp_module_add(struct module *mod,
 {
        struct smp_alt_module *smp;
 
-       if (noreplace_smp)
-               return;
+       mutex_lock(&smp_alt);
+       if (!uniproc_patched)
+               goto unlock;
 
-       if (smp_alt_once) {
-               if (boot_cpu_has(X86_FEATURE_UP))
-                       alternatives_smp_unlock(locks, locks_end,
-                                               text, text_end);
-               return;
-       }
+       if (num_possible_cpus() == 1)
+               /* Don't bother remembering, we'll never have to undo it. */
+               goto smp_unlock;
 
        smp = kzalloc(sizeof(*smp), GFP_KERNEL);
        if (NULL == smp)
-               return; /* we'll run the (safe but slow) SMP code then ... */
+               /* we'll run the (safe but slow) SMP code then ... */
+               goto unlock;
 
        smp->mod        = mod;
        smp->name       = name;
@@ -392,11 +375,10 @@ void __init_or_module alternatives_smp_module_add(struct module *mod,
                __func__, smp->locks, smp->locks_end,
                smp->text, smp->text_end, smp->name);
 
-       mutex_lock(&smp_alt);
        list_add_tail(&smp->next, &smp_alt_modules);
-       if (boot_cpu_has(X86_FEATURE_UP))
-               alternatives_smp_unlock(smp->locks, smp->locks_end,
-                                       smp->text, smp->text_end);
+smp_unlock:
+       alternatives_smp_unlock(locks, locks_end, text, text_end);
+unlock:
        mutex_unlock(&smp_alt);
 }
 
@@ -404,24 +386,18 @@ void __init_or_module alternatives_smp_module_del(struct module *mod)
 {
        struct smp_alt_module *item;
 
-       if (smp_alt_once || noreplace_smp)
-               return;
-
        mutex_lock(&smp_alt);
        list_for_each_entry(item, &smp_alt_modules, next) {
                if (mod != item->mod)
                        continue;
                list_del(&item->next);
-               mutex_unlock(&smp_alt);
-               DPRINTK("%s: %s\n", __func__, item->name);
                kfree(item);
-               return;
+               break;
        }
        mutex_unlock(&smp_alt);
 }
 
-bool skip_smp_alternatives;
-void alternatives_smp_switch(int smp)
+void alternatives_enable_smp(void)
 {
        struct smp_alt_module *mod;
 
@@ -436,34 +412,21 @@ void alternatives_smp_switch(int smp)
        pr_info("lockdep: fixing up alternatives\n");
 #endif
 
-       if (noreplace_smp || smp_alt_once || skip_smp_alternatives)
-               return;
-       BUG_ON(!smp && (num_online_cpus() > 1));
+       /* Why bother if there are no other CPUs? */
+       BUG_ON(num_possible_cpus() == 1);
 
        mutex_lock(&smp_alt);
 
-       /*
-        * Avoid unnecessary switches because it forces JIT based VMs to
-        * throw away all cached translations, which can be quite costly.
-        */
-       if (smp == smp_mode) {
-               /* nothing */
-       } else if (smp) {
+       if (uniproc_patched) {
                pr_info("switching to SMP code\n");
+               BUG_ON(num_online_cpus() != 1);
                clear_cpu_cap(&boot_cpu_data, X86_FEATURE_UP);
                clear_cpu_cap(&cpu_data(0), X86_FEATURE_UP);
                list_for_each_entry(mod, &smp_alt_modules, next)
                        alternatives_smp_lock(mod->locks, mod->locks_end,
                                              mod->text, mod->text_end);
-       } else {
-               pr_info("switching to UP code\n");
-               set_cpu_cap(&boot_cpu_data, X86_FEATURE_UP);
-               set_cpu_cap(&cpu_data(0), X86_FEATURE_UP);
-               list_for_each_entry(mod, &smp_alt_modules, next)
-                       alternatives_smp_unlock(mod->locks, mod->locks_end,
-                                               mod->text, mod->text_end);
+               uniproc_patched = false;
        }
-       smp_mode = smp;
        mutex_unlock(&smp_alt);
 }
 
@@ -540,40 +503,22 @@ void __init alternative_instructions(void)
 
        apply_alternatives(__alt_instructions, __alt_instructions_end);
 
-       /* switch to patch-once-at-boottime-only mode and free the
-        * tables in case we know the number of CPUs will never ever
-        * change */
-#ifdef CONFIG_HOTPLUG_CPU
-       if (num_possible_cpus() < 2)
-               smp_alt_once = 1;
-#endif
-
 #ifdef CONFIG_SMP
-       if (smp_alt_once) {
-               if (1 == num_possible_cpus()) {
-                       pr_info("switching to UP code\n");
-                       set_cpu_cap(&boot_cpu_data, X86_FEATURE_UP);
-                       set_cpu_cap(&cpu_data(0), X86_FEATURE_UP);
-
-                       alternatives_smp_unlock(__smp_locks, __smp_locks_end,
-                                               _text, _etext);
-               }
-       } else {
+       /* Patch to UP if other cpus not imminent. */
+       if (!noreplace_smp && (num_present_cpus() == 1 || setup_max_cpus <= 1)) {
+               uniproc_patched = true;
                alternatives_smp_module_add(NULL, "core kernel",
                                            __smp_locks, __smp_locks_end,
                                            _text, _etext);
-
-               /* Only switch to UP mode if we don't immediately boot others */
-               if (num_present_cpus() == 1 || setup_max_cpus <= 1)
-                       alternatives_smp_switch(0);
        }
-#endif
-       apply_paravirt(__parainstructions, __parainstructions_end);
 
-       if (smp_alt_once)
+       if (!uniproc_patched || num_possible_cpus() == 1)
                free_init_pages("SMP alternatives",
                                (unsigned long)__smp_locks,
                                (unsigned long)__smp_locks_end);
+#endif
+
+       apply_paravirt(__parainstructions, __parainstructions_end);
 
        restart_nmi();
 }
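
With this rework the patching is one-way: the kernel boots UP-patched when possible and is flipped to SMP code the first time another CPU is actually brought up, never back, so JIT-based VMs are no longer forced to throw away cached translations on CPU unplug. A hedged sketch of the assumed caller; the arch/x86/kernel/smpboot.c hunk is not shown in this excerpt.

/*
 * Sketch (assumption): CPU bring-up invokes the one-way switch.  The
 * call is effectively idempotent, it only patches while uniproc_patched
 * is still set.
 */
int native_cpu_up(unsigned int cpu, struct task_struct *tidle)
{
	alternatives_enable_smp();
	/* ... the actual bring-up (do_boot_cpu() etc.) follows ... */
	return 0;
}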
diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c
index 9d92e19039f05f84b4c172de7c1c243a4dad72e0..f7e98a2c0d123ae0ebe7f61f1521f7dd7090a335 100644 (file)
@@ -737,6 +737,72 @@ static unsigned int __cpuinit amd_size_cache(struct cpuinfo_x86 *c,
 }
 #endif
 
+static void __cpuinit cpu_set_tlb_flushall_shift(struct cpuinfo_x86 *c)
+{
+       if (!cpu_has_invlpg)
+               return;
+
+       tlb_flushall_shift = 5;
+
+       if (c->x86 <= 0x11)
+               tlb_flushall_shift = 4;
+}
+
+static void __cpuinit cpu_detect_tlb_amd(struct cpuinfo_x86 *c)
+{
+       u32 ebx, eax, ecx, edx;
+       u16 mask = 0xfff;
+
+       if (c->x86 < 0xf)
+               return;
+
+       if (c->extended_cpuid_level < 0x80000006)
+               return;
+
+       cpuid(0x80000006, &eax, &ebx, &ecx, &edx);
+
+       tlb_lld_4k[ENTRIES] = (ebx >> 16) & mask;
+       tlb_lli_4k[ENTRIES] = ebx & mask;
+
+       /*
+        * K8 doesn't have 2M/4M entries in the L2 TLB so read out the L1 TLB
+        * characteristics from the CPUID function 0x80000005 instead.
+        */
+       if (c->x86 == 0xf) {
+               cpuid(0x80000005, &eax, &ebx, &ecx, &edx);
+               mask = 0xff;
+       }
+
+       /* Handle DTLB 2M and 4M sizes, fall back to L1 if L2 is disabled */
+       if (!((eax >> 16) & mask)) {
+               u32 a, b, c, d;
+
+               cpuid(0x80000005, &a, &b, &c, &d);
+               tlb_lld_2m[ENTRIES] = (a >> 16) & 0xff;
+       } else {
+               tlb_lld_2m[ENTRIES] = (eax >> 16) & mask;
+       }
+
+       /* a 4M entry uses two 2M entries */
+       tlb_lld_4m[ENTRIES] = tlb_lld_2m[ENTRIES] >> 1;
+
+       /* Handle ITLB 2M and 4M sizes, fall back to L1 if L2 is disabled */
+       if (!(eax & mask)) {
+               /* Erratum 658 */
+               if (c->x86 == 0x15 && c->x86_model <= 0x1f) {
+                       tlb_lli_2m[ENTRIES] = 1024;
+               } else {
+                       cpuid(0x80000005, &eax, &ebx, &ecx, &edx);
+                       tlb_lli_2m[ENTRIES] = eax & 0xff;
+               }
+       } else
+               tlb_lli_2m[ENTRIES] = eax & mask;
+
+       tlb_lli_4m[ENTRIES] = tlb_lli_2m[ENTRIES] >> 1;
+
+       cpu_set_tlb_flushall_shift(c);
+}
+
 static const struct cpu_dev __cpuinitconst amd_cpu_dev = {
        .c_vendor       = "AMD",
        .c_ident        = { "AuthenticAMD" },
@@ -756,6 +822,7 @@ static const struct cpu_dev __cpuinitconst amd_cpu_dev = {
        .c_size_cache   = amd_size_cache,
 #endif
        .c_early_init   = early_init_amd,
+       .c_detect_tlb   = cpu_detect_tlb_amd,
        .c_bsp_init     = bsp_init_amd,
        .c_init         = init_amd,
        .c_x86_vendor   = X86_VENDOR_AMD,
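
The extended CPUID leaves parsed above are readable from unprivileged userspace as well, so the parsing can be sanity-checked with a small standalone program (AMD field layout as in the hunk: L2 dTLB 4K-page entries in EBX[27:16], L2 iTLB 4K-page entries in EBX[11:0]):

/* Build: gcc tlbinfo.c -o tlbinfo   (uses gcc's cpuid.h intrinsics) */
#include <stdio.h>
#include <cpuid.h>

int main(void)
{
	unsigned int eax, ebx, ecx, edx;

	/* __get_cpuid() returns 0 if leaf 0x80000006 is unsupported */
	if (!__get_cpuid(0x80000006, &eax, &ebx, &ecx, &edx))
		return 1;

	printf("L2 dTLB, 4K pages: %u entries\n", (ebx >> 16) & 0xfff);
	printf("L2 iTLB, 4K pages: %u entries\n", ebx & 0xfff);
	return 0;
}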
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index a5fbc3c5fccc5e60d61d9ec8f1d3f9f60a2a0467..2ea24da6f7fbbb5b59d1b8782d2c0436a70c21cd 100644 (file)
@@ -476,7 +476,7 @@ void __cpuinit cpu_detect_tlb(struct cpuinfo_x86 *c)
 
        printk(KERN_INFO "Last level iTLB entries: 4KB %d, 2MB %d, 4MB %d\n" \
                "Last level dTLB entries: 4KB %d, 2MB %d, 4MB %d\n"          \
-               "tlb_flushall_shift is 0x%x\n",
+               "tlb_flushall_shift: %d\n",
                tlb_lli_4k[ENTRIES], tlb_lli_2m[ENTRIES],
                tlb_lli_4m[ENTRIES], tlb_lld_4k[ENTRIES],
                tlb_lld_2m[ENTRIES], tlb_lld_4m[ENTRIES],
@@ -942,8 +942,7 @@ void __init identify_boot_cpu(void)
 #else
        vgetcpu_set_mode();
 #endif
-       if (boot_cpu_data.cpuid_level >= 2)
-               cpu_detect_tlb(&boot_cpu_data);
+       cpu_detect_tlb(&boot_cpu_data);
 }
 
 void __cpuinit identify_secondary_cpu(struct cpuinfo_x86 *c)
diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c
index 0a4ce2980a5a33e90bea05599a015a0c1381bf23..198e019a531af8f26c9c094560d87b325998428b 100644 (file)
@@ -648,6 +648,10 @@ static void __cpuinit intel_detect_tlb(struct cpuinfo_x86 *c)
        int i, j, n;
        unsigned int regs[4];
        unsigned char *desc = (unsigned char *)regs;
+
+       if (c->cpuid_level < 2)
+               return;
+
        /* Number of times to iterate */
        n = cpuid_eax(2) & 0xFF;
 
diff --git a/arch/x86/kernel/cpu/mcheck/mce-inject.c b/arch/x86/kernel/cpu/mcheck/mce-inject.c
index fc4beb3935771eab1b404b3333cdb7842dac7c5c..ddc72f8393321de0ca989d2476f7ccd96eed3639 100644 (file)
@@ -78,6 +78,7 @@ static void raise_exception(struct mce *m, struct pt_regs *pregs)
 }
 
 static cpumask_var_t mce_inject_cpumask;
+static DEFINE_MUTEX(mce_inject_mutex);
 
 static int mce_raise_notify(unsigned int cmd, struct pt_regs *regs)
 {
@@ -194,7 +195,11 @@ static void raise_mce(struct mce *m)
                put_online_cpus();
        } else
 #endif
+       {
+               preempt_disable();
                raise_local();
+               preempt_enable();
+       }
 }
 
 /* Error injection interface */
@@ -225,7 +230,10 @@ static ssize_t mce_write(struct file *filp, const char __user *ubuf,
         * so do it a jiffie or two later everywhere.
         */
        schedule_timeout(2);
+
+       mutex_lock(&mce_inject_mutex);
        raise_mce(&m);
+       mutex_unlock(&mce_inject_mutex);
        return usize;
 }
 
diff --git a/arch/x86/kernel/cpu/mcheck/mce-internal.h b/arch/x86/kernel/cpu/mcheck/mce-internal.h
index ed44c8a65858623b64b313fe17b9e9f9d88a28bc..6a05c1d327a9627819729c83a762f2c3fe9bc923 100644 (file)
@@ -28,6 +28,18 @@ extern int mce_ser;
 
 extern struct mce_bank *mce_banks;
 
+#ifdef CONFIG_X86_MCE_INTEL
+unsigned long mce_intel_adjust_timer(unsigned long interval);
+void mce_intel_cmci_poll(void);
+void mce_intel_hcpu_update(unsigned long cpu);
+#else
+# define mce_intel_adjust_timer mce_adjust_timer_default
+static inline void mce_intel_cmci_poll(void) { }
+static inline void mce_intel_hcpu_update(unsigned long cpu) { }
+#endif
+
+void mce_timer_kick(unsigned long interval);
+
 #ifdef CONFIG_ACPI_APEI
 int apei_write_mce(struct mce *m);
 ssize_t apei_read_mce(struct mce *m, u64 *record_id);
diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c
index 292d0258311c82d04c5ec0aeab43924d00c669b4..c311122ea838301781d8d5e41723a37cf0e68dde 100644 (file)
@@ -1266,6 +1266,14 @@ static unsigned long check_interval = 5 * 60; /* 5 minutes */
 static DEFINE_PER_CPU(unsigned long, mce_next_interval); /* in jiffies */
 static DEFINE_PER_CPU(struct timer_list, mce_timer);
 
+static unsigned long mce_adjust_timer_default(unsigned long interval)
+{
+       return interval;
+}
+
+static unsigned long (*mce_adjust_timer)(unsigned long interval) =
+       mce_adjust_timer_default;
+
 static void mce_timer_fn(unsigned long data)
 {
        struct timer_list *t = &__get_cpu_var(mce_timer);
@@ -1276,6 +1284,7 @@ static void mce_timer_fn(unsigned long data)
        if (mce_available(__this_cpu_ptr(&cpu_info))) {
                machine_check_poll(MCP_TIMESTAMP,
                                &__get_cpu_var(mce_poll_banks));
+               mce_intel_cmci_poll();
        }
 
        /*
@@ -1283,14 +1292,38 @@ static void mce_timer_fn(unsigned long data)
         * polling interval, otherwise increase the polling interval.
         */
        iv = __this_cpu_read(mce_next_interval);
-       if (mce_notify_irq())
+       if (mce_notify_irq()) {
                iv = max(iv / 2, (unsigned long) HZ/100);
-       else
+       } else {
                iv = min(iv * 2, round_jiffies_relative(check_interval * HZ));
+               iv = mce_adjust_timer(iv);
+       }
        __this_cpu_write(mce_next_interval, iv);
+       /* Might have become 0 after CMCI storm subsided */
+       if (iv) {
+               t->expires = jiffies + iv;
+               add_timer_on(t, smp_processor_id());
+       }
+}
 
-       t->expires = jiffies + iv;
-       add_timer_on(t, smp_processor_id());
+/*
+ * Ensure that the timer is firing in @interval from now.
+ */
+void mce_timer_kick(unsigned long interval)
+{
+       struct timer_list *t = &__get_cpu_var(mce_timer);
+       unsigned long when = jiffies + interval;
+       unsigned long iv = __this_cpu_read(mce_next_interval);
+
+       if (timer_pending(t)) {
+               if (time_before(when, t->expires))
+                       mod_timer_pinned(t, when);
+       } else {
+               t->expires = round_jiffies(when);
+               add_timer_on(t, smp_processor_id());
+       }
+       if (interval < iv)
+               __this_cpu_write(mce_next_interval, interval);
 }
 
 /* Must not be called in IRQ context where del_timer_sync() can deadlock */
@@ -1585,6 +1618,7 @@ static void __mcheck_cpu_init_vendor(struct cpuinfo_x86 *c)
        switch (c->x86_vendor) {
        case X86_VENDOR_INTEL:
                mce_intel_feature_init(c);
+               mce_adjust_timer = mce_intel_adjust_timer;
                break;
        case X86_VENDOR_AMD:
                mce_amd_feature_init(c);
@@ -1594,23 +1628,28 @@ static void __mcheck_cpu_init_vendor(struct cpuinfo_x86 *c)
        }
 }
 
-static void __mcheck_cpu_init_timer(void)
+static void mce_start_timer(unsigned int cpu, struct timer_list *t)
 {
-       struct timer_list *t = &__get_cpu_var(mce_timer);
-       unsigned long iv = check_interval * HZ;
+       unsigned long iv = mce_adjust_timer(check_interval * HZ);
 
-       setup_timer(t, mce_timer_fn, smp_processor_id());
+       __this_cpu_write(mce_next_interval, iv);
 
-       if (mce_ignore_ce)
+       if (mce_ignore_ce || !iv)
                return;
 
-       __this_cpu_write(mce_next_interval, iv);
-       if (!iv)
-               return;
        t->expires = round_jiffies(jiffies + iv);
        add_timer_on(t, smp_processor_id());
 }
 
+static void __mcheck_cpu_init_timer(void)
+{
+       struct timer_list *t = &__get_cpu_var(mce_timer);
+       unsigned int cpu = smp_processor_id();
+
+       setup_timer(t, mce_timer_fn, cpu);
+       mce_start_timer(cpu, t);
+}
+
 /* Handle unconfigured int18 (should never happen) */
 static void unexpected_machine_check(struct pt_regs *regs, long error_code)
 {
@@ -2294,38 +2333,33 @@ mce_cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu)
        unsigned int cpu = (unsigned long)hcpu;
        struct timer_list *t = &per_cpu(mce_timer, cpu);
 
-       switch (action) {
+       switch (action & ~CPU_TASKS_FROZEN) {
        case CPU_ONLINE:
-       case CPU_ONLINE_FROZEN:
                mce_device_create(cpu);
                if (threshold_cpu_callback)
                        threshold_cpu_callback(action, cpu);
                break;
        case CPU_DEAD:
-       case CPU_DEAD_FROZEN:
                if (threshold_cpu_callback)
                        threshold_cpu_callback(action, cpu);
                mce_device_remove(cpu);
+               mce_intel_hcpu_update(cpu);
                break;
        case CPU_DOWN_PREPARE:
-       case CPU_DOWN_PREPARE_FROZEN:
-               del_timer_sync(t);
                smp_call_function_single(cpu, mce_disable_cpu, &action, 1);
+               del_timer_sync(t);
                break;
        case CPU_DOWN_FAILED:
-       case CPU_DOWN_FAILED_FROZEN:
-               if (!mce_ignore_ce && check_interval) {
-                       t->expires = round_jiffies(jiffies +
-                                       per_cpu(mce_next_interval, cpu));
-                       add_timer_on(t, cpu);
-               }
                smp_call_function_single(cpu, mce_reenable_cpu, &action, 1);
+               mce_start_timer(cpu, t);
                break;
-       case CPU_POST_DEAD:
+       }
+
+       if (action == CPU_POST_DEAD) {
                /* intentionally ignoring frozen here */
                cmci_rediscover(cpu);
-               break;
        }
+
        return NOTIFY_OK;
 }
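
The notifier above masks out CPU_TASKS_FROZEN so each case handles both the normal and the suspend/resume (frozen) variant of an event, and compares the unmasked action only where the distinction matters (CPU_POST_DEAD). A hedged sketch of the same pattern for a hypothetical driver; the my_device_* helpers are placeholders, not from this patch:

#include <linux/cpu.h>
#include <linux/notifier.h>

static void my_device_create(unsigned int cpu) { }      /* placeholder */
static void my_device_remove(unsigned int cpu) { }      /* placeholder */
static void my_device_rediscover(unsigned int cpu) { }  /* placeholder */

static int my_cpu_callback(struct notifier_block *nfb,
			   unsigned long action, void *hcpu)
{
	unsigned int cpu = (unsigned long)hcpu;

	switch (action & ~CPU_TASKS_FROZEN) {
	case CPU_ONLINE:		/* covers CPU_ONLINE_FROZEN too */
		my_device_create(cpu);
		break;
	case CPU_DEAD:			/* covers CPU_DEAD_FROZEN too */
		my_device_remove(cpu);
		break;
	}

	/* Compare unmasked where frozen must be treated differently. */
	if (action == CPU_POST_DEAD)
		my_device_rediscover(cpu);

	return NOTIFY_OK;
}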
 
index 38e49bc95ffcc5eba26631a7ac59618b1dafc660..098386fed48e82d239d0061fb704c2ad790b372c 100644 (file)
@@ -15,6 +15,8 @@
 #include <asm/msr.h>
 #include <asm/mce.h>
 
+#include "mce-internal.h"
+
 /*
  * Support for Intel Correct Machine Check Interrupts. This allows
  * the CPU to raise an interrupt when a corrected machine check happened.
@@ -30,7 +32,22 @@ static DEFINE_PER_CPU(mce_banks_t, mce_banks_owned);
  */
 static DEFINE_RAW_SPINLOCK(cmci_discover_lock);
 
-#define CMCI_THRESHOLD 1
+#define CMCI_THRESHOLD         1
+#define CMCI_POLL_INTERVAL     (30 * HZ)
+#define CMCI_STORM_INTERVAL    (1 * HZ)
+#define CMCI_STORM_THRESHOLD   15
+
+static DEFINE_PER_CPU(unsigned long, cmci_time_stamp);
+static DEFINE_PER_CPU(unsigned int, cmci_storm_cnt);
+static DEFINE_PER_CPU(unsigned int, cmci_storm_state);
+
+enum {
+       CMCI_STORM_NONE,
+       CMCI_STORM_ACTIVE,
+       CMCI_STORM_SUBSIDED,
+};
+
+static atomic_t cmci_storm_on_cpus;
 
 static int cmci_supported(int *banks)
 {
@@ -53,6 +70,93 @@ static int cmci_supported(int *banks)
        return !!(cap & MCG_CMCI_P);
 }
 
+void mce_intel_cmci_poll(void)
+{
+       if (__this_cpu_read(cmci_storm_state) == CMCI_STORM_NONE)
+               return;
+       machine_check_poll(MCP_TIMESTAMP, &__get_cpu_var(mce_banks_owned));
+}
+
+void mce_intel_hcpu_update(unsigned long cpu)
+{
+       if (per_cpu(cmci_storm_state, cpu) == CMCI_STORM_ACTIVE)
+               atomic_dec(&cmci_storm_on_cpus);
+
+       per_cpu(cmci_storm_state, cpu) = CMCI_STORM_NONE;
+}
+
+unsigned long mce_intel_adjust_timer(unsigned long interval)
+{
+       int r;
+
+       if (interval < CMCI_POLL_INTERVAL)
+               return interval;
+
+       switch (__this_cpu_read(cmci_storm_state)) {
+       case CMCI_STORM_ACTIVE:
+               /*
+                * We switch back to interrupt mode once the poll timer has
+                * silenced itself. That means no events recorded and the
+                * timer interval is back to our poll interval.
+                */
+               __this_cpu_write(cmci_storm_state, CMCI_STORM_SUBSIDED);
+               r = atomic_sub_return(1, &cmci_storm_on_cpus);
+               if (r == 0)
+                       pr_notice("CMCI storm subsided: switching to interrupt mode\n");
+               /* FALLTHROUGH */
+
+       case CMCI_STORM_SUBSIDED:
+               /*
+                * We wait for all cpus to go back to SUBSIDED
+                * state. When that happens we switch back to
+                * interrupt mode.
+                */
+               if (!atomic_read(&cmci_storm_on_cpus)) {
+                       __this_cpu_write(cmci_storm_state, CMCI_STORM_NONE);
+                       cmci_reenable();
+                       cmci_recheck();
+               }
+               return CMCI_POLL_INTERVAL;
+       default:
+               /*
+                * We have shiny weather. Let the poll do whatever it
+                * thinks.
+                */
+               return interval;
+       }
+}
+
+static bool cmci_storm_detect(void)
+{
+       unsigned int cnt = __this_cpu_read(cmci_storm_cnt);
+       unsigned long ts = __this_cpu_read(cmci_time_stamp);
+       unsigned long now = jiffies;
+       int r;
+
+       if (__this_cpu_read(cmci_storm_state) != CMCI_STORM_NONE)
+               return true;
+
+       if (time_before_eq(now, ts + CMCI_STORM_INTERVAL)) {
+               cnt++;
+       } else {
+               cnt = 1;
+               __this_cpu_write(cmci_time_stamp, now);
+       }
+       __this_cpu_write(cmci_storm_cnt, cnt);
+
+       if (cnt <= CMCI_STORM_THRESHOLD)
+               return false;
+
+       cmci_clear();
+       __this_cpu_write(cmci_storm_state, CMCI_STORM_ACTIVE);
+       r = atomic_add_return(1, &cmci_storm_on_cpus);
+       mce_timer_kick(CMCI_POLL_INTERVAL);
+
+       if (r == 1)
+               pr_notice("CMCI storm detected: switching to poll mode\n");
+       return true;
+}
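
cmci_storm_detect() above is a jiffies-window rate limiter: events landing within CMCI_STORM_INTERVAL of the recorded timestamp bump a counter, a later event restarts the window, and crossing CMCI_STORM_THRESHOLD flips the CPU into poll mode. A standalone sketch of just the window logic (plain C; it omits the wrap-safe time_before_eq() comparison and the per-CPU state handling):

#include <stdbool.h>
#include <stdio.h>

#define STORM_INTERVAL   1000   /* window length in "jiffies" */
#define STORM_THRESHOLD  15

struct storm_state {
        unsigned long timestamp;        /* start of current window */
        unsigned int  count;            /* events seen in this window */
};

/* True once more than STORM_THRESHOLD events land in one window. */
static bool storm_event(struct storm_state *s, unsigned long now)
{
        if (now <= s->timestamp + STORM_INTERVAL) {
                s->count++;
        } else {
                s->count = 1;
                s->timestamp = now;
        }
        return s->count > STORM_THRESHOLD;
}

int main(void)
{
        struct storm_state s = { 0, 0 };
        unsigned long now;

        for (now = 0; now < 20; now++)  /* 20 back-to-back events */
                if (storm_event(&s, now))
                        printf("storm detected at jiffy %lu\n", now);
        return 0;
}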
+
 /*
  * The interrupt handler. This is called on every event.
  * Just call the poller directly to log any events.
@@ -61,28 +165,21 @@ static int cmci_supported(int *banks)
  */
 static void intel_threshold_interrupt(void)
 {
+       if (cmci_storm_detect())
+               return;
        machine_check_poll(MCP_TIMESTAMP, &__get_cpu_var(mce_banks_owned));
        mce_notify_irq();
 }
 
-static void print_update(char *type, int *hdr, int num)
-{
-       if (*hdr == 0)
-               printk(KERN_INFO "CPU %d MCA banks", smp_processor_id());
-       *hdr = 1;
-       printk(KERN_CONT " %s:%d", type, num);
-}
-
 /*
  * Enable CMCI (Corrected Machine Check Interrupt) for available MCE banks
  * on this CPU. Use the algorithm recommended in the SDM to discover shared
  * banks.
  */
-static void cmci_discover(int banks, int boot)
+static void cmci_discover(int banks)
 {
        unsigned long *owned = (void *)&__get_cpu_var(mce_banks_owned);
        unsigned long flags;
-       int hdr = 0;
        int i;
 
        raw_spin_lock_irqsave(&cmci_discover_lock, flags);
@@ -96,8 +193,7 @@ static void cmci_discover(int banks, int boot)
 
                /* Already owned by someone else? */
                if (val & MCI_CTL2_CMCI_EN) {
-                       if (test_and_clear_bit(i, owned) && !boot)
-                               print_update("SHD", &hdr, i);
+                       clear_bit(i, owned);
                        __clear_bit(i, __get_cpu_var(mce_poll_banks));
                        continue;
                }
@@ -109,16 +205,13 @@ static void cmci_discover(int banks, int boot)
 
                /* Did the enable bit stick? -- the bank supports CMCI */
                if (val & MCI_CTL2_CMCI_EN) {
-                       if (!test_and_set_bit(i, owned) && !boot)
-                               print_update("CMCI", &hdr, i);
+                       set_bit(i, owned);
                        __clear_bit(i, __get_cpu_var(mce_poll_banks));
                } else {
                        WARN_ON(!test_bit(i, __get_cpu_var(mce_poll_banks)));
                }
        }
        raw_spin_unlock_irqrestore(&cmci_discover_lock, flags);
-       if (hdr)
-               printk(KERN_CONT "\n");
 }
 
 /*
@@ -186,7 +279,7 @@ void cmci_rediscover(int dying)
                        continue;
                /* Recheck banks in case CPUs don't all have the same */
                if (cmci_supported(&banks))
-                       cmci_discover(banks, 0);
+                       cmci_discover(banks);
        }
 
        set_cpus_allowed_ptr(current, old);
@@ -200,7 +293,7 @@ void cmci_reenable(void)
 {
        int banks;
        if (cmci_supported(&banks))
-               cmci_discover(banks, 0);
+               cmci_discover(banks);
 }
 
 static void intel_init_cmci(void)
@@ -211,7 +304,7 @@ static void intel_init_cmci(void)
                return;
 
        mce_threshold_vector = intel_threshold_interrupt;
-       cmci_discover(banks, 1);
+       cmci_discover(banks);
        /*
         * For CPU #0 this runs with still disabled APIC, but that's
         * ok because only the vector is set up. We still do another
index 7f2739e03e79a80fc1baaf203cf3a22eccec54dc..0d3d63afa76abd304cb7809461152ce97e3f0828 100644 (file)
@@ -2008,6 +2008,7 @@ __init int intel_pmu_init(void)
                break;
 
        case 28: /* Atom */
+       case 54: /* Cedarview */
                memcpy(hw_cache_event_ids, atom_hw_cache_event_ids,
                       sizeof(hw_cache_event_ids));
 
index 520b4265fcd215ee5afe240fe11c944dd6bc06aa..da02e9cc3754b4a2c1a37c1edb44865143f7f723 100644 (file)
@@ -686,7 +686,8 @@ void intel_pmu_lbr_init_atom(void)
         * to have an operational LBR which can freeze
         * on PMU interrupt
         */
-       if (boot_cpu_data.x86_mask < 10) {
+       if (boot_cpu_data.x86_model == 28
+           && boot_cpu_data.x86_mask < 10) {
                pr_cont("LBR disabled due to erratum");
                return;
        }
index 3ae2ced4a874ac930c0fd6563820dc3a482d75d2..b1581527a236a2f1b0d9657f14f13b471f7725fb 100644 (file)
@@ -342,6 +342,47 @@ const struct irq_domain_ops ioapic_irq_domain_ops = {
        .xlate = ioapic_xlate,
 };
 
+static void dt_add_ioapic_domain(unsigned int ioapic_num,
+               struct device_node *np)
+{
+       struct irq_domain *id;
+       struct mp_ioapic_gsi *gsi_cfg;
+       int ret;
+       int num;
+
+       gsi_cfg = mp_ioapic_gsi_routing(ioapic_num);
+       num = gsi_cfg->gsi_end - gsi_cfg->gsi_base + 1;
+
+       id = irq_domain_add_linear(np, num, &ioapic_irq_domain_ops,
+                       (void *)ioapic_num);
+       BUG_ON(!id);
+       if (gsi_cfg->gsi_base == 0) {
+               /*
+                * The first NR_IRQS_LEGACY irq descs are allocated in
+                * early_irq_init() and need only a mapping. The remaining
+                * irqs need both a descriptor and a mapping. All of them are
+                * preallocated and assigned so we can keep the 1:1 mapping
+                * which the ioapic hardware provides.
+                */
+               ret = irq_domain_associate_many(id, 0, 0, NR_IRQS_LEGACY);
+               if (ret)
+                       pr_err("Error mapping legacy IRQs: %d\n", ret);
+
+               if (num > NR_IRQS_LEGACY) {
+                       ret = irq_create_strict_mappings(id, NR_IRQS_LEGACY,
+                                       NR_IRQS_LEGACY, num - NR_IRQS_LEGACY);
+                       if (ret)
+                               pr_err("Error creating mapping for the "
+                                               "remaining IRQs: %d\n", ret);
+               }
+               irq_set_default_host(id);
+       } else {
+               ret = irq_create_strict_mappings(id, gsi_cfg->gsi_base, 0, num);
+               if (ret)
+                       pr_err("Error creating IRQ mapping: %d\n", ret);
+       }
+}
+
 static void __init ioapic_add_ofnode(struct device_node *np)
 {
        struct resource r;
@@ -356,15 +397,7 @@ static void __init ioapic_add_ofnode(struct device_node *np)
 
        for (i = 0; i < nr_ioapics; i++) {
                if (r.start == mpc_ioapic_addr(i)) {
-                       struct irq_domain *id;
-                       struct mp_ioapic_gsi *gsi_cfg;
-
-                       gsi_cfg = mp_ioapic_gsi_routing(i);
-
-                       id = irq_domain_add_legacy(np, 32, gsi_cfg->gsi_base, 0,
-                                                  &ioapic_irq_domain_ops,
-                                                  (void*)i);
-                       BUG_ON(!id);
+                       dt_add_ioapic_domain(i, np);
                        return;
                }
        }
index 623f288374763286ec9e58aa66ecabb4e3fc2f6b..061ac17ee974ae78fed3e4238aa331d6d828f61f 100644 (file)
@@ -1109,17 +1109,21 @@ ENTRY(ftrace_caller)
        pushl %eax
        pushl %ecx
        pushl %edx
-       movl 0xc(%esp), %eax
+       pushl $0        /* Pass NULL as regs pointer */
+       movl 4*4(%esp), %eax
        movl 0x4(%ebp), %edx
+       leal function_trace_op, %ecx
        subl $MCOUNT_INSN_SIZE, %eax
 
 .globl ftrace_call
 ftrace_call:
        call ftrace_stub
 
+       addl $4,%esp    /* skip NULL pointer */
        popl %edx
        popl %ecx
        popl %eax
+ftrace_ret:
 #ifdef CONFIG_FUNCTION_GRAPH_TRACER
 .globl ftrace_graph_call
 ftrace_graph_call:
@@ -1131,6 +1135,72 @@ ftrace_stub:
        ret
 END(ftrace_caller)
 
+ENTRY(ftrace_regs_caller)
+       pushf   /* push flags before compare (in cs location) */
+       cmpl $0, function_trace_stop
+       jne ftrace_restore_flags
+
+       /*
+        * i386 does not save SS and ESP when coming from kernel.
+        * Instead, to get sp, &regs->sp is used (see ptrace.h).
+        * Unfortunately, that means eflags must be at the same location
+        * as the current return ip. We move the return ip into the
+        * ip location, and move flags into the return ip location.
+        */
+       pushl 4(%esp)   /* save return ip into ip slot */
+       subl $MCOUNT_INSN_SIZE, (%esp)  /* Adjust ip */
+
+       pushl $0        /* Load 0 into orig_ax */
+       pushl %gs
+       pushl %fs
+       pushl %es
+       pushl %ds
+       pushl %eax
+       pushl %ebp
+       pushl %edi
+       pushl %esi
+       pushl %edx
+       pushl %ecx
+       pushl %ebx
+
+       movl 13*4(%esp), %eax   /* Get the saved flags */
+       movl %eax, 14*4(%esp)   /* Move saved flags into regs->flags location */
+                               /* clobbering return ip */
+       movl $__KERNEL_CS,13*4(%esp)
+
+       movl 12*4(%esp), %eax   /* Load ip (1st parameter) */
+       movl 0x4(%ebp), %edx    /* Load parent ip (2nd parameter) */
+       leal function_trace_op, %ecx /* Save ftrace_ops in 3rd parameter */
+       pushl %esp              /* Save pt_regs as 4th parameter */
+
+GLOBAL(ftrace_regs_call)
+       call ftrace_stub
+
+       addl $4, %esp           /* Skip pt_regs */
+       movl 14*4(%esp), %eax   /* Move flags back into cs */
+       movl %eax, 13*4(%esp)   /* Needed to keep addl from modifying flags */
+       movl 12*4(%esp), %eax   /* Get return ip from regs->ip */
+       addl $MCOUNT_INSN_SIZE, %eax
+       movl %eax, 14*4(%esp)   /* Put return ip back for ret */
+
+       popl %ebx
+       popl %ecx
+       popl %edx
+       popl %esi
+       popl %edi
+       popl %ebp
+       popl %eax
+       popl %ds
+       popl %es
+       popl %fs
+       popl %gs
+       addl $8, %esp           /* Skip orig_ax and ip */
+       popf                    /* Pop flags at end (no addl to corrupt flags) */
+       jmp ftrace_ret
+
+ftrace_restore_flags:
+       popf
+       jmp  ftrace_stub
 #else /* ! CONFIG_DYNAMIC_FTRACE */
 
 ENTRY(mcount)
@@ -1171,9 +1241,6 @@ END(mcount)
 
 #ifdef CONFIG_FUNCTION_GRAPH_TRACER
 ENTRY(ftrace_graph_caller)
-       cmpl $0, function_trace_stop
-       jne ftrace_stub
-
        pushl %eax
        pushl %ecx
        pushl %edx
index 69babd8c834f920b4d54c48e1f41a08d4f7fef6f..ed767b747fe5e731c38e92fd872cd71a137adc55 100644 (file)
        .section .entry.text, "ax"
 
 #ifdef CONFIG_FUNCTION_TRACER
+
+#ifdef CC_USING_FENTRY
+# define function_hook __fentry__
+#else
+# define function_hook mcount
+#endif
+
 #ifdef CONFIG_DYNAMIC_FTRACE
-ENTRY(mcount)
+
+ENTRY(function_hook)
        retq
-END(mcount)
+END(function_hook)
+
+/* skip is set if stack has been adjusted */
+.macro ftrace_caller_setup skip=0
+       MCOUNT_SAVE_FRAME \skip
+
+       /* Load the ftrace_ops into the 3rd parameter */
+       leaq function_trace_op, %rdx
+
+       /* Load ip into the first parameter */
+       movq RIP(%rsp), %rdi
+       subq $MCOUNT_INSN_SIZE, %rdi
+       /* Load the parent_ip into the second parameter */
+#ifdef CC_USING_FENTRY
+       movq SS+16(%rsp), %rsi
+#else
+       movq 8(%rbp), %rsi
+#endif
+.endm
 
 ENTRY(ftrace_caller)
+       /* Check if tracing was disabled (quick check) */
        cmpl $0, function_trace_stop
        jne  ftrace_stub
 
-       MCOUNT_SAVE_FRAME
-
-       movq 0x38(%rsp), %rdi
-       movq 8(%rbp), %rsi
-       subq $MCOUNT_INSN_SIZE, %rdi
+       ftrace_caller_setup
+       /* regs go into 4th parameter (but make it NULL) */
+       movq $0, %rcx
 
 GLOBAL(ftrace_call)
        call ftrace_stub
 
        MCOUNT_RESTORE_FRAME
+ftrace_return:
 
 #ifdef CONFIG_FUNCTION_GRAPH_TRACER
 GLOBAL(ftrace_graph_call)
@@ -97,8 +123,74 @@ GLOBAL(ftrace_stub)
        retq
 END(ftrace_caller)
 
+ENTRY(ftrace_regs_caller)
+       /* Save the current flags before compare (in SS location)*/
+       pushfq
+
+       /* Check if tracing was disabled (quick check) */
+       cmpl $0, function_trace_stop
+       jne  ftrace_restore_flags
+
+       /* skip=8 to skip flags saved in SS */
+       ftrace_caller_setup 8
+
+       /* Save the rest of pt_regs */
+       movq %r15, R15(%rsp)
+       movq %r14, R14(%rsp)
+       movq %r13, R13(%rsp)
+       movq %r12, R12(%rsp)
+       movq %r11, R11(%rsp)
+       movq %r10, R10(%rsp)
+       movq %rbp, RBP(%rsp)
+       movq %rbx, RBX(%rsp)
+       /* Copy saved flags */
+       movq SS(%rsp), %rcx
+       movq %rcx, EFLAGS(%rsp)
+       /* Kernel segments */
+       movq $__KERNEL_DS, %rcx
+       movq %rcx, SS(%rsp)
+       movq $__KERNEL_CS, %rcx
+       movq %rcx, CS(%rsp)
+       /* Stack - skipping return address */
+       leaq SS+16(%rsp), %rcx
+       movq %rcx, RSP(%rsp)
+
+       /* regs go into 4th parameter */
+       leaq (%rsp), %rcx
+
+GLOBAL(ftrace_regs_call)
+       call ftrace_stub
+
+       /* Copy flags back to SS, to restore them */
+       movq EFLAGS(%rsp), %rax
+       movq %rax, SS(%rsp)
+
+       /* restore the rest of pt_regs */
+       movq R15(%rsp), %r15
+       movq R14(%rsp), %r14
+       movq R13(%rsp), %r13
+       movq R12(%rsp), %r12
+       movq R10(%rsp), %r10
+       movq RBP(%rsp), %rbp
+       movq RBX(%rsp), %rbx
+
+       /* skip=8 to skip flags saved in SS */
+       MCOUNT_RESTORE_FRAME 8
+
+       /* Restore flags */
+       popfq
+
+       jmp ftrace_return
+ftrace_restore_flags:
+       popfq
+       jmp  ftrace_stub
+
+END(ftrace_regs_caller)
+
+
 #else /* ! CONFIG_DYNAMIC_FTRACE */
-ENTRY(mcount)
+
+ENTRY(function_hook)
        cmpl $0, function_trace_stop
        jne  ftrace_stub
 
@@ -119,8 +211,12 @@ GLOBAL(ftrace_stub)
 trace:
        MCOUNT_SAVE_FRAME
 
-       movq 0x38(%rsp), %rdi
+       movq RIP(%rsp), %rdi
+#ifdef CC_USING_FENTRY
+       movq SS+16(%rsp), %rsi
+#else
        movq 8(%rbp), %rsi
+#endif
        subq $MCOUNT_INSN_SIZE, %rdi
 
        call   *ftrace_trace_function
@@ -128,20 +224,22 @@ trace:
        MCOUNT_RESTORE_FRAME
 
        jmp ftrace_stub
-END(mcount)
+END(function_hook)
 #endif /* CONFIG_DYNAMIC_FTRACE */
 #endif /* CONFIG_FUNCTION_TRACER */
 
 #ifdef CONFIG_FUNCTION_GRAPH_TRACER
 ENTRY(ftrace_graph_caller)
-       cmpl $0, function_trace_stop
-       jne ftrace_stub
-
        MCOUNT_SAVE_FRAME
 
+#ifdef CC_USING_FENTRY
+       leaq SS+16(%rsp), %rdi
+       movq $0, %rdx   /* No framepointers needed */
+#else
        leaq 8(%rbp), %rdi
-       movq 0x38(%rsp), %rsi
        movq (%rbp), %rdx
+#endif
+       movq RIP(%rsp), %rsi
        subq $MCOUNT_INSN_SIZE, %rsi
 
        call    prepare_ftrace_return
index c3a7cb4bf6e6f0f429495d8f4a9478ac9161c0d6..1d414029f1d800ff67ae21bd2e74794ad06efa32 100644 (file)
@@ -206,6 +206,21 @@ static int
 ftrace_modify_code(unsigned long ip, unsigned const char *old_code,
                   unsigned const char *new_code);
 
+/*
+ * Should never be called:
+ *  It is only reached from __ftrace_replace_code(), which is called by
+ *  ftrace_replace_code() (which x86 overrides) and by ftrace_update_code();
+ *  those paths turn mcount sites into nops or nops into function calls,
+ *  but never convert a function from the normal callback to the
+ *  regs-saving one, which is what ftrace_modify_call() is for.
+ */
+int ftrace_modify_call(struct dyn_ftrace *rec, unsigned long old_addr,
+                                unsigned long addr)
+{
+       WARN_ON(1);
+       return -EINVAL;
+}
+
 int ftrace_update_ftrace_func(ftrace_func_t func)
 {
        unsigned long ip = (unsigned long)(&ftrace_call);
@@ -220,6 +235,14 @@ int ftrace_update_ftrace_func(ftrace_func_t func)
 
        ret = ftrace_modify_code(ip, old, new);
 
+       /* Also update the regs callback function */
+       if (!ret) {
+               ip = (unsigned long)(&ftrace_regs_call);
+               memcpy(old, &ftrace_regs_call, MCOUNT_INSN_SIZE);
+               new = ftrace_call_replace(ip, (unsigned long)func);
+               ret = ftrace_modify_code(ip, old, new);
+       }
+
        atomic_dec(&modifying_ftrace_code);
 
        return ret;
@@ -299,6 +322,32 @@ static int add_brk_on_nop(struct dyn_ftrace *rec)
        return add_break(rec->ip, old);
 }
 
+/*
+ * If the record has FTRACE_FL_REGS set, it wants to convert to a
+ * callback that saves all regs. If FTRACE_FL_REGS is not set, it
+ * wants to convert to the normal callback.
+ */
+static unsigned long get_ftrace_addr(struct dyn_ftrace *rec)
+{
+       if (rec->flags & FTRACE_FL_REGS)
+               return (unsigned long)FTRACE_REGS_ADDR;
+       else
+               return (unsigned long)FTRACE_ADDR;
+}
+
+/*
+ * The FTRACE_FL_REGS_EN is set when the record already points to
+ * a function that saves all the regs. Basically the '_EN' version
+ * represents the current state of the function.
+ */
+static unsigned long get_ftrace_old_addr(struct dyn_ftrace *rec)
+{
+       if (rec->flags & FTRACE_FL_REGS_EN)
+               return (unsigned long)FTRACE_REGS_ADDR;
+       else
+               return (unsigned long)FTRACE_ADDR;
+}
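
get_ftrace_addr()/get_ftrace_old_addr() above encode a want/have pair: FTRACE_FL_REGS names the callback flavor the record wants, FTRACE_FL_REGS_EN the one currently installed. A tiny sketch enumerating the four combinations (the FL_* constants here are illustrative stand-ins, not the kernel's values):

#include <stdio.h>

#define FL_REGS     0x1 /* wants the regs-saving trampoline */
#define FL_REGS_EN  0x2 /* currently has the regs-saving trampoline */

static const char *wanted(unsigned flags)
{
        return (flags & FL_REGS) ? "FTRACE_REGS_ADDR" : "FTRACE_ADDR";
}

static const char *installed(unsigned flags)
{
        return (flags & FL_REGS_EN) ? "FTRACE_REGS_ADDR" : "FTRACE_ADDR";
}

int main(void)
{
        unsigned flags;

        /* Rows where the two differ are the MODIFY_CALL cases. */
        for (flags = 0; flags < 4; flags++)
                printf("flags=%u: %s -> %s\n", flags,
                       installed(flags), wanted(flags));
        return 0;
}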
+
 static int add_breakpoints(struct dyn_ftrace *rec, int enable)
 {
        unsigned long ftrace_addr;
@@ -306,7 +355,7 @@ static int add_breakpoints(struct dyn_ftrace *rec, int enable)
 
        ret = ftrace_test_record(rec, enable);
 
-       ftrace_addr = (unsigned long)FTRACE_ADDR;
+       ftrace_addr = get_ftrace_addr(rec);
 
        switch (ret) {
        case FTRACE_UPDATE_IGNORE:
@@ -316,6 +365,10 @@ static int add_breakpoints(struct dyn_ftrace *rec, int enable)
                /* converting nop to call */
                return add_brk_on_nop(rec);
 
+       case FTRACE_UPDATE_MODIFY_CALL_REGS:
+       case FTRACE_UPDATE_MODIFY_CALL:
+               ftrace_addr = get_ftrace_old_addr(rec);
+               /* fall through */
        case FTRACE_UPDATE_MAKE_NOP:
                /* converting a call to a nop */
                return add_brk_on_call(rec, ftrace_addr);
@@ -360,13 +413,21 @@ static int remove_breakpoint(struct dyn_ftrace *rec)
                 * If not, don't touch the breakpoint; we would just
                 * create a disaster.
                 */
-               ftrace_addr = (unsigned long)FTRACE_ADDR;
+               ftrace_addr = get_ftrace_addr(rec);
+               nop = ftrace_call_replace(ip, ftrace_addr);
+
+               if (memcmp(&ins[1], &nop[1], MCOUNT_INSN_SIZE - 1) == 0)
+                       goto update;
+
+               /* Check both ftrace_addr and ftrace_old_addr */
+               ftrace_addr = get_ftrace_old_addr(rec);
                nop = ftrace_call_replace(ip, ftrace_addr);
 
                if (memcmp(&ins[1], &nop[1], MCOUNT_INSN_SIZE - 1) != 0)
                        return -EINVAL;
        }
 
+ update:
        return probe_kernel_write((void *)ip, &nop[0], 1);
 }
 
@@ -405,12 +466,14 @@ static int add_update(struct dyn_ftrace *rec, int enable)
 
        ret = ftrace_test_record(rec, enable);
 
-       ftrace_addr = (unsigned long)FTRACE_ADDR;
+       ftrace_addr  = get_ftrace_addr(rec);
 
        switch (ret) {
        case FTRACE_UPDATE_IGNORE:
                return 0;
 
+       case FTRACE_UPDATE_MODIFY_CALL_REGS:
+       case FTRACE_UPDATE_MODIFY_CALL:
        case FTRACE_UPDATE_MAKE_CALL:
                /* converting nop to call */
                return add_update_call(rec, ftrace_addr);
@@ -455,12 +518,14 @@ static int finish_update(struct dyn_ftrace *rec, int enable)
 
        ret = ftrace_update_record(rec, enable);
 
-       ftrace_addr = (unsigned long)FTRACE_ADDR;
+       ftrace_addr = get_ftrace_addr(rec);
 
        switch (ret) {
        case FTRACE_UPDATE_IGNORE:
                return 0;
 
+       case FTRACE_UPDATE_MODIFY_CALL_REGS:
+       case FTRACE_UPDATE_MODIFY_CALL:
        case FTRACE_UPDATE_MAKE_CALL:
                /* converting nop to call */
                return finish_update_call(rec, ftrace_addr);
index 36d1853e91af58c0ee2b9ee8df2c045229f22936..9a5c460404dca563a1e5ac32aeeb90ba2cdaafbf 100644 (file)
@@ -263,7 +263,7 @@ static void i8259A_shutdown(void)
         * out of.
         */
        outb(0xff, PIC_MASTER_IMR);     /* mask all of 8259A-1 */
-       outb(0xff, PIC_SLAVE_IMR);      /* mask all of 8259A-1 */
+       outb(0xff, PIC_SLAVE_IMR);      /* mask all of 8259A-2 */
 }
 
 static struct syscore_ops i8259_syscore_ops = {
index e2f751efb7b1ef25ce9329a49128a48c1dcf9c3f..47ae1023a93c131b4e552b6c94f4a900e985b5a3 100644 (file)
@@ -1052,6 +1052,54 @@ int __kprobes longjmp_break_handler(struct kprobe *p, struct pt_regs *regs)
        return 0;
 }
 
+#ifdef KPROBES_CAN_USE_FTRACE
+/* Ftrace callback handler for kprobes */
+void __kprobes kprobe_ftrace_handler(unsigned long ip, unsigned long parent_ip,
+                                    struct ftrace_ops *ops, struct pt_regs *regs)
+{
+       struct kprobe *p;
+       struct kprobe_ctlblk *kcb;
+       unsigned long flags;
+
+       /* Disable irqs to emulate a breakpoint and avoid preemption */
+       local_irq_save(flags);
+
+       p = get_kprobe((kprobe_opcode_t *)ip);
+       if (unlikely(!p) || kprobe_disabled(p))
+               goto end;
+
+       kcb = get_kprobe_ctlblk();
+       if (kprobe_running()) {
+               kprobes_inc_nmissed_count(p);
+       } else {
+               regs->ip += sizeof(kprobe_opcode_t);
+
+               __this_cpu_write(current_kprobe, p);
+               kcb->kprobe_status = KPROBE_HIT_ACTIVE;
+               if (p->pre_handler)
+                       p->pre_handler(p, regs);
+
+               if (unlikely(p->post_handler)) {
+                       /* Emulate singlestep as if there were a 5-byte nop */
+                       regs->ip = ip + MCOUNT_INSN_SIZE;
+                       kcb->kprobe_status = KPROBE_HIT_SSDONE;
+                       p->post_handler(p, regs, 0);
+               }
+               __this_cpu_write(current_kprobe, NULL);
+               regs->ip = ip;  /* Recover for next callback */
+       }
+end:
+       local_irq_restore(flags);
+}
+
+int __kprobes arch_prepare_kprobe_ftrace(struct kprobe *p)
+{
+       p->ainsn.insn = NULL;
+       p->ainsn.boostable = -1;
+       return 0;
+}
+#endif
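
kprobe_ftrace_handler() above lets a probe placed on a function's mcount/fentry site fire from the ftrace callback instead of an int3 trap, fixing up regs->ip to emulate the breakpoint. A hedged sketch of registering such a probe from a module; the target symbol is illustrative, and whether ftrace is used underneath is decided by the kprobes core, not the caller:

#include <linux/module.h>
#include <linux/kprobes.h>

static int my_pre(struct kprobe *p, struct pt_regs *regs)
{
	pr_info("hit %s, ip=%lx\n", p->symbol_name, regs->ip);
	return 0;
}

static struct kprobe kp = {
	.symbol_name = "do_sys_open",	/* illustrative target */
	.pre_handler = my_pre,
};

static int __init my_kprobe_init(void)
{
	/* If the probe lands on the fentry site and KPROBES_CAN_USE_FTRACE
	 * is set, the core arms it through ftrace instead of int3. */
	return register_kprobe(&kp);
}

static void __exit my_kprobe_exit(void)
{
	unregister_kprobe(&kp);
}

module_init(my_kprobe_init);
module_exit(my_kprobe_exit);
MODULE_LICENSE("GPL");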
+
 int __init arch_init_kprobes(void)
 {
        return arch_init_optprobes();
index 82746f942cd8db4d59a32631049a79146564052e..5511216b44341b817c4c9f6be2b0765e3a3c833c 100644 (file)
@@ -75,20 +75,113 @@ struct microcode_amd {
 
 static struct equiv_cpu_entry *equiv_cpu_table;
 
-/* page-sized ucode patch buffer */
-void *patch;
+struct ucode_patch {
+       struct list_head plist;
+       void *data;
+       u32 patch_id;
+       u16 equiv_cpu;
+};
+
+static LIST_HEAD(pcache);
+
+static u16 find_equiv_id(unsigned int cpu)
+{
+       struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
+       int i = 0;
+
+       if (!equiv_cpu_table)
+               return 0;
+
+       while (equiv_cpu_table[i].installed_cpu != 0) {
+               if (uci->cpu_sig.sig == equiv_cpu_table[i].installed_cpu)
+                       return equiv_cpu_table[i].equiv_cpu;
+
+               i++;
+       }
+       return 0;
+}
+
+static u32 find_cpu_family_by_equiv_cpu(u16 equiv_cpu)
+{
+       int i = 0;
+
+       BUG_ON(!equiv_cpu_table);
+
+       while (equiv_cpu_table[i].equiv_cpu != 0) {
+               if (equiv_cpu == equiv_cpu_table[i].equiv_cpu)
+                       return equiv_cpu_table[i].installed_cpu;
+               i++;
+       }
+       return 0;
+}
+
+/*
+ * a small, trivial cache of per-family ucode patches
+ */
+static struct ucode_patch *cache_find_patch(u16 equiv_cpu)
+{
+       struct ucode_patch *p;
+
+       list_for_each_entry(p, &pcache, plist)
+               if (p->equiv_cpu == equiv_cpu)
+                       return p;
+       return NULL;
+}
+
+static void update_cache(struct ucode_patch *new_patch)
+{
+       struct ucode_patch *p;
+
+       list_for_each_entry(p, &pcache, plist) {
+               if (p->equiv_cpu == new_patch->equiv_cpu) {
+                       if (p->patch_id >= new_patch->patch_id)
+                               /* we already have the latest patch */
+                               return;
+
+                       list_replace(&p->plist, &new_patch->plist);
+                       kfree(p->data);
+                       kfree(p);
+                       return;
+               }
+       }
+       /* no patch found, add it */
+       list_add_tail(&new_patch->plist, &pcache);
+}
+
+static void free_cache(void)
+{
+       struct ucode_patch *p;
+
+       list_for_each_entry_reverse(p, &pcache, plist) {
+               __list_del(p->plist.prev, p->plist.next);
+               kfree(p->data);
+               kfree(p);
+       }
+}
+
+static struct ucode_patch *find_patch(unsigned int cpu)
+{
+       u16 equiv_id;
+
+       equiv_id = find_equiv_id(cpu);
+       if (!equiv_id)
+               return NULL;
+
+       return cache_find_patch(equiv_id);
+}
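
The pcache above keeps at most one patch per equivalence ID and replaces an entry only when a newer patch_id arrives. A user-space sketch of the same keep-newest policy (plain singly linked list, not the kernel's list API; all names are illustrative):

#include <stdio.h>
#include <stdlib.h>

struct patch {
        struct patch *next;
        unsigned equiv_cpu;
        unsigned patch_id;
};

/* Insert p, keeping only the newest patch_id per equiv_cpu. */
static void cache_update(struct patch **head, struct patch *p)
{
        struct patch **pp;

        for (pp = head; *pp; pp = &(*pp)->next) {
                if ((*pp)->equiv_cpu != p->equiv_cpu)
                        continue;
                if ((*pp)->patch_id >= p->patch_id) {
                        free(p);                /* already have newer */
                } else {
                        p->next = (*pp)->next;  /* replace in place */
                        free(*pp);
                        *pp = p;
                }
                return;
        }
        p->next = *head;        /* no entry for this family yet */
        *head = p;
}

int main(void)
{
        struct patch *head = NULL, *p;
        unsigned ids[] = { 0x3000, 0x3002, 0x3001 };
        int i;

        for (i = 0; i < 3; i++) {
                p = calloc(1, sizeof(*p));
                p->equiv_cpu = 0x5801;
                p->patch_id = ids[i];
                cache_update(&head, p);
        }
        printf("cached patch_id=0x%x\n", head->patch_id);  /* 0x3002 */
        return 0;
}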
 
 static int collect_cpu_info_amd(int cpu, struct cpu_signature *csig)
 {
        struct cpuinfo_x86 *c = &cpu_data(cpu);
 
+       csig->sig = cpuid_eax(0x00000001);
        csig->rev = c->microcode;
        pr_info("CPU%d: patch_level=0x%08x\n", cpu, csig->rev);
 
        return 0;
 }
 
-static unsigned int verify_ucode_size(int cpu, u32 patch_size,
+static unsigned int verify_patch_size(int cpu, u32 patch_size,
                                      unsigned int size)
 {
        struct cpuinfo_x86 *c = &cpu_data(cpu);
@@ -118,95 +211,37 @@ static unsigned int verify_ucode_size(int cpu, u32 patch_size,
        return patch_size;
 }
 
-static u16 find_equiv_id(void)
+static int apply_microcode_amd(int cpu)
 {
-       unsigned int current_cpu_id, i = 0;
-
-       BUG_ON(equiv_cpu_table == NULL);
-
-       current_cpu_id = cpuid_eax(0x00000001);
-
-       while (equiv_cpu_table[i].installed_cpu != 0) {
-               if (current_cpu_id == equiv_cpu_table[i].installed_cpu)
-                       return equiv_cpu_table[i].equiv_cpu;
-
-               i++;
-       }
-       return 0;
-}
+       struct cpuinfo_x86 *c = &cpu_data(cpu);
+       struct microcode_amd *mc_amd;
+       struct ucode_cpu_info *uci;
+       struct ucode_patch *p;
+       u32 rev, dummy;
 
-/*
- * we signal a good patch is found by returning its size > 0
- */
-static int get_matching_microcode(int cpu, const u8 *ucode_ptr,
-                                 unsigned int leftover_size, int rev,
-                                 unsigned int *current_size)
-{
-       struct microcode_header_amd *mc_hdr;
-       unsigned int actual_size, patch_size;
-       u16 equiv_cpu_id;
+       BUG_ON(raw_smp_processor_id() != cpu);
 
-       /* size of the current patch we're staring at */
-       patch_size = *(u32 *)(ucode_ptr + 4);
-       *current_size = patch_size + SECTION_HDR_SIZE;
+       uci = ucode_cpu_info + cpu;
 
-       equiv_cpu_id = find_equiv_id();
-       if (!equiv_cpu_id)
+       p = find_patch(cpu);
+       if (!p)
                return 0;
 
-       /*
-        * let's look at the patch header itself now
-        */
-       mc_hdr = (struct microcode_header_amd *)(ucode_ptr + SECTION_HDR_SIZE);
+       mc_amd  = p->data;
+       uci->mc = p->data;
 
-       if (mc_hdr->processor_rev_id != equiv_cpu_id)
-               return 0;
+       rdmsr(MSR_AMD64_PATCH_LEVEL, rev, dummy);
 
-       /* ucode might be chipset specific -- currently we don't support this */
-       if (mc_hdr->nb_dev_id || mc_hdr->sb_dev_id) {
-               pr_err("CPU%d: chipset specific code not yet supported\n",
-                      cpu);
+       /* need to apply patch? */
+       if (rev >= mc_amd->hdr.patch_id) {
+               c->microcode = rev;
                return 0;
        }
 
-       if (mc_hdr->patch_id <= rev)
-               return 0;
-
-       /*
-        * now that the header looks sane, verify its size
-        */
-       actual_size = verify_ucode_size(cpu, patch_size, leftover_size);
-       if (!actual_size)
-               return 0;
-
-       /* clear the patch buffer */
-       memset(patch, 0, PAGE_SIZE);
-
-       /* all looks ok, get the binary patch */
-       get_ucode_data(patch, ucode_ptr + SECTION_HDR_SIZE, actual_size);
-
-       return actual_size;
-}
-
-static int apply_microcode_amd(int cpu)
-{
-       u32 rev, dummy;
-       int cpu_num = raw_smp_processor_id();
-       struct ucode_cpu_info *uci = ucode_cpu_info + cpu_num;
-       struct microcode_amd *mc_amd = uci->mc;
-       struct cpuinfo_x86 *c = &cpu_data(cpu);
-
-       /* We should bind the task to the CPU */
-       BUG_ON(cpu_num != cpu);
-
-       if (mc_amd == NULL)
-               return 0;
-
        wrmsrl(MSR_AMD64_PATCH_LOADER, (u64)(long)&mc_amd->hdr.data_code);
-       /* get patch id after patching */
-       rdmsr(MSR_AMD64_PATCH_LEVEL, rev, dummy);
 
-       /* check current patch id and patch's id for match */
+       /* verify patch application was successful */
+       rdmsr(MSR_AMD64_PATCH_LEVEL, rev, dummy);
        if (rev != mc_amd->hdr.patch_id) {
                pr_err("CPU%d: update failed for patch_level=0x%08x\n",
                       cpu, mc_amd->hdr.patch_id);
@@ -238,7 +273,7 @@ static int install_equiv_cpu_table(const u8 *buf)
                return -ENOMEM;
        }
 
-       get_ucode_data(equiv_cpu_table, buf + CONTAINER_HDR_SZ, size);
+       memcpy(equiv_cpu_table, buf + CONTAINER_HDR_SZ, size);
 
        /* add header length */
        return size + CONTAINER_HDR_SZ;
@@ -250,61 +285,113 @@ static void free_equiv_cpu_table(void)
        equiv_cpu_table = NULL;
 }
 
-static enum ucode_state
-generic_load_microcode(int cpu, const u8 *data, size_t size)
+static void cleanup(void)
 {
-       struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
-       struct microcode_header_amd *mc_hdr = NULL;
-       unsigned int mc_size, leftover, current_size = 0;
+       free_equiv_cpu_table();
+       free_cache();
+}
+
+/*
+ * We return the current patch's size even if some of its checks failed,
+ * so that the caller can skip over it to the next patch. A negative
+ * return value signals a grave error, such as a failed memory
+ * allocation, after which the driver cannot continue functioning
+ * normally; in that case we tear down everything used so far and exit.
+ */
+static int verify_and_add_patch(unsigned int cpu, u8 *fw, unsigned int leftover)
+{
+       struct cpuinfo_x86 *c = &cpu_data(cpu);
+       struct microcode_header_amd *mc_hdr;
+       struct ucode_patch *patch;
+       unsigned int patch_size, crnt_size, ret;
+       u32 proc_fam;
+       u16 proc_id;
+
+       patch_size  = *(u32 *)(fw + 4);
+       crnt_size   = patch_size + SECTION_HDR_SIZE;
+       mc_hdr      = (struct microcode_header_amd *)(fw + SECTION_HDR_SIZE);
+       proc_id     = mc_hdr->processor_rev_id;
+
+       proc_fam = find_cpu_family_by_equiv_cpu(proc_id);
+       if (!proc_fam) {
+               pr_err("No patch family for equiv ID: 0x%04x\n", proc_id);
+               return crnt_size;
+       }
+
+       /* check if patch is for the current family */
+       proc_fam = ((proc_fam >> 8) & 0xf) + ((proc_fam >> 20) & 0xff);
+       if (proc_fam != c->x86)
+               return crnt_size;
+
+       if (mc_hdr->nb_dev_id || mc_hdr->sb_dev_id) {
+               pr_err("Patch-ID 0x%08x: chipset-specific code unsupported.\n",
+                       mc_hdr->patch_id);
+               return crnt_size;
+       }
+
+       ret = verify_patch_size(cpu, patch_size, leftover);
+       if (!ret) {
+               pr_err("Patch-ID 0x%08x: size mismatch.\n", mc_hdr->patch_id);
+               return crnt_size;
+       }
+
+       patch = kzalloc(sizeof(*patch), GFP_KERNEL);
+       if (!patch) {
+               pr_err("Patch allocation failure.\n");
+               return -EINVAL;
+       }
+
+       patch->data = kzalloc(patch_size, GFP_KERNEL);
+       if (!patch->data) {
+               pr_err("Patch data allocation failure.\n");
+               kfree(patch);
+               return -EINVAL;
+       }
+
+       /* All looks ok, copy patch... */
+       memcpy(patch->data, fw + SECTION_HDR_SIZE, patch_size);
+       INIT_LIST_HEAD(&patch->plist);
+       patch->patch_id  = mc_hdr->patch_id;
+       patch->equiv_cpu = proc_id;
+
+       /* ... and add to cache. */
+       update_cache(patch);
+
+       return crnt_size;
+}
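
load_microcode_amd() below walks the container by repeatedly asking verify_and_add_patch() how many bytes to consume: a positive return always advances past the current section, even a rejected one, while a negative return aborts the whole load. A compact sketch of that parsing contract (toy on-disk format; parse_section() is a stand-in):

#include <stdio.h>

/* Stand-in: returns bytes consumed, or -1 on a fatal error. */
static int parse_section(const unsigned char *p, unsigned left)
{
        unsigned size = p[0];           /* toy: first byte = section size */

        if (size == 0 || size > left)
                return -1;              /* fatal: cannot make progress */
        if (p[1] == 0xff)
                printf("skipping bad section (%u bytes)\n", size);
        else
                printf("accepted section (%u bytes)\n", size);
        return size;                    /* always advance on success */
}

int main(void)
{
        unsigned char blob[] = { 3, 0, 0, 4, 0xff, 0, 0, 2, 0 };
        unsigned left = sizeof(blob);
        const unsigned char *p = blob;
        int used;

        while (left) {
                used = parse_section(p, left);
                if (used < 0)
                        return 1;       /* tear down and exit */
                p += used;
                left -= used;
        }
        return 0;
}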
+
+static enum ucode_state load_microcode_amd(int cpu, const u8 *data, size_t size)
+{
+       enum ucode_state ret = UCODE_ERROR;
+       unsigned int leftover;
+       u8 *fw = (u8 *)data;
+       int crnt_size = 0;
        int offset;
-       const u8 *ucode_ptr = data;
-       void *new_mc = NULL;
-       unsigned int new_rev = uci->cpu_sig.rev;
-       enum ucode_state state = UCODE_ERROR;
 
-       offset = install_equiv_cpu_table(ucode_ptr);
+       offset = install_equiv_cpu_table(data);
        if (offset < 0) {
                pr_err("failed to create equivalent cpu table\n");
-               goto out;
+               return ret;
        }
-       ucode_ptr += offset;
+       fw += offset;
        leftover = size - offset;
 
-       if (*(u32 *)ucode_ptr != UCODE_UCODE_TYPE) {
+       if (*(u32 *)fw != UCODE_UCODE_TYPE) {
                pr_err("invalid type field in container file section header\n");
-               goto free_table;
+               free_equiv_cpu_table();
+               return ret;
        }
 
        while (leftover) {
-               mc_size = get_matching_microcode(cpu, ucode_ptr, leftover,
-                                                new_rev, &current_size);
-               if (mc_size) {
-                       mc_hdr  = patch;
-                       new_mc  = patch;
-                       new_rev = mc_hdr->patch_id;
-                       goto out_ok;
-               }
-
-               ucode_ptr += current_size;
-               leftover  -= current_size;
-       }
+               crnt_size = verify_and_add_patch(cpu, fw, leftover);
+               if (crnt_size < 0)
+                       return ret;
 
-       if (!new_mc) {
-               state = UCODE_NFOUND;
-               goto free_table;
+               fw       += crnt_size;
+               leftover -= crnt_size;
        }
 
-out_ok:
-       uci->mc = new_mc;
-       state = UCODE_OK;
-       pr_debug("CPU%d update ucode (0x%08x -> 0x%08x)\n",
-                cpu, uci->cpu_sig.rev, new_rev);
-
-free_table:
-       free_equiv_cpu_table();
-
-out:
-       return state;
+       return UCODE_OK;
 }
 
 /*
@@ -315,7 +402,7 @@ out:
  *
  * This legacy file is always smaller than 2K in size.
  *
- * Starting at family 15h they are in family specific firmware files:
+ * Beginning with family 15h, they are in family-specific firmware files:
  *
  *    amd-ucode/microcode_amd_fam15h.bin
  *    amd-ucode/microcode_amd_fam16h.bin
@@ -323,12 +410,17 @@ out:
  *
  * These might be larger than 2K.
  */
-static enum ucode_state request_microcode_amd(int cpu, struct device *device)
+static enum ucode_state request_microcode_amd(int cpu, struct device *device,
+                                             bool refresh_fw)
 {
        char fw_name[36] = "amd-ucode/microcode_amd.bin";
-       const struct firmware *fw;
-       enum ucode_state ret = UCODE_NFOUND;
        struct cpuinfo_x86 *c = &cpu_data(cpu);
+       enum ucode_state ret = UCODE_NFOUND;
+       const struct firmware *fw;
+
+       /* reload ucode container only on the boot cpu */
+       if (!refresh_fw || c->cpu_index != boot_cpu_data.cpu_index)
+               return UCODE_OK;
 
        if (c->x86 >= 0x15)
                snprintf(fw_name, sizeof(fw_name), "amd-ucode/microcode_amd_fam%.2xh.bin", c->x86);
@@ -344,12 +436,17 @@ static enum ucode_state request_microcode_amd(int cpu, struct device *device)
                goto fw_release;
        }
 
-       ret = generic_load_microcode(cpu, fw->data, fw->size);
+       /* free old equiv table */
+       free_equiv_cpu_table();
+
+       ret = load_microcode_amd(cpu, fw->data, fw->size);
+       if (ret != UCODE_OK)
+               cleanup();
 
-fw_release:
+ fw_release:
        release_firmware(fw);
 
-out:
+ out:
        return ret;
 }
 
@@ -383,14 +480,10 @@ struct microcode_ops * __init init_amd_microcode(void)
                return NULL;
        }
 
-       patch = (void *)get_zeroed_page(GFP_KERNEL);
-       if (!patch)
-               return NULL;
-
        return &microcode_amd_ops;
 }
 
 void __exit exit_amd_microcode(void)
 {
-       free_page((unsigned long)patch);
+       cleanup();
 }
index 4873e62db6a18468b23736c5f4adfd2de8b3b85b..3a04b224d0c0e71cb8886c7c541500516f1f73be 100644 (file)
@@ -225,6 +225,9 @@ static ssize_t microcode_write(struct file *file, const char __user *buf,
        if (do_microcode_update(buf, len) == 0)
                ret = (ssize_t)len;
 
+       if (ret > 0)
+               perf_check_microcode();
+
        mutex_unlock(&microcode_mutex);
        put_online_cpus();
 
@@ -276,19 +279,18 @@ static struct platform_device     *microcode_pdev;
 static int reload_for_cpu(int cpu)
 {
        struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
+       enum ucode_state ustate;
        int err = 0;
 
-       if (uci->valid) {
-               enum ucode_state ustate;
-
-               ustate = microcode_ops->request_microcode_fw(cpu, &microcode_pdev->dev);
-               if (ustate == UCODE_OK)
-                       apply_microcode_on_target(cpu);
-               else
-                       if (ustate == UCODE_ERROR)
-                               err = -EINVAL;
-       }
+       if (!uci->valid)
+               return err;
 
+       ustate = microcode_ops->request_microcode_fw(cpu, &microcode_pdev->dev, true);
+       if (ustate == UCODE_OK)
+               apply_microcode_on_target(cpu);
+       else
+               if (ustate == UCODE_ERROR)
+                       err = -EINVAL;
        return err;
 }
 
@@ -370,18 +372,15 @@ static void microcode_fini_cpu(int cpu)
 
 static enum ucode_state microcode_resume_cpu(int cpu)
 {
-       struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
-
-       if (!uci->mc)
-               return UCODE_NFOUND;
-
        pr_debug("CPU%d updated upon resume\n", cpu);
-       apply_microcode_on_target(cpu);
+
+       if (apply_microcode_on_target(cpu))
+               return UCODE_ERROR;
 
        return UCODE_OK;
 }
 
-static enum ucode_state microcode_init_cpu(int cpu)
+static enum ucode_state microcode_init_cpu(int cpu, bool refresh_fw)
 {
        enum ucode_state ustate;
 
@@ -392,7 +391,8 @@ static enum ucode_state microcode_init_cpu(int cpu)
        if (system_state != SYSTEM_RUNNING)
                return UCODE_NFOUND;
 
-       ustate = microcode_ops->request_microcode_fw(cpu, &microcode_pdev->dev);
+       ustate = microcode_ops->request_microcode_fw(cpu, &microcode_pdev->dev,
+                                                    refresh_fw);
 
        if (ustate == UCODE_OK) {
                pr_debug("CPU%d updated upon init\n", cpu);
@@ -405,14 +405,11 @@ static enum ucode_state microcode_init_cpu(int cpu)
 static enum ucode_state microcode_update_cpu(int cpu)
 {
        struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
-       enum ucode_state ustate;
 
        if (uci->valid)
-               ustate = microcode_resume_cpu(cpu);
-       else
-               ustate = microcode_init_cpu(cpu);
+               return microcode_resume_cpu(cpu);
 
-       return ustate;
+       return microcode_init_cpu(cpu, false);
 }
 
 static int mc_device_add(struct device *dev, struct subsys_interface *sif)
@@ -428,7 +425,7 @@ static int mc_device_add(struct device *dev, struct subsys_interface *sif)
        if (err)
                return err;
 
-       if (microcode_init_cpu(cpu) == UCODE_ERROR)
+       if (microcode_init_cpu(cpu, true) == UCODE_ERROR)
                return -EINVAL;
 
        return err;
@@ -477,34 +474,41 @@ mc_cpu_callback(struct notifier_block *nb, unsigned long action, void *hcpu)
        struct device *dev;
 
        dev = get_cpu_device(cpu);
-       switch (action) {
+
+       switch (action & ~CPU_TASKS_FROZEN) {
        case CPU_ONLINE:
-       case CPU_ONLINE_FROZEN:
                microcode_update_cpu(cpu);
-       case CPU_DOWN_FAILED:
-       case CPU_DOWN_FAILED_FROZEN:
                pr_debug("CPU%d added\n", cpu);
+               /*
+                * "break" is missing on purpose here because we want to fall
+                * through in order to create the sysfs group.
+                */
+
+       case CPU_DOWN_FAILED:
                if (sysfs_create_group(&dev->kobj, &mc_attr_group))
                        pr_err("Failed to create group for CPU%d\n", cpu);
                break;
+
        case CPU_DOWN_PREPARE:
-       case CPU_DOWN_PREPARE_FROZEN:
                /* Suspend is in progress, only remove the interface */
                sysfs_remove_group(&dev->kobj, &mc_attr_group);
                pr_debug("CPU%d removed\n", cpu);
                break;
 
        /*
+        * case CPU_DEAD:
+        *
         * When a CPU goes offline, don't free up or invalidate the copy of
         * the microcode in kernel memory, so that we can reuse it when the
         * CPU comes back online without unnecessarily requesting the userspace
         * for it again.
         */
-       case CPU_UP_CANCELED_FROZEN:
-               /* The CPU refused to come up during a system resume */
-               microcode_fini_cpu(cpu);
-               break;
        }
+
+       /* The CPU refused to come up during a system resume */
+       if (action == CPU_UP_CANCELED_FROZEN)
+               microcode_fini_cpu(cpu);
+
        return NOTIFY_OK;
 }
 
index 0327e2b3c40869a0a767d8885572904a7f1cb2d4..3544aed3933816b9d9acd3d0784380696c4dce50 100644 (file)
@@ -405,7 +405,8 @@ static int get_ucode_fw(void *to, const void *from, size_t n)
        return 0;
 }
 
-static enum ucode_state request_microcode_fw(int cpu, struct device *device)
+static enum ucode_state request_microcode_fw(int cpu, struct device *device,
+                                            bool refresh_fw)
 {
        char name[30];
        struct cpuinfo_x86 *c = &cpu_data(cpu);
diff --git a/arch/x86/kernel/perf_regs.c b/arch/x86/kernel/perf_regs.c
new file mode 100644 (file)
index 0000000..c5a3e5c
--- /dev/null
@@ -0,0 +1,105 @@
+#include <linux/errno.h>
+#include <linux/kernel.h>
+#include <linux/sched.h>
+#include <linux/perf_event.h>
+#include <linux/bug.h>
+#include <linux/stddef.h>
+#include <asm/perf_regs.h>
+#include <asm/ptrace.h>
+
+#ifdef CONFIG_X86_32
+#define PERF_REG_X86_MAX PERF_REG_X86_32_MAX
+#else
+#define PERF_REG_X86_MAX PERF_REG_X86_64_MAX
+#endif
+
+#define PT_REGS_OFFSET(id, r) [id] = offsetof(struct pt_regs, r)
+
+static unsigned int pt_regs_offset[PERF_REG_X86_MAX] = {
+       PT_REGS_OFFSET(PERF_REG_X86_AX, ax),
+       PT_REGS_OFFSET(PERF_REG_X86_BX, bx),
+       PT_REGS_OFFSET(PERF_REG_X86_CX, cx),
+       PT_REGS_OFFSET(PERF_REG_X86_DX, dx),
+       PT_REGS_OFFSET(PERF_REG_X86_SI, si),
+       PT_REGS_OFFSET(PERF_REG_X86_DI, di),
+       PT_REGS_OFFSET(PERF_REG_X86_BP, bp),
+       PT_REGS_OFFSET(PERF_REG_X86_SP, sp),
+       PT_REGS_OFFSET(PERF_REG_X86_IP, ip),
+       PT_REGS_OFFSET(PERF_REG_X86_FLAGS, flags),
+       PT_REGS_OFFSET(PERF_REG_X86_CS, cs),
+       PT_REGS_OFFSET(PERF_REG_X86_SS, ss),
+#ifdef CONFIG_X86_32
+       PT_REGS_OFFSET(PERF_REG_X86_DS, ds),
+       PT_REGS_OFFSET(PERF_REG_X86_ES, es),
+       PT_REGS_OFFSET(PERF_REG_X86_FS, fs),
+       PT_REGS_OFFSET(PERF_REG_X86_GS, gs),
+#else
+       /*
+        * The pt_regs struct does not store
+        * ds, es, fs, gs in 64-bit mode.
+        */
+       (unsigned int) -1,
+       (unsigned int) -1,
+       (unsigned int) -1,
+       (unsigned int) -1,
+#endif
+#ifdef CONFIG_X86_64
+       PT_REGS_OFFSET(PERF_REG_X86_R8, r8),
+       PT_REGS_OFFSET(PERF_REG_X86_R9, r9),
+       PT_REGS_OFFSET(PERF_REG_X86_R10, r10),
+       PT_REGS_OFFSET(PERF_REG_X86_R11, r11),
+       PT_REGS_OFFSET(PERF_REG_X86_R12, r12),
+       PT_REGS_OFFSET(PERF_REG_X86_R13, r13),
+       PT_REGS_OFFSET(PERF_REG_X86_R14, r14),
+       PT_REGS_OFFSET(PERF_REG_X86_R15, r15),
+#endif
+};
+
+u64 perf_reg_value(struct pt_regs *regs, int idx)
+{
+       if (WARN_ON_ONCE(idx >= ARRAY_SIZE(pt_regs_offset)))
+               return 0;
+
+       return regs_get_register(regs, pt_regs_offset[idx]);
+}
+
+#define REG_RESERVED (~((1ULL << PERF_REG_X86_MAX) - 1ULL))
+
+#ifdef CONFIG_X86_32
+int perf_reg_validate(u64 mask)
+{
+       if (!mask || mask & REG_RESERVED)
+               return -EINVAL;
+
+       return 0;
+}
+
+u64 perf_reg_abi(struct task_struct *task)
+{
+       return PERF_SAMPLE_REGS_ABI_32;
+}
+#else /* CONFIG_X86_64 */
+#define REG_NOSUPPORT ((1ULL << PERF_REG_X86_DS) | \
+                      (1ULL << PERF_REG_X86_ES) | \
+                      (1ULL << PERF_REG_X86_FS) | \
+                      (1ULL << PERF_REG_X86_GS))
+
+int perf_reg_validate(u64 mask)
+{
+       if (!mask || mask & REG_RESERVED)
+               return -EINVAL;
+
+       if (mask & REG_NOSUPPORT)
+               return -EINVAL;
+
+       return 0;
+}
+
+u64 perf_reg_abi(struct task_struct *task)
+{
+       if (test_tsk_thread_flag(task, TIF_IA32))
+               return PERF_SAMPLE_REGS_ABI_32;
+       else
+               return PERF_SAMPLE_REGS_ABI_64;
+}
+#endif /* CONFIG_X86_32 */
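
perf_reg_validate() and perf_reg_value() above define which registers a sample may request and how each index maps onto pt_regs. On the consumer side, the sampled values appear in the record as one u64 per set mask bit, in ascending bit order; a hedged user-space sketch of that decoding (register numbering mirrors the PERF_REG_X86_* order above; values assumed already read from the ring buffer):

#include <stdio.h>
#include <stdint.h>

/* First entries of the PERF_REG_X86_* numbering used above. */
static const char *reg_names[] = {
        "ax", "bx", "cx", "dx", "si", "di", "bp", "sp", "ip",
};

/* Registers are emitted in ascending bit order of the sample mask. */
static void decode_regs(uint64_t mask, const uint64_t *vals)
{
        int bit, i = 0;

        for (bit = 0; bit < 64; bit++) {
                if (!(mask & (1ULL << bit)))
                        continue;
                if (bit < (int)(sizeof(reg_names) / sizeof(*reg_names)))
                        printf("%s = 0x%llx\n", reg_names[bit],
                               (unsigned long long)vals[i]);
                i++;
        }
}

int main(void)
{
        /* Assume a sample requested SP and IP (bits 7 and 8). */
        uint64_t mask = (1ULL << 7) | (1ULL << 8);
        uint64_t vals[] = { 0x7ffdf000, 0x400123 };  /* sp first, then ip */

        decode_regs(mask, vals);
        return 0;
}
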
index 7c5a8c314c0268a2ba0b101802c057ad428d8b42..c80a33bc528b8e33ec6a1b9b4e6ca02964c21d7e 100644 (file)
@@ -665,7 +665,8 @@ static int __cpuinit do_boot_cpu(int apicid, int cpu, struct task_struct *idle)
        unsigned long boot_error = 0;
        int timeout;
 
-       alternatives_smp_switch(1);
+       /* Just in case we booted with a single CPU. */
+       alternatives_enable_smp();
 
        idle->thread.sp = (unsigned long) (((struct pt_regs *)
                          (THREAD_SIZE +  task_stack_page(idle))) - 1);
@@ -1053,20 +1054,6 @@ out:
        preempt_enable();
 }
 
-void arch_disable_nonboot_cpus_begin(void)
-{
-       /*
-        * Avoid the smp alternatives switch during the disable_nonboot_cpus().
-        * In the suspend path, we will be back in the SMP mode shortly anyways.
-        */
-       skip_smp_alternatives = true;
-}
-
-void arch_disable_nonboot_cpus_end(void)
-{
-       skip_smp_alternatives = false;
-}
-
 void arch_enable_nonboot_cpus_begin(void)
 {
        set_mtrr_aps_delayed_init();
@@ -1256,9 +1243,6 @@ void native_cpu_die(unsigned int cpu)
                if (per_cpu(cpu_state, cpu) == CPU_DEAD) {
                        if (system_state == SYSTEM_RUNNING)
                                pr_info("CPU %u is now offline\n", cpu);
-
-                       if (1 == num_online_cpus())
-                               alternatives_smp_switch(0);
                        return;
                }
                msleep(100);
index 6020f6f5927cbc1035b7f86b1f19f1422ded0acf..1330dd10295031544d08788320524fb917b889bc 100644 (file)
 #include <asm/ftrace.h>
 
 #ifdef CONFIG_FUNCTION_TRACER
-/* mcount is defined in assembly */
+/* mcount and __fentry__ are defined in assembly */
+#ifdef CC_USING_FENTRY
+EXPORT_SYMBOL(__fentry__);
+#else
 EXPORT_SYMBOL(mcount);
 #endif
+#endif
 
 EXPORT_SYMBOL(__get_user_1);
 EXPORT_SYMBOL(__get_user_2);
index f58dca7a6e52cef85503753f4b500d4d8f7ec369..353c50f18702cfc2ff93118ce3a18b50c6dfdd5c 100644 (file)
@@ -377,7 +377,8 @@ static int __cpuinit xen_cpu_up(unsigned int cpu, struct task_struct *idle)
                return rc;
 
        if (num_online_cpus() == 1)
-               alternatives_smp_switch(1);
+               /* Just in case we booted with a single CPU. */
+               alternatives_enable_smp();
 
        rc = xen_smp_intr_init(cpu);
        if (rc)
@@ -424,9 +425,6 @@ static void xen_cpu_die(unsigned int cpu)
        unbind_from_irqhandler(per_cpu(xen_irq_work, cpu), NULL);
        xen_uninit_lock_cpu(cpu);
        xen_teardown_timer(cpu);
-
-       if (num_online_cpus() == 1)
-               alternatives_smp_switch(0);
 }
 
 static void __cpuinit xen_play_dead(void) /* used only with HOTPLUG_CPU */
index 53589000fd0726d95a8d817505f131e165909f56..8615d7cf7e01592a415acbcd309f8e58e4c6ed16 100644 (file)
@@ -42,6 +42,7 @@
  */
 
 #include <linux/slab.h>
+#include <linux/smpboot.h>
 
 #include "ehca_classes.h"
 #include "ehca_irq.h"
@@ -652,7 +653,7 @@ void ehca_tasklet_eq(unsigned long data)
        ehca_process_eq((struct ehca_shca*)data, 1);
 }
 
-static inline int find_next_online_cpu(struct ehca_comp_pool *pool)
+static int find_next_online_cpu(struct ehca_comp_pool *pool)
 {
        int cpu;
        unsigned long flags;
@@ -662,17 +663,20 @@ static inline int find_next_online_cpu(struct ehca_comp_pool *pool)
                ehca_dmp(cpu_online_mask, cpumask_size(), "");
 
        spin_lock_irqsave(&pool->last_cpu_lock, flags);
-       cpu = cpumask_next(pool->last_cpu, cpu_online_mask);
-       if (cpu >= nr_cpu_ids)
-               cpu = cpumask_first(cpu_online_mask);
-       pool->last_cpu = cpu;
+       do {
+               cpu = cpumask_next(pool->last_cpu, cpu_online_mask);
+               if (cpu >= nr_cpu_ids)
+                       cpu = cpumask_first(cpu_online_mask);
+               pool->last_cpu = cpu;
+       } while (!per_cpu_ptr(pool->cpu_comp_tasks, cpu)->active);
        spin_unlock_irqrestore(&pool->last_cpu_lock, flags);
 
        return cpu;
 }
 
 static void __queue_comp_task(struct ehca_cq *__cq,
-                             struct ehca_cpu_comp_task *cct)
+                             struct ehca_cpu_comp_task *cct,
+                             struct task_struct *thread)
 {
        unsigned long flags;
 
@@ -683,7 +687,7 @@ static void __queue_comp_task(struct ehca_cq *__cq,
                __cq->nr_callbacks++;
                list_add_tail(&__cq->entry, &cct->cq_list);
                cct->cq_jobs++;
-               wake_up(&cct->wait_queue);
+               wake_up_process(thread);
        } else
                __cq->nr_callbacks++;
 
@@ -695,6 +699,7 @@ static void queue_comp_task(struct ehca_cq *__cq)
 {
        int cpu_id;
        struct ehca_cpu_comp_task *cct;
+       struct task_struct *thread;
        int cq_jobs;
        unsigned long flags;
 
@@ -702,7 +707,8 @@ static void queue_comp_task(struct ehca_cq *__cq)
        BUG_ON(!cpu_online(cpu_id));
 
        cct = per_cpu_ptr(pool->cpu_comp_tasks, cpu_id);
-       BUG_ON(!cct);
+       thread = *per_cpu_ptr(pool->cpu_comp_threads, cpu_id);
+       BUG_ON(!cct || !thread);
 
        spin_lock_irqsave(&cct->task_lock, flags);
        cq_jobs = cct->cq_jobs;
@@ -710,28 +716,25 @@ static void queue_comp_task(struct ehca_cq *__cq)
        if (cq_jobs > 0) {
                cpu_id = find_next_online_cpu(pool);
                cct = per_cpu_ptr(pool->cpu_comp_tasks, cpu_id);
-               BUG_ON(!cct);
+               thread = *per_cpu_ptr(pool->cpu_comp_threads, cpu_id);
+               BUG_ON(!cct || !thread);
        }
-
-       __queue_comp_task(__cq, cct);
+       __queue_comp_task(__cq, cct, thread);
 }
 
 static void run_comp_task(struct ehca_cpu_comp_task *cct)
 {
        struct ehca_cq *cq;
-       unsigned long flags;
-
-       spin_lock_irqsave(&cct->task_lock, flags);
 
        while (!list_empty(&cct->cq_list)) {
                cq = list_entry(cct->cq_list.next, struct ehca_cq, entry);
-               spin_unlock_irqrestore(&cct->task_lock, flags);
+               spin_unlock_irq(&cct->task_lock);
 
                comp_event_callback(cq);
                if (atomic_dec_and_test(&cq->nr_events))
                        wake_up(&cq->wait_completion);
 
-               spin_lock_irqsave(&cct->task_lock, flags);
+               spin_lock_irq(&cct->task_lock);
                spin_lock(&cq->task_lock);
                cq->nr_callbacks--;
                if (!cq->nr_callbacks) {
@@ -740,159 +743,76 @@ static void run_comp_task(struct ehca_cpu_comp_task *cct)
                }
                spin_unlock(&cq->task_lock);
        }
-
-       spin_unlock_irqrestore(&cct->task_lock, flags);
 }
 
-static int comp_task(void *__cct)
+static void comp_task_park(unsigned int cpu)
 {
-       struct ehca_cpu_comp_task *cct = __cct;
-       int cql_empty;
-       DECLARE_WAITQUEUE(wait, current);
-
-       set_current_state(TASK_INTERRUPTIBLE);
-       while (!kthread_should_stop()) {
-               add_wait_queue(&cct->wait_queue, &wait);
-
-               spin_lock_irq(&cct->task_lock);
-               cql_empty = list_empty(&cct->cq_list);
-               spin_unlock_irq(&cct->task_lock);
-               if (cql_empty)
-                       schedule();
-               else
-                       __set_current_state(TASK_RUNNING);
-
-               remove_wait_queue(&cct->wait_queue, &wait);
+       struct ehca_cpu_comp_task *cct = per_cpu_ptr(pool->cpu_comp_tasks, cpu);
+       struct ehca_cpu_comp_task *target;
+       struct task_struct *thread;
+       struct ehca_cq *cq, *tmp;
+       LIST_HEAD(list);
 
-               spin_lock_irq(&cct->task_lock);
-               cql_empty = list_empty(&cct->cq_list);
-               spin_unlock_irq(&cct->task_lock);
-               if (!cql_empty)
-                       run_comp_task(__cct);
+       spin_lock_irq(&cct->task_lock);
+       cct->cq_jobs = 0;
+       cct->active = 0;
+       list_splice_init(&cct->cq_list, &list);
+       spin_unlock_irq(&cct->task_lock);
 
-               set_current_state(TASK_INTERRUPTIBLE);
+       cpu = find_next_online_cpu(pool);
+       target = per_cpu_ptr(pool->cpu_comp_tasks, cpu);
+       thread = *per_cpu_ptr(pool->cpu_comp_threads, cpu);
+       spin_lock_irq(&target->task_lock);
+       list_for_each_entry_safe(cq, tmp, &list, entry) {
+               list_del(&cq->entry);
+               __queue_comp_task(cq, target, thread);
        }
-       __set_current_state(TASK_RUNNING);
-
-       return 0;
-}
-
-static struct task_struct *create_comp_task(struct ehca_comp_pool *pool,
-                                           int cpu)
-{
-       struct ehca_cpu_comp_task *cct;
-
-       cct = per_cpu_ptr(pool->cpu_comp_tasks, cpu);
-       spin_lock_init(&cct->task_lock);
-       INIT_LIST_HEAD(&cct->cq_list);
-       init_waitqueue_head(&cct->wait_queue);
-       cct->task = kthread_create_on_node(comp_task, cct, cpu_to_node(cpu),
-                                          "ehca_comp/%d", cpu);
-
-       return cct->task;
+       spin_unlock_irq(&target->task_lock);
 }
 
-static void destroy_comp_task(struct ehca_comp_pool *pool,
-                             int cpu)
+static void comp_task_stop(unsigned int cpu, bool online)
 {
-       struct ehca_cpu_comp_task *cct;
-       struct task_struct *task;
-       unsigned long flags_cct;
-
-       cct = per_cpu_ptr(pool->cpu_comp_tasks, cpu);
-
-       spin_lock_irqsave(&cct->task_lock, flags_cct);
+       struct ehca_cpu_comp_task *cct = per_cpu_ptr(pool->cpu_comp_tasks, cpu);
 
-       task = cct->task;
-       cct->task = NULL;
+       spin_lock_irq(&cct->task_lock);
        cct->cq_jobs = 0;
-
-       spin_unlock_irqrestore(&cct->task_lock, flags_cct);
-
-       if (task)
-               kthread_stop(task);
+       cct->active = 0;
+       WARN_ON(!list_empty(&cct->cq_list));
+       spin_unlock_irq(&cct->task_lock);
 }
 
-static void __cpuinit take_over_work(struct ehca_comp_pool *pool, int cpu)
+static int comp_task_should_run(unsigned int cpu)
 {
        struct ehca_cpu_comp_task *cct = per_cpu_ptr(pool->cpu_comp_tasks, cpu);
-       LIST_HEAD(list);
-       struct ehca_cq *cq;
-       unsigned long flags_cct;
-
-       spin_lock_irqsave(&cct->task_lock, flags_cct);
-
-       list_splice_init(&cct->cq_list, &list);
-
-       while (!list_empty(&list)) {
-               cq = list_entry(cct->cq_list.next, struct ehca_cq, entry);
-
-               list_del(&cq->entry);
-               __queue_comp_task(cq, this_cpu_ptr(pool->cpu_comp_tasks));
-       }
-
-       spin_unlock_irqrestore(&cct->task_lock, flags_cct);
 
+       return cct->cq_jobs;
 }
 
-static int __cpuinit comp_pool_callback(struct notifier_block *nfb,
-                                       unsigned long action,
-                                       void *hcpu)
+static void comp_task(unsigned int cpu)
 {
-       unsigned int cpu = (unsigned long)hcpu;
-       struct ehca_cpu_comp_task *cct;
+       struct ehca_cpu_comp_task *cct = this_cpu_ptr(pool->cpu_comp_tasks);
+       int cql_empty;
 
-       switch (action) {
-       case CPU_UP_PREPARE:
-       case CPU_UP_PREPARE_FROZEN:
-               ehca_gen_dbg("CPU: %x (CPU_PREPARE)", cpu);
-               if (!create_comp_task(pool, cpu)) {
-                       ehca_gen_err("Can't create comp_task for cpu: %x", cpu);
-                       return notifier_from_errno(-ENOMEM);
-               }
-               break;
-       case CPU_UP_CANCELED:
-       case CPU_UP_CANCELED_FROZEN:
-               ehca_gen_dbg("CPU: %x (CPU_CANCELED)", cpu);
-               cct = per_cpu_ptr(pool->cpu_comp_tasks, cpu);
-               kthread_bind(cct->task, cpumask_any(cpu_online_mask));
-               destroy_comp_task(pool, cpu);
-               break;
-       case CPU_ONLINE:
-       case CPU_ONLINE_FROZEN:
-               ehca_gen_dbg("CPU: %x (CPU_ONLINE)", cpu);
-               cct = per_cpu_ptr(pool->cpu_comp_tasks, cpu);
-               kthread_bind(cct->task, cpu);
-               wake_up_process(cct->task);
-               break;
-       case CPU_DOWN_PREPARE:
-       case CPU_DOWN_PREPARE_FROZEN:
-               ehca_gen_dbg("CPU: %x (CPU_DOWN_PREPARE)", cpu);
-               break;
-       case CPU_DOWN_FAILED:
-       case CPU_DOWN_FAILED_FROZEN:
-               ehca_gen_dbg("CPU: %x (CPU_DOWN_FAILED)", cpu);
-               break;
-       case CPU_DEAD:
-       case CPU_DEAD_FROZEN:
-               ehca_gen_dbg("CPU: %x (CPU_DEAD)", cpu);
-               destroy_comp_task(pool, cpu);
-               take_over_work(pool, cpu);
-               break;
+       spin_lock_irq(&cct->task_lock);
+       cql_empty = list_empty(&cct->cq_list);
+       if (!cql_empty) {
+               __set_current_state(TASK_RUNNING);
+               run_comp_task(cct);
        }
-
-       return NOTIFY_OK;
+       spin_unlock_irq(&cct->task_lock);
 }
 
-static struct notifier_block comp_pool_callback_nb __cpuinitdata = {
-       .notifier_call  = comp_pool_callback,
-       .priority       = 0,
+static struct smp_hotplug_thread comp_pool_threads = {
+       .thread_should_run      = comp_task_should_run,
+       .thread_fn              = comp_task,
+       .thread_comm            = "ehca_comp/%u",
+       .cleanup                = comp_task_stop,
+       .park                   = comp_task_park,
 };
 
 int ehca_create_comp_pool(void)
 {
-       int cpu;
-       struct task_struct *task;
+       int cpu, ret = -ENOMEM;
 
        if (!ehca_scaling_code)
                return 0;
@@ -905,38 +825,46 @@ int ehca_create_comp_pool(void)
        pool->last_cpu = cpumask_any(cpu_online_mask);
 
        pool->cpu_comp_tasks = alloc_percpu(struct ehca_cpu_comp_task);
-       if (pool->cpu_comp_tasks == NULL) {
-               kfree(pool);
-               return -EINVAL;
-       }
+       if (!pool->cpu_comp_tasks)
+               goto out_pool;
 
-       for_each_online_cpu(cpu) {
-               task = create_comp_task(pool, cpu);
-               if (task) {
-                       kthread_bind(task, cpu);
-                       wake_up_process(task);
-               }
+       pool->cpu_comp_threads = alloc_percpu(struct task_struct *);
+       if (!pool->cpu_comp_threads)
+               goto out_tasks;
+
+       for_each_present_cpu(cpu) {
+               struct ehca_cpu_comp_task *cct;
+
+               cct = per_cpu_ptr(pool->cpu_comp_tasks, cpu);
+               spin_lock_init(&cct->task_lock);
+               INIT_LIST_HEAD(&cct->cq_list);
        }
 
-       register_hotcpu_notifier(&comp_pool_callback_nb);
+       comp_pool_threads.store = pool->cpu_comp_threads;
+       ret = smpboot_register_percpu_thread(&comp_pool_threads);
+       if (ret)
+               goto out_threads;
 
-       printk(KERN_INFO "eHCA scaling code enabled\n");
+       pr_info("eHCA scaling code enabled\n");
+       return ret;
 
-       return 0;
+out_threads:
+       free_percpu(pool->cpu_comp_threads);
+out_tasks:
+       free_percpu(pool->cpu_comp_tasks);
+out_pool:
+       kfree(pool);
+       return ret;
 }
 
 void ehca_destroy_comp_pool(void)
 {
-       int i;
-
        if (!ehca_scaling_code)
                return;
 
-       unregister_hotcpu_notifier(&comp_pool_callback_nb);
-
-       for_each_online_cpu(i)
-               destroy_comp_task(pool, i);
+       smpboot_unregister_percpu_thread(&comp_pool_threads);
 
+       free_percpu(pool->cpu_comp_threads);
        free_percpu(pool->cpu_comp_tasks);
        kfree(pool);
 }
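
The park callback above drains pending completion work off a CPU going offline and requeues it on one that is still active. As a minimal, self-contained sketch of that splice-and-requeue pattern (all names here are illustrative, not part of the driver):

    #include <linux/list.h>
    #include <linux/spinlock.h>

    struct work_item { struct list_head entry; };

    /* Take everything off 'src' in one shot under its lock, then
     * requeue each item on 'dst' under the destination lock. */
    static void migrate_work(spinlock_t *src_lock, struct list_head *src,
                             spinlock_t *dst_lock, struct list_head *dst)
    {
            struct work_item *w, *tmp;
            LIST_HEAD(list);

            spin_lock_irq(src_lock);
            list_splice_init(src, &list);
            spin_unlock_irq(src_lock);

            spin_lock_irq(dst_lock);
            list_for_each_entry_safe(w, tmp, &list, entry)
                    list_move_tail(&w->entry, dst);
            spin_unlock_irq(dst_lock);
    }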
index 3346cb06cea62a4d4a4036364fba3cf174756dda..5370199f08c7bc99211ab31a161887f4372607be 100644 (file)
@@ -58,15 +58,15 @@ void ehca_tasklet_eq(unsigned long data);
 void ehca_process_eq(struct ehca_shca *shca, int is_irq);
 
 struct ehca_cpu_comp_task {
-       wait_queue_head_t wait_queue;
        struct list_head cq_list;
-       struct task_struct *task;
        spinlock_t task_lock;
        int cq_jobs;
+       int active;
 };
 
 struct ehca_comp_pool {
-       struct ehca_cpu_comp_task *cpu_comp_tasks;
+       struct ehca_cpu_comp_task __percpu *cpu_comp_tasks;
+       struct task_struct * __percpu *cpu_comp_threads;
        int last_cpu;
        spinlock_t last_cpu_lock;
 };
index 55e6d63d46d0e6593d3753dcd6c572a64cca728d..a52f2f4fe0301d6fcb38c886fb912a1cdc65a79b 100644 (file)
@@ -10,6 +10,7 @@
 #include <linux/kallsyms.h>
 #include <linux/linkage.h>
 #include <linux/bitops.h>
+#include <linux/ptrace.h>
 #include <linux/ktime.h>
 #include <linux/sched.h>
 #include <linux/types.h>
 
 #include <asm/ftrace.h>
 
+/*
+ * If the arch supports passing the variable contents of
+ * function_trace_op as the third parameter back from the
+ * mcount call, then the arch should define this as 1.
+ */
+#ifndef ARCH_SUPPORTS_FTRACE_OPS
+#define ARCH_SUPPORTS_FTRACE_OPS 0
+#endif
+
+/*
+ * If the arch's mcount caller does not support all of ftrace's
+ * features, then it must call an indirect function that does,
+ * or at least does enough to prevent any unwelcome side effects.
+ */
+#if !defined(CONFIG_HAVE_FUNCTION_TRACE_MCOUNT_TEST) || \
+       !ARCH_SUPPORTS_FTRACE_OPS
+# define FTRACE_FORCE_LIST_FUNC 1
+#else
+# define FTRACE_FORCE_LIST_FUNC 0
+#endif
+
+
 struct module;
 struct ftrace_hash;
 
@@ -29,7 +52,10 @@ ftrace_enable_sysctl(struct ctl_table *table, int write,
                     void __user *buffer, size_t *lenp,
                     loff_t *ppos);
 
-typedef void (*ftrace_func_t)(unsigned long ip, unsigned long parent_ip);
+struct ftrace_ops;
+
+typedef void (*ftrace_func_t)(unsigned long ip, unsigned long parent_ip,
+                             struct ftrace_ops *op, struct pt_regs *regs);
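
With the widened ftrace_func_t, every callback now receives the owning ftrace_ops and, when available, the saved registers. A hedged sketch of a conforming callback in a module (the function name is illustrative):

    #include <linux/ftrace.h>
    #include <linux/printk.h>

    /* regs may be NULL unless register saving was requested
     * and the arch supports it (see the SAVE_REGS flags below). */
    static void my_trace_func(unsigned long ip, unsigned long parent_ip,
                              struct ftrace_ops *op, struct pt_regs *regs)
    {
            if (regs)
                    pr_info("traced %ps, saved ip: %lx\n",
                            (void *)ip, instruction_pointer(regs));
    }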
 
 /*
  * FTRACE_OPS_FL_* bits denote the state of ftrace_ops struct and are
@@ -45,12 +71,33 @@ typedef void (*ftrace_func_t)(unsigned long ip, unsigned long parent_ip);
  *           could be controlled by the following calls:
  *             ftrace_function_local_enable
  *             ftrace_function_local_disable
+ * SAVE_REGS - The ftrace_ops wants regs saved on each function call
+ *            and passed to the callback. If this flag is set, but the
+ *            architecture does not support passing regs
+ *            (ARCH_SUPPORTS_FTRACE_SAVE_REGS is not defined), then the
+ *            ftrace_ops will fail to register, unless the next flag
+ *            is set.
+ * SAVE_REGS_IF_SUPPORTED - This is the same as SAVE_REGS, but if the
+ *            handler can handle an arch that does not save regs
+ *            (the handler tests if regs == NULL), then it can set
+ *            this flag instead. It will not fail registering the ftrace_ops,
+ *            but the regs field will be NULL if the arch does not support
+ *            passing regs to the handler.
+ *            Note, if this flag is set, the SAVE_REGS flag will automatically
+ *            get set upon registering the ftrace_ops, if the arch supports it.
+ * RECURSION_SAFE - The ftrace_ops can set this to tell the ftrace infrastructure
+ *            that the callback has its own recursion protection. If it does
+ *            not set this, then the ftrace infrastructure will add recursion
+ *            protection for the caller.
  */
 enum {
-       FTRACE_OPS_FL_ENABLED           = 1 << 0,
-       FTRACE_OPS_FL_GLOBAL            = 1 << 1,
-       FTRACE_OPS_FL_DYNAMIC           = 1 << 2,
-       FTRACE_OPS_FL_CONTROL           = 1 << 3,
+       FTRACE_OPS_FL_ENABLED                   = 1 << 0,
+       FTRACE_OPS_FL_GLOBAL                    = 1 << 1,
+       FTRACE_OPS_FL_DYNAMIC                   = 1 << 2,
+       FTRACE_OPS_FL_CONTROL                   = 1 << 3,
+       FTRACE_OPS_FL_SAVE_REGS                 = 1 << 4,
+       FTRACE_OPS_FL_SAVE_REGS_IF_SUPPORTED    = 1 << 5,
+       FTRACE_OPS_FL_RECURSION_SAFE            = 1 << 6,
 };
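
Putting the new flags together, a minimal registration sketch reusing the illustrative callback above; SAVE_REGS_IF_SUPPORTED degrades gracefully on arches without ARCH_SUPPORTS_FTRACE_SAVE_REGS:

    static struct ftrace_ops my_ops = {
            .func  = my_trace_func,
            /* the callback handles regs == NULL and its own recursion */
            .flags = FTRACE_OPS_FL_SAVE_REGS_IF_SUPPORTED |
                     FTRACE_OPS_FL_RECURSION_SAFE,
    };

    static int __init my_tracer_init(void)
    {
            return register_ftrace_function(&my_ops);
    }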
 
 struct ftrace_ops {
@@ -163,7 +210,8 @@ static inline int ftrace_function_local_disabled(struct ftrace_ops *ops)
        return *this_cpu_ptr(ops->disabled);
 }
 
-extern void ftrace_stub(unsigned long a0, unsigned long a1);
+extern void ftrace_stub(unsigned long a0, unsigned long a1,
+                       struct ftrace_ops *op, struct pt_regs *regs);
 
 #else /* !CONFIG_FUNCTION_TRACER */
 /*
@@ -172,6 +220,10 @@ extern void ftrace_stub(unsigned long a0, unsigned long a1);
  */
 #define register_ftrace_function(ops) ({ 0; })
 #define unregister_ftrace_function(ops) ({ 0; })
+static inline int ftrace_nr_registered_ops(void)
+{
+       return 0;
+}
 static inline void clear_ftrace_function(void) { }
 static inline void ftrace_kill(void) { }
 static inline void ftrace_stop(void) { }
@@ -227,12 +279,33 @@ extern void unregister_ftrace_function_probe_all(char *glob);
 
 extern int ftrace_text_reserved(void *start, void *end);
 
+extern int ftrace_nr_registered_ops(void);
+
+/*
+ * The dyn_ftrace record's flags field is split into two parts.
+ * The first part, '0-FTRACE_REF_MAX', is a counter of
+ * the number of callbacks that have registered the function that
+ * the dyn_ftrace descriptor represents.
+ *
+ * The second part is a mask:
+ *  ENABLED - the function is being traced
+ *  REGS    - the record wants the function to save regs
+ *  REGS_EN - the function is set up to save regs.
+ *
+ * When a new ftrace_ops is registered and wants a function to save
+ * pt_regs, the rec->flags REGS bit is set. When the function has been
+ * set up to save regs, the REGS_EN flag is set. Once a function
+ * starts saving regs it will do so until all ftrace_ops are removed
+ * from tracing that function.
+ */
 enum {
-       FTRACE_FL_ENABLED       = (1 << 30),
+       FTRACE_FL_ENABLED       = (1UL << 29),
+       FTRACE_FL_REGS          = (1UL << 30),
+       FTRACE_FL_REGS_EN       = (1UL << 31)
 };
 
-#define FTRACE_FL_MASK         (0x3UL << 30)
-#define FTRACE_REF_MAX         ((1 << 30) - 1)
+#define FTRACE_FL_MASK         (0x7UL << 29)
+#define FTRACE_REF_MAX         ((1UL << 29) - 1)
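
A small sketch of how the split flags field decodes under the new layout (helper names are hypothetical, not part of the patch):

    /* Low 29 bits: number of ops tracing this site; top bits: state. */
    static inline unsigned long rec_ref_count(struct dyn_ftrace *rec)
    {
            return rec->flags & FTRACE_REF_MAX;
    }

    static inline bool rec_wants_regs(struct dyn_ftrace *rec)
    {
            return rec->flags & FTRACE_FL_REGS;
    }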
 
 struct dyn_ftrace {
        union {
@@ -244,6 +317,8 @@ struct dyn_ftrace {
 };
 
 int ftrace_force_update(void);
+int ftrace_set_filter_ip(struct ftrace_ops *ops, unsigned long ip,
+                        int remove, int reset);
 int ftrace_set_filter(struct ftrace_ops *ops, unsigned char *buf,
                       int len, int reset);
 int ftrace_set_notrace(struct ftrace_ops *ops, unsigned char *buf,
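
The new ftrace_set_filter_ip() narrows an ops to a single call site by address rather than by glob. A hedged usage sketch; the symbol choice and the lookup via kallsyms_lookup_name() are illustrative:

    #include <linux/kallsyms.h>

    static int filter_one_site(void)
    {
            unsigned long ip = kallsyms_lookup_name("do_fork");

            if (!ip)
                    return -ENOENT;
            /* remove=0, reset=0: add this ip to my_ops' filter */
            return ftrace_set_filter_ip(&my_ops, ip, 0, 0);
    }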
@@ -263,9 +338,23 @@ enum {
        FTRACE_STOP_FUNC_RET            = (1 << 4),
 };
 
+/*
+ * The FTRACE_UPDATE_* enum is used to pass information back
+ * from the ftrace_update_record() and ftrace_test_record()
+ * functions. These are called by the code update routines
+ * to find out what is to be done for a given function.
+ *
+ *  IGNORE           - The function is already what we want it to be
+ *  MAKE_CALL        - Start tracing the function
+ *  MODIFY_CALL      - Stop saving regs for the function
+ *  MODIFY_CALL_REGS - Start saving regs for the function
+ *  MAKE_NOP         - Stop tracing the function
+ */
 enum {
        FTRACE_UPDATE_IGNORE,
        FTRACE_UPDATE_MAKE_CALL,
+       FTRACE_UPDATE_MODIFY_CALL,
+       FTRACE_UPDATE_MODIFY_CALL_REGS,
        FTRACE_UPDATE_MAKE_NOP,
 };
 
@@ -317,7 +406,9 @@ extern int ftrace_dyn_arch_init(void *data);
 extern void ftrace_replace_code(int enable);
 extern int ftrace_update_ftrace_func(ftrace_func_t func);
 extern void ftrace_caller(void);
+extern void ftrace_regs_caller(void);
 extern void ftrace_call(void);
+extern void ftrace_regs_call(void);
 extern void mcount_call(void);
 
 void ftrace_modify_all_code(int command);
@@ -325,6 +416,15 @@ void ftrace_modify_all_code(int command);
 #ifndef FTRACE_ADDR
 #define FTRACE_ADDR ((unsigned long)ftrace_caller)
 #endif
+
+#ifndef FTRACE_REGS_ADDR
+#ifdef ARCH_SUPPORTS_FTRACE_SAVE_REGS
+# define FTRACE_REGS_ADDR ((unsigned long)ftrace_regs_caller)
+#else
+# define FTRACE_REGS_ADDR FTRACE_ADDR
+#endif
+#endif
+
 #ifdef CONFIG_FUNCTION_GRAPH_TRACER
 extern void ftrace_graph_caller(void);
 extern int ftrace_enable_ftrace_graph_caller(void);
@@ -380,6 +480,39 @@ extern int ftrace_make_nop(struct module *mod,
  */
 extern int ftrace_make_call(struct dyn_ftrace *rec, unsigned long addr);
 
+#ifdef ARCH_SUPPORTS_FTRACE_SAVE_REGS
+/**
+ * ftrace_modify_call - convert from one addr to another (no nop)
+ * @rec: the mcount call site record
+ * @old_addr: the address expected to be currently called to
+ * @addr: the address to change to
+ *
+ * This is a very sensitive operation and great care needs
+ * to be taken by the arch.  The operation should carefully
+ * read the location, check to see if what is read is indeed
+ * what we expect it to be, and then on success of the compare,
+ * it should write to the location.
+ *
+ * The code segment at @rec->ip should be a caller to @old_addr
+ *
+ * Return must be:
+ *  0 on success
+ *  -EFAULT on error reading the location
+ *  -EINVAL on a failed compare of the contents
+ *  -EPERM  on error writing to the location
+ * Any other value will be considered a failure.
+ */
+extern int ftrace_modify_call(struct dyn_ftrace *rec, unsigned long old_addr,
+                             unsigned long addr);
+#else
+/* Should never be called */
+static inline int ftrace_modify_call(struct dyn_ftrace *rec, unsigned long old_addr,
+                                    unsigned long addr)
+{
+       return -EINVAL;
+}
+#endif
+
 /* May be defined in arch */
 extern int ftrace_arch_read_dyn_info(char *buf, int size);
 
@@ -387,7 +520,7 @@ extern int skip_trace(unsigned long ip);
 
 extern void ftrace_disable_daemon(void);
 extern void ftrace_enable_daemon(void);
-#else
+#else /* CONFIG_DYNAMIC_FTRACE */
 static inline int skip_trace(unsigned long ip) { return 0; }
 static inline int ftrace_force_update(void) { return 0; }
 static inline void ftrace_disable_daemon(void) { }
@@ -405,6 +538,10 @@ static inline int ftrace_text_reserved(void *start, void *end)
 {
        return 0;
 }
+static inline unsigned long ftrace_location(unsigned long ip)
+{
+       return 0;
+}
 
 /*
  * Again users of functions that have ftrace_ops may not
@@ -413,6 +550,7 @@ static inline int ftrace_text_reserved(void *start, void *end)
  */
 #define ftrace_regex_open(ops, flag, inod, file) ({ -ENODEV; })
 #define ftrace_set_early_filter(ops, buf, enable) do { } while (0)
+#define ftrace_set_filter_ip(ops, ip, remove, reset) ({ -ENODEV; })
 #define ftrace_set_filter(ops, buf, len, reset) ({ -ENODEV; })
 #define ftrace_set_notrace(ops, buf, len, reset) ({ -ENODEV; })
 #define ftrace_free_filter(ops) do { } while (0)
index 2fbd9053c2df6477a425567c49014ea3328f9038..bbe5d15d6597113c4e44b8180b173f53269ebe97 100644 (file)
@@ -130,4 +130,10 @@ extern void account_process_tick(struct task_struct *, int user);
 extern void account_steal_ticks(unsigned long ticks);
 extern void account_idle_ticks(unsigned long ticks);
 
+#ifdef CONFIG_VIRT_CPU_ACCOUNTING
+extern void account_switch_vtime(struct task_struct *prev);
+#else
+static inline void account_switch_vtime(struct task_struct *prev) { }
+#endif
+
 #endif /* _LINUX_KERNEL_STAT_H */
index b6e1f8c00577151bf937a1f2dc5807b53decf64b..23755ba42abc456a06ffd663d0b3817570c1e7cb 100644 (file)
@@ -38,6 +38,7 @@
 #include <linux/spinlock.h>
 #include <linux/rcupdate.h>
 #include <linux/mutex.h>
+#include <linux/ftrace.h>
 
 #ifdef CONFIG_KPROBES
 #include <asm/kprobes.h>
 #define KPROBE_REENTER         0x00000004
 #define KPROBE_HIT_SSDONE      0x00000008
 
+/*
+ * If function tracer is enabled and the arch supports full
+ * passing of pt_regs to function tracing, then kprobes can
+ * optimize on top of function tracing.
+ */
+#if defined(CONFIG_FUNCTION_TRACER) && defined(ARCH_SUPPORTS_FTRACE_SAVE_REGS) \
+       && defined(ARCH_SUPPORTS_KPROBES_ON_FTRACE)
+# define KPROBES_CAN_USE_FTRACE
+#endif
+
 /* Attach to insert probes on any functions which should be ignored */
 #define __kprobes      __attribute__((__section__(".kprobes.text")))
+
 #else /* CONFIG_KPROBES */
 typedef int kprobe_opcode_t;
 struct arch_specific_insn {
        int dummy;
 };
 #define __kprobes
+
 #endif /* CONFIG_KPROBES */
 
 struct kprobe;
@@ -128,6 +141,7 @@ struct kprobe {
                                   * NOTE:
                                   * this flag is only for optimized_kprobe.
                                   */
+#define KPROBE_FLAG_FTRACE     8 /* probe is using ftrace */
 
 /* Has this kprobe gone ? */
 static inline int kprobe_gone(struct kprobe *p)
@@ -146,6 +160,13 @@ static inline int kprobe_optimized(struct kprobe *p)
 {
        return p->flags & KPROBE_FLAG_OPTIMIZED;
 }
+
+/* Does this kprobe use ftrace? */
+static inline int kprobe_ftrace(struct kprobe *p)
+{
+       return p->flags & KPROBE_FLAG_FTRACE;
+}
+
 /*
  * Special probe type that uses setjmp-longjmp type tricks to resume
  * execution at a specified entry with a matching prototype corresponding
@@ -295,6 +316,12 @@ extern int proc_kprobes_optimization_handler(struct ctl_table *table,
 #endif
 
 #endif /* CONFIG_OPTPROBES */
+#ifdef KPROBES_CAN_USE_FTRACE
+extern void kprobe_ftrace_handler(unsigned long ip, unsigned long parent_ip,
+                                 struct ftrace_ops *ops, struct pt_regs *regs);
+extern int arch_prepare_kprobe_ftrace(struct kprobe *p);
+#endif
+
 
 /* Get the kprobe at this addr (if any) - called with preemption disabled */
 struct kprobe *get_kprobe(void *addr);
index 22ccf9dee177dcfe7ec6b91eeb6b5c6867fc4903..8d816646f7665eab86dd9fbd10adf8256c624c9e 100644 (file)
@@ -14,6 +14,11 @@ struct task_struct *kthread_create_on_node(int (*threadfn)(void *data),
        kthread_create_on_node(threadfn, data, -1, namefmt, ##arg)
 
 
+struct task_struct *kthread_create_on_cpu(int (*threadfn)(void *data),
+                                         void *data,
+                                         unsigned int cpu,
+                                         const char *namefmt);
+
 /**
  * kthread_run - create and wake a thread.
  * @threadfn: the function to run until signal_pending(current).
@@ -34,9 +39,13 @@ struct task_struct *kthread_create_on_node(int (*threadfn)(void *data),
 
 void kthread_bind(struct task_struct *k, unsigned int cpu);
 int kthread_stop(struct task_struct *k);
-int kthread_should_stop(void);
+bool kthread_should_stop(void);
+bool kthread_should_park(void);
 bool kthread_freezable_should_stop(bool *was_frozen);
 void *kthread_data(struct task_struct *k);
+int kthread_park(struct task_struct *k);
+void kthread_unpark(struct task_struct *k);
+void kthread_parkme(void);
 
 int kthreadd(void *unused);
 extern struct task_struct *kthreadd_task;
index ad04dfcd6f35a85dbd85047b1a52d7019d0a0a36..cc5e2cd2babb9c3879fc21f25079dd994bd54c3b 100644 (file)
@@ -130,8 +130,10 @@ enum perf_event_sample_format {
        PERF_SAMPLE_STREAM_ID                   = 1U << 9,
        PERF_SAMPLE_RAW                         = 1U << 10,
        PERF_SAMPLE_BRANCH_STACK                = 1U << 11,
+       PERF_SAMPLE_REGS_USER                   = 1U << 12,
+       PERF_SAMPLE_STACK_USER                  = 1U << 13,
 
-       PERF_SAMPLE_MAX = 1U << 12,             /* non-ABI */
+       PERF_SAMPLE_MAX = 1U << 14,             /* non-ABI */
 };
 
 /*
@@ -162,6 +164,15 @@ enum perf_branch_sample_type {
         PERF_SAMPLE_BRANCH_KERNEL|\
         PERF_SAMPLE_BRANCH_HV)
 
+/*
+ * Values to determine ABI of the registers dump.
+ */
+enum perf_sample_regs_abi {
+       PERF_SAMPLE_REGS_ABI_NONE       = 0,
+       PERF_SAMPLE_REGS_ABI_32         = 1,
+       PERF_SAMPLE_REGS_ABI_64         = 2,
+};
+
 /*
  * The format of the data returned by read() on a perf event fd,
  * as specified by attr.read_format:
@@ -194,6 +205,8 @@ enum perf_event_read_format {
 #define PERF_ATTR_SIZE_VER0    64      /* sizeof first published struct */
 #define PERF_ATTR_SIZE_VER1    72      /* add: config2 */
 #define PERF_ATTR_SIZE_VER2    80      /* add: branch_sample_type */
+#define PERF_ATTR_SIZE_VER3    96      /* add: sample_regs_user */
+                                       /* add: sample_stack_user */
 
 /*
  * Hardware event_id to monitor via a performance monitoring event:
@@ -255,7 +268,10 @@ struct perf_event_attr {
                                exclude_host   :  1, /* don't count in host   */
                                exclude_guest  :  1, /* don't count in guest  */
 
-                               __reserved_1   : 43;
+                               exclude_callchain_kernel : 1, /* exclude kernel callchains */
+                               exclude_callchain_user   : 1, /* exclude user callchains */
+
+                               __reserved_1   : 41;
 
        union {
                __u32           wakeup_events;    /* wakeup every n events */
@@ -271,7 +287,21 @@ struct perf_event_attr {
                __u64           bp_len;
                __u64           config2; /* extension of config1 */
        };
-       __u64   branch_sample_type; /* enum branch_sample_type */
+       __u64   branch_sample_type; /* enum perf_branch_sample_type */
+
+       /*
+        * Defines set of user regs to dump on samples.
+        * See asm/perf_regs.h for details.
+        */
+       __u64   sample_regs_user;
+
+       /*
+        * Defines size of the user stack to dump on samples.
+        */
+       __u32   sample_stack_user;
+
+       /* Align to u64. */
+       __u32   __reserved_2;
 };
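
From userspace, the two new attr fields are used together with the new sample bits. A hedged sketch; the register mask value is arch-specific (bits come from asm/perf_regs.h) and is assumed here:

    #include <linux/perf_event.h>
    #include <sys/syscall.h>
    #include <string.h>
    #include <unistd.h>

    static int open_sampling_event(void)
    {
            struct perf_event_attr attr;

            memset(&attr, 0, sizeof(attr));
            attr.size          = sizeof(attr);      /* PERF_ATTR_SIZE_VER3 */
            attr.type          = PERF_TYPE_HARDWARE;
            attr.config        = PERF_COUNT_HW_CPU_CYCLES;
            attr.sample_period = 100000;
            attr.sample_type   = PERF_SAMPLE_IP | PERF_SAMPLE_REGS_USER |
                                 PERF_SAMPLE_STACK_USER;
            attr.sample_regs_user  = 0xff;          /* arch-specific mask */
            attr.sample_stack_user = 8192;          /* bytes of user stack */
            attr.exclude_kernel = 1;

            /* current task, any cpu, no group, no flags */
            return syscall(__NR_perf_event_open, &attr, 0, -1, -1, 0);
    }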
 
 /*
@@ -548,6 +578,13 @@ enum perf_event_type {
         *        char                  data[size];}&& PERF_SAMPLE_RAW
         *
         *      { u64 from, to, flags } lbr[nr];} && PERF_SAMPLE_BRANCH_STACK
+        *
+        *      { u64                   abi; # enum perf_sample_regs_abi
+        *        u64                   regs[weight(mask)]; } && PERF_SAMPLE_REGS_USER
+        *
+        *      { u64                   size;
+        *        char                  data[size];
+        *        u64                   dyn_size; } && PERF_SAMPLE_STACK_USER
         * };
         */
        PERF_RECORD_SAMPLE                      = 9,
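
A hedged sketch of decoding the two new trailing blocks of a PERF_RECORD_SAMPLE, following the layout comment above; 'p' is assumed to point just past the preceding sample fields and 'mask' is the attr.sample_regs_user used at open time:

    #include <stdint.h>

    static const uint64_t *parse_tail(const uint64_t *p, uint64_t mask)
    {
            uint64_t abi = *p++;                     /* PERF_SAMPLE_REGS_USER */

            if (abi != 0)                            /* PERF_SAMPLE_REGS_ABI_NONE */
                    p += __builtin_popcountll(mask); /* one u64 per set mask bit */

            uint64_t size = *p++;                    /* PERF_SAMPLE_STACK_USER */
            const char *stack = (const char *)p;     /* 'size' bytes of stack data */
            p = (const uint64_t *)(stack + size);
            uint64_t dyn_size = size ? *p++ : 0;     /* bytes actually dumped */

            (void)stack;
            (void)dyn_size;
            return p;
    }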
@@ -609,6 +646,7 @@ struct perf_guest_info_callbacks {
 #include <linux/static_key.h>
 #include <linux/atomic.h>
 #include <linux/sysfs.h>
+#include <linux/perf_regs.h>
 #include <asm/local.h>
 
 struct perf_callchain_entry {
@@ -654,6 +692,11 @@ struct perf_branch_stack {
        struct perf_branch_entry        entries[0];
 };
 
+struct perf_regs_user {
+       __u64           abi;
+       struct pt_regs  *regs;
+};
+
 struct task_struct;
 
 /*
@@ -1133,6 +1176,8 @@ struct perf_sample_data {
        struct perf_callchain_entry     *callchain;
        struct perf_raw_record          *raw;
        struct perf_branch_stack        *br_stack;
+       struct perf_regs_user           regs_user;
+       u64                             stack_user_size;
 };
 
 static inline void perf_sample_data_init(struct perf_sample_data *data,
@@ -1142,7 +1187,10 @@ static inline void perf_sample_data_init(struct perf_sample_data *data,
        data->addr = addr;
        data->raw  = NULL;
        data->br_stack = NULL;
-       data->period    = period;
+       data->period = period;
+       data->regs_user.abi = PERF_SAMPLE_REGS_ABI_NONE;
+       data->regs_user.regs = NULL;
+       data->stack_user_size = 0;
 }
 
 extern void perf_output_sample(struct perf_output_handle *handle,
@@ -1290,12 +1338,15 @@ static inline bool has_branch_stack(struct perf_event *event)
 extern int perf_output_begin(struct perf_output_handle *handle,
                             struct perf_event *event, unsigned int size);
 extern void perf_output_end(struct perf_output_handle *handle);
-extern void perf_output_copy(struct perf_output_handle *handle,
+extern unsigned int perf_output_copy(struct perf_output_handle *handle,
                             const void *buf, unsigned int len);
+extern unsigned int perf_output_skip(struct perf_output_handle *handle,
+                                    unsigned int len);
 extern int perf_swevent_get_recursion_context(void);
 extern void perf_swevent_put_recursion_context(int rctx);
 extern void perf_event_enable(struct perf_event *event);
 extern void perf_event_disable(struct perf_event *event);
+extern int __perf_event_disable(void *info);
 extern void perf_event_task_tick(void);
 #else
 static inline void
@@ -1334,6 +1385,7 @@ static inline int  perf_swevent_get_recursion_context(void)               { return -1; }
 static inline void perf_swevent_put_recursion_context(int rctx)                { }
 static inline void perf_event_enable(struct perf_event *event)         { }
 static inline void perf_event_disable(struct perf_event *event)                { }
+static inline int __perf_event_disable(void *info)                     { return -1; }
 static inline void perf_event_task_tick(void)                          { }
 #endif
 
diff --git a/include/linux/perf_regs.h b/include/linux/perf_regs.h
new file mode 100644 (file)
index 0000000..3c73d5f
--- /dev/null
@@ -0,0 +1,25 @@
+#ifndef _LINUX_PERF_REGS_H
+#define _LINUX_PERF_REGS_H
+
+#ifdef CONFIG_HAVE_PERF_REGS
+#include <asm/perf_regs.h>
+u64 perf_reg_value(struct pt_regs *regs, int idx);
+int perf_reg_validate(u64 mask);
+u64 perf_reg_abi(struct task_struct *task);
+#else
+static inline u64 perf_reg_value(struct pt_regs *regs, int idx)
+{
+       return 0;
+}
+
+static inline int perf_reg_validate(u64 mask)
+{
+       return mask ? -ENOSYS : 0;
+}
+
+static inline u64 perf_reg_abi(struct task_struct *task)
+{
+       return PERF_SAMPLE_REGS_ABI_NONE;
+}
+#endif /* CONFIG_HAVE_PERF_REGS */
+#endif /* _LINUX_PERF_REGS_H */
index b8c86648a2f95dc6f83aab668fd9a7e07f277b4e..afbef50a77e77898cd438cb9d7be63a82f70038c 100644 (file)
@@ -446,6 +446,8 @@ extern int get_dumpable(struct mm_struct *mm);
 #define MMF_VM_HUGEPAGE                17      /* set when VM_HUGEPAGE is set on vma */
 #define MMF_EXE_FILE_CHANGED   18      /* see prctl_set_mm_exe_file() */
 
+#define MMF_HAS_UPROBES                19      /* might have uprobes */
+
 #define MMF_INIT_MASK          (MMF_DUMPABLE_MASK | MMF_DUMP_FILTER_MASK)
 
 struct sighand_struct {
@@ -860,7 +862,6 @@ enum cpu_idle_type {
 #define SD_BALANCE_FORK                0x0008  /* Balance on fork, clone */
 #define SD_BALANCE_WAKE                0x0010  /* Balance on wakeup */
 #define SD_WAKE_AFFINE         0x0020  /* Wake task to waking CPU */
-#define SD_PREFER_LOCAL                0x0040  /* Prefer to keep tasks local to this domain */
 #define SD_SHARE_CPUPOWER      0x0080  /* Domain members share cpu power */
 #define SD_SHARE_PKG_RESOURCES 0x0200  /* Domain members share cpu pkg resources */
 #define SD_SERIALIZE           0x0400  /* Only a single load balancing instance */
diff --git a/include/linux/smpboot.h b/include/linux/smpboot.h
new file mode 100644 (file)
index 0000000..e0106d8
--- /dev/null
@@ -0,0 +1,43 @@
+#ifndef _LINUX_SMPBOOT_H
+#define _LINUX_SMPBOOT_H
+
+#include <linux/types.h>
+
+struct task_struct;
+/* Cookie handed to the thread_fn */
+struct smpboot_thread_data;
+
+/**
+ * struct smp_hotplug_thread - CPU hotplug related thread descriptor
+ * @store:             Pointer to per cpu storage for the task pointers
+ * @list:              List head for core management
+ * @thread_should_run: Check whether the thread should run or not. Called with
+ *                     preemption disabled.
+ * @thread_fn:         The associated thread function
+ * @setup:             Optional setup function, called when the thread
+ *                     becomes operational for the first time
+ * @cleanup:           Optional cleanup function, called when the thread
+ *                     should stop (module exit)
+ * @park:              Optional park function, called when the thread is
+ *                     parked (cpu offline)
+ * @unpark:            Optional unpark function, called when the thread is
+ *                     unparked (cpu online)
+ * @thread_comm:       The base name of the thread
+ */
+struct smp_hotplug_thread {
+       struct task_struct __percpu     **store;
+       struct list_head                list;
+       int                             (*thread_should_run)(unsigned int cpu);
+       void                            (*thread_fn)(unsigned int cpu);
+       void                            (*setup)(unsigned int cpu);
+       void                            (*cleanup)(unsigned int cpu, bool online);
+       void                            (*park)(unsigned int cpu);
+       void                            (*unpark)(unsigned int cpu);
+       const char                      *thread_comm;
+};
+
+int smpboot_register_percpu_thread(struct smp_hotplug_thread *plug_thread);
+void smpboot_unregister_percpu_thread(struct smp_hotplug_thread *plug_thread);
+int smpboot_thread_schedule(void);
+
+#endif
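
A minimal registration sketch for the new interface (all names hypothetical); the eHCA conversion earlier in this merge follows the same pattern with park and cleanup hooks added:

    #include <linux/percpu.h>
    #include <linux/smpboot.h>

    static DEFINE_PER_CPU(struct task_struct *, my_threads);

    static int my_should_run(unsigned int cpu)
    {
            return 0;       /* nonzero when there is per-cpu work pending */
    }

    static void my_thread_fn(unsigned int cpu)
    {
            /* runs whenever my_should_run() returned nonzero */
    }

    static struct smp_hotplug_thread my_smp_thread = {
            .store             = &my_threads,
            .thread_should_run = my_should_run,
            .thread_fn         = my_thread_fn,
            .thread_comm       = "my_thread/%u",
    };

    static int __init my_smp_init(void)
    {
            return smpboot_register_percpu_thread(&my_smp_thread);
    }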
index 6abd9138beda57f7555b96b9fa0d51c60edaa50f..8c5a197e1587de4c647ff205b5b591c31a0dbcc6 100644 (file)
@@ -49,147 +49,112 @@ extern struct tvec_base boot_tvec_bases;
 #endif
 
 /*
- * Note that all tvec_bases are 2 byte aligned and lower bit of
- * base in timer_list is guaranteed to be zero. Use the LSB to
- * indicate whether the timer is deferrable.
+ * Note that all tvec_bases are at least 4 byte aligned and the lower two
+ * bits of base in timer_list are guaranteed to be zero. Use them for flags.
  *
  * A deferrable timer will work normally when the system is busy, but
  * will not cause a CPU to come out of idle just to service it; instead,
  * the timer will be serviced when the CPU eventually wakes up with a
  * subsequent non-deferrable timer.
+ *
+ * An irqsafe timer is executed with IRQs disabled, and it's safe to wait for
+ * the completion of the running instance from IRQ handlers, for example,
+ * by calling del_timer_sync().
+ *
+ * Note: The irq disabled callback execution is a special case for
+ * workqueue locking issues. It's not meant for executing random crap
+ * with interrupts disabled. Abuse is monitored!
  */
-#define TBASE_DEFERRABLE_FLAG          (0x1)
+#define TIMER_DEFERRABLE               0x1LU
+#define TIMER_IRQSAFE                  0x2LU
 
-#define TIMER_INITIALIZER(_function, _expires, _data) {                \
+#define TIMER_FLAG_MASK                        0x3LU
+
+#define __TIMER_INITIALIZER(_function, _expires, _data, _flags) { \
                .entry = { .prev = TIMER_ENTRY_STATIC },        \
                .function = (_function),                        \
                .expires = (_expires),                          \
                .data = (_data),                                \
-               .base = &boot_tvec_bases,                       \
+               .base = (void *)((unsigned long)&boot_tvec_bases + (_flags)), \
                .slack = -1,                                    \
                __TIMER_LOCKDEP_MAP_INITIALIZER(                \
                        __FILE__ ":" __stringify(__LINE__))     \
        }
 
-#define TBASE_MAKE_DEFERRED(ptr) ((struct tvec_base *)         \
-                 ((unsigned char *)(ptr) + TBASE_DEFERRABLE_FLAG))
+#define TIMER_INITIALIZER(_function, _expires, _data)          \
+       __TIMER_INITIALIZER((_function), (_expires), (_data), 0)
 
-#define TIMER_DEFERRED_INITIALIZER(_function, _expires, _data) {\
-               .entry = { .prev = TIMER_ENTRY_STATIC },        \
-               .function = (_function),                        \
-               .expires = (_expires),                          \
-               .data = (_data),                                \
-               .base = TBASE_MAKE_DEFERRED(&boot_tvec_bases),  \
-               __TIMER_LOCKDEP_MAP_INITIALIZER(                \
-                       __FILE__ ":" __stringify(__LINE__))     \
-       }
+#define TIMER_DEFERRED_INITIALIZER(_function, _expires, _data) \
+       __TIMER_INITIALIZER((_function), (_expires), (_data), TIMER_DEFERRABLE)
 
 #define DEFINE_TIMER(_name, _function, _expires, _data)                \
        struct timer_list _name =                               \
                TIMER_INITIALIZER(_function, _expires, _data)
 
-void init_timer_key(struct timer_list *timer,
-                   const char *name,
-                   struct lock_class_key *key);
-void init_timer_deferrable_key(struct timer_list *timer,
-                              const char *name,
-                              struct lock_class_key *key);
+void init_timer_key(struct timer_list *timer, unsigned int flags,
+                   const char *name, struct lock_class_key *key);
+
+#ifdef CONFIG_DEBUG_OBJECTS_TIMERS
+extern void init_timer_on_stack_key(struct timer_list *timer,
+                                   unsigned int flags, const char *name,
+                                   struct lock_class_key *key);
+extern void destroy_timer_on_stack(struct timer_list *timer);
+#else
+static inline void destroy_timer_on_stack(struct timer_list *timer) { }
+static inline void init_timer_on_stack_key(struct timer_list *timer,
+                                          unsigned int flags, const char *name,
+                                          struct lock_class_key *key)
+{
+       init_timer_key(timer, flags, name, key);
+}
+#endif
 
 #ifdef CONFIG_LOCKDEP
-#define init_timer(timer)                                              \
+#define __init_timer(_timer, _flags)                                   \
        do {                                                            \
                static struct lock_class_key __key;                     \
-               init_timer_key((timer), #timer, &__key);                \
+               init_timer_key((_timer), (_flags), #_timer, &__key);    \
        } while (0)
 
-#define init_timer_deferrable(timer)                                   \
+#define __init_timer_on_stack(_timer, _flags)                          \
        do {                                                            \
                static struct lock_class_key __key;                     \
-               init_timer_deferrable_key((timer), #timer, &__key);     \
+               init_timer_on_stack_key((_timer), (_flags), #_timer, &__key); \
        } while (0)
+#else
+#define __init_timer(_timer, _flags)                                   \
+       init_timer_key((_timer), (_flags), NULL, NULL)
+#define __init_timer_on_stack(_timer, _flags)                          \
+       init_timer_on_stack_key((_timer), (_flags), NULL, NULL)
+#endif
 
+#define init_timer(timer)                                              \
+       __init_timer((timer), 0)
+#define init_timer_deferrable(timer)                                   \
+       __init_timer((timer), TIMER_DEFERRABLE)
 #define init_timer_on_stack(timer)                                     \
+       __init_timer_on_stack((timer), 0)
+
+#define __setup_timer(_timer, _fn, _data, _flags)                      \
        do {                                                            \
-               static struct lock_class_key __key;                     \
-               init_timer_on_stack_key((timer), #timer, &__key);       \
+               __init_timer((_timer), (_flags));                       \
+               (_timer)->function = (_fn);                             \
+               (_timer)->data = (_data);                               \
        } while (0)
 
-#define setup_timer(timer, fn, data)                                   \
+#define __setup_timer_on_stack(_timer, _fn, _data, _flags)             \
        do {                                                            \
-               static struct lock_class_key __key;                     \
-               setup_timer_key((timer), #timer, &__key, (fn), (data));\
+               __init_timer_on_stack((_timer), (_flags));              \
+               (_timer)->function = (_fn);                             \
+               (_timer)->data = (_data);                               \
        } while (0)
 
+#define setup_timer(timer, fn, data)                                   \
+       __setup_timer((timer), (fn), (data), 0)
 #define setup_timer_on_stack(timer, fn, data)                          \
-       do {                                                            \
-               static struct lock_class_key __key;                     \
-               setup_timer_on_stack_key((timer), #timer, &__key,       \
-                                        (fn), (data));                 \
-       } while (0)
+       __setup_timer_on_stack((timer), (fn), (data), 0)
 #define setup_deferrable_timer_on_stack(timer, fn, data)               \
-       do {                                                            \
-               static struct lock_class_key __key;                     \
-               setup_deferrable_timer_on_stack_key((timer), #timer,    \
-                                                   &__key, (fn),       \
-                                                   (data));            \
-       } while (0)
-#else
-#define init_timer(timer)\
-       init_timer_key((timer), NULL, NULL)
-#define init_timer_deferrable(timer)\
-       init_timer_deferrable_key((timer), NULL, NULL)
-#define init_timer_on_stack(timer)\
-       init_timer_on_stack_key((timer), NULL, NULL)
-#define setup_timer(timer, fn, data)\
-       setup_timer_key((timer), NULL, NULL, (fn), (data))
-#define setup_timer_on_stack(timer, fn, data)\
-       setup_timer_on_stack_key((timer), NULL, NULL, (fn), (data))
-#define setup_deferrable_timer_on_stack(timer, fn, data)\
-       setup_deferrable_timer_on_stack_key((timer), NULL, NULL, (fn), (data))
-#endif
-
-#ifdef CONFIG_DEBUG_OBJECTS_TIMERS
-extern void init_timer_on_stack_key(struct timer_list *timer,
-                                   const char *name,
-                                   struct lock_class_key *key);
-extern void destroy_timer_on_stack(struct timer_list *timer);
-#else
-static inline void destroy_timer_on_stack(struct timer_list *timer) { }
-static inline void init_timer_on_stack_key(struct timer_list *timer,
-                                          const char *name,
-                                          struct lock_class_key *key)
-{
-       init_timer_key(timer, name, key);
-}
-#endif
-
-static inline void setup_timer_key(struct timer_list * timer,
-                               const char *name,
-                               struct lock_class_key *key,
-                               void (*function)(unsigned long),
-                               unsigned long data)
-{
-       timer->function = function;
-       timer->data = data;
-       init_timer_key(timer, name, key);
-}
-
-static inline void setup_timer_on_stack_key(struct timer_list *timer,
-                                       const char *name,
-                                       struct lock_class_key *key,
-                                       void (*function)(unsigned long),
-                                       unsigned long data)
-{
-       timer->function = function;
-       timer->data = data;
-       init_timer_on_stack_key(timer, name, key);
-}
-
-extern void setup_deferrable_timer_on_stack_key(struct timer_list *timer,
-                                               const char *name,
-                                               struct lock_class_key *key,
-                                               void (*function)(unsigned long),
-                                               unsigned long data);
+       __setup_timer_on_stack((timer), (fn), (data), TIMER_DEFERRABLE)
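
With the consolidated initializers, deferrable and irqsafe timers become just flag choices at init time. A short sketch (callback and function names are illustrative):

    #include <linux/jiffies.h>
    #include <linux/timer.h>

    static void my_timeout(unsigned long data)
    {
            /* timer callback */
    }

    static struct timer_list my_timer;

    static void my_start(void)
    {
            /* same call shape for 0, TIMER_DEFERRABLE or TIMER_IRQSAFE */
            __setup_timer(&my_timer, my_timeout, 0, TIMER_DEFERRABLE);
            mod_timer(&my_timer, jiffies + HZ);
    }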
 
 /**
  * timer_pending - is a timer pending?
index fec12d667211dd398ba07ed5127b9e3485bcdd49..d3cf0d6e7712c115fb529559b59c2ae4f6f8cd9c 100644 (file)
@@ -129,7 +129,6 @@ int arch_update_cpu_topology(void);
                                | 1*SD_BALANCE_FORK                     \
                                | 0*SD_BALANCE_WAKE                     \
                                | 1*SD_WAKE_AFFINE                      \
-                               | 0*SD_PREFER_LOCAL                     \
                                | 0*SD_SHARE_CPUPOWER                   \
                                | 1*SD_SHARE_PKG_RESOURCES              \
                                | 0*SD_SERIALIZE                        \
@@ -160,7 +159,6 @@ int arch_update_cpu_topology(void);
                                | 1*SD_BALANCE_FORK                     \
                                | 0*SD_BALANCE_WAKE                     \
                                | 1*SD_WAKE_AFFINE                      \
-                               | 0*SD_PREFER_LOCAL                     \
                                | 0*SD_SHARE_CPUPOWER                   \
                                | 0*SD_SHARE_PKG_RESOURCES              \
                                | 0*SD_SERIALIZE                        \
index efe4b3308c74cfc9f2140e07f876ae77e2e9182b..6d4fe79a1a6af97eb256fac3792a39b5c00c8050 100644 (file)
@@ -99,15 +99,16 @@ struct xol_area {
 
 struct uprobes_state {
        struct xol_area         *xol_area;
-       atomic_t                count;
 };
+
 extern int __weak set_swbp(struct arch_uprobe *aup, struct mm_struct *mm, unsigned long vaddr);
-extern int __weak set_orig_insn(struct arch_uprobe *aup, struct mm_struct *mm,  unsigned long vaddr, bool verify);
+extern int __weak set_orig_insn(struct arch_uprobe *aup, struct mm_struct *mm, unsigned long vaddr);
 extern bool __weak is_swbp_insn(uprobe_opcode_t *insn);
 extern int uprobe_register(struct inode *inode, loff_t offset, struct uprobe_consumer *uc);
 extern void uprobe_unregister(struct inode *inode, loff_t offset, struct uprobe_consumer *uc);
 extern int uprobe_mmap(struct vm_area_struct *vma);
 extern void uprobe_munmap(struct vm_area_struct *vma, unsigned long start, unsigned long end);
+extern void uprobe_dup_mmap(struct mm_struct *oldmm, struct mm_struct *newmm);
 extern void uprobe_free_utask(struct task_struct *t);
 extern void uprobe_copy_process(struct task_struct *t);
 extern unsigned long __weak uprobe_get_swbp_addr(struct pt_regs *regs);
@@ -117,7 +118,6 @@ extern void uprobe_notify_resume(struct pt_regs *regs);
 extern bool uprobe_deny_signal(void);
 extern bool __weak arch_uprobe_skip_sstep(struct arch_uprobe *aup, struct pt_regs *regs);
 extern void uprobe_clear_state(struct mm_struct *mm);
-extern void uprobe_reset_state(struct mm_struct *mm);
 #else /* !CONFIG_UPROBES */
 struct uprobes_state {
 };
@@ -138,6 +138,10 @@ static inline void
 uprobe_munmap(struct vm_area_struct *vma, unsigned long start, unsigned long end)
 {
 }
+static inline void
+uprobe_dup_mmap(struct mm_struct *oldmm, struct mm_struct *newmm)
+{
+}
 static inline void uprobe_notify_resume(struct pt_regs *regs)
 {
 }
@@ -158,8 +162,5 @@ static inline void uprobe_copy_process(struct task_struct *t)
 static inline void uprobe_clear_state(struct mm_struct *mm)
 {
 }
-static inline void uprobe_reset_state(struct mm_struct *mm)
-{
-}
 #endif /* !CONFIG_UPROBES */
 #endif /* _LINUX_UPROBES_H */
index b445d6f49bcf4a62bb3db54e3b9b7497a9672f68..b14150aec6005268d1910ea844a4d767334b0f79 100644 (file)
@@ -267,6 +267,19 @@ config POSIX_MQUEUE_SYSCTL
        depends on SYSCTL
        default y
 
+config VIRT_CPU_ACCOUNTING
+       bool "Deterministic task and CPU time accounting"
+       depends on HAVE_VIRT_CPU_ACCOUNTING
+       default y if PPC64
+       help
+         Select this option to enable more accurate task and CPU time
+         accounting.  This is done by reading a CPU counter on each
+         kernel entry and exit and on transitions within the kernel
+         between system, softirq and hardirq state, so there is a
+         small performance impact.  In the case of s390 or IBM POWER > 5,
+         this also enables accounting of stolen time on logically-partitioned
+         systems.
+
 config BSD_PROCESS_ACCT
        bool "BSD Process Accounting"
        help
index c0cc67ad764ceddbe9f226ee1bfb90c4055f19ff..e5602d32acb33f1af7200eb8754e367c16339cbd 100644 (file)
@@ -10,7 +10,7 @@ obj-y     = fork.o exec_domain.o panic.o printk.o \
            kthread.o wait.o kfifo.o sys_ni.o posix-cpu-timers.o mutex.o \
            hrtimer.o rwsem.o nsproxy.o srcu.o semaphore.o \
            notifier.o ksysfs.o cred.o \
-           async.o range.o groups.o lglock.o
+           async.o range.o groups.o lglock.o smpboot.o
 
 ifdef CONFIG_FUNCTION_TRACER
 # Do not trace debug files and internal ftrace files
@@ -46,7 +46,6 @@ obj-$(CONFIG_DEBUG_RT_MUTEXES) += rtmutex-debug.o
 obj-$(CONFIG_RT_MUTEX_TESTER) += rtmutex-tester.o
 obj-$(CONFIG_GENERIC_ISA_DMA) += dma.o
 obj-$(CONFIG_SMP) += smp.o
-obj-$(CONFIG_SMP) += smpboot.o
 ifneq ($(CONFIG_SMP),y)
 obj-y += up.o
 endif
index 14d32588cccdb3c6d2d64dcd451b5f2efa8e9a19..f560598807c150dd3b919e2688a78865e0b6f4df 100644 (file)
@@ -280,12 +280,13 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen)
                                __func__, cpu);
                goto out_release;
        }
+       smpboot_park_threads(cpu);
 
        err = __stop_machine(take_cpu_down, &tcd_param, cpumask_of(cpu));
        if (err) {
                /* CPU didn't die: tell everyone.  Can't complain. */
+               smpboot_unpark_threads(cpu);
                cpu_notify_nofail(CPU_DOWN_FAILED | mod, hcpu);
-
                goto out_release;
        }
        BUG_ON(cpu_online(cpu));
@@ -354,6 +355,10 @@ static int __cpuinit _cpu_up(unsigned int cpu, int tasks_frozen)
                goto out;
        }
 
+       ret = smpboot_create_threads(cpu);
+       if (ret)
+               goto out;
+
        ret = __cpu_notify(CPU_UP_PREPARE | mod, hcpu, -1, &nr_calls);
        if (ret) {
                nr_calls--;
@@ -368,6 +373,9 @@ static int __cpuinit _cpu_up(unsigned int cpu, int tasks_frozen)
                goto out_notify;
        BUG_ON(!cpu_online(cpu));
 
+       /* Wake the per cpu threads */
+       smpboot_unpark_threads(cpu);
+
        /* Now call notifier in preparation. */
        cpu_notify(CPU_ONLINE | mod, hcpu);
 
@@ -439,14 +447,6 @@ EXPORT_SYMBOL_GPL(cpu_up);
 #ifdef CONFIG_PM_SLEEP_SMP
 static cpumask_var_t frozen_cpus;
 
-void __weak arch_disable_nonboot_cpus_begin(void)
-{
-}
-
-void __weak arch_disable_nonboot_cpus_end(void)
-{
-}
-
 int disable_nonboot_cpus(void)
 {
        int cpu, first_cpu, error = 0;
@@ -458,7 +458,6 @@ int disable_nonboot_cpus(void)
         * with the userspace trying to use the CPU hotplug at the same time
         */
        cpumask_clear(frozen_cpus);
-       arch_disable_nonboot_cpus_begin();
 
        printk("Disabling non-boot CPUs ...\n");
        for_each_online_cpu(cpu) {
@@ -474,8 +473,6 @@ int disable_nonboot_cpus(void)
                }
        }
 
-       arch_disable_nonboot_cpus_end();
-
        if (!error) {
                BUG_ON(num_online_cpus() > 1);
                /* Make sure the CPUs won't be enabled by someone else */
index 98d4597f43d69e35a76535fac7d0bb6821ebdf7f..c77206184b8bd2c4c16cd981b9a383076f840b25 100644 (file)
@@ -159,6 +159,11 @@ perf_callchain(struct perf_event *event, struct pt_regs *regs)
        int rctx;
        struct perf_callchain_entry *entry;
 
+       int kernel = !event->attr.exclude_callchain_kernel;
+       int user   = !event->attr.exclude_callchain_user;
+
+       if (!kernel && !user)
+               return NULL;
 
        entry = get_callchain_entry(&rctx);
        if (rctx == -1)
@@ -169,24 +174,29 @@ perf_callchain(struct perf_event *event, struct pt_regs *regs)
 
        entry->nr = 0;
 
-       if (!user_mode(regs)) {
+       if (kernel && !user_mode(regs)) {
                perf_callchain_store(entry, PERF_CONTEXT_KERNEL);
                perf_callchain_kernel(entry, regs);
-               if (current->mm)
-                       regs = task_pt_regs(current);
-               else
-                       regs = NULL;
        }
 
-       if (regs) {
-               /*
-                * Disallow cross-task user callchains.
-                */
-               if (event->ctx->task && event->ctx->task != current)
-                       goto exit_put;
-
-               perf_callchain_store(entry, PERF_CONTEXT_USER);
-               perf_callchain_user(entry, regs);
+       if (user) {
+               if (!user_mode(regs)) {
+                       if (current->mm)
+                               regs = task_pt_regs(current);
+                       else
+                               regs = NULL;
+               }
+
+               if (regs) {
+                       /*
+                        * Disallow cross-task user callchains.
+                        */
+                       if (event->ctx->task && event->ctx->task != current)
+                               goto exit_put;
+
+                       perf_callchain_store(entry, PERF_CONTEXT_USER);
+                       perf_callchain_user(entry, regs);
+               }
        }
 
 exit_put:
index 647eb2bc2ebb5d1a01a2bdd4d549451826be25f7..8b656b98913c01bf5c29cc74346aa8b0bb703f36 100644 (file)
@@ -36,6 +36,7 @@
 #include <linux/perf_event.h>
 #include <linux/ftrace_event.h>
 #include <linux/hw_breakpoint.h>
+#include <linux/mm_types.h>
 
 #include "internal.h"
 
@@ -1253,7 +1254,7 @@ retry:
 /*
  * Cross CPU call to disable a performance event
  */
-static int __perf_event_disable(void *info)
+int __perf_event_disable(void *info)
 {
        struct perf_event *event = info;
        struct perf_event_context *ctx = event->ctx;
@@ -3764,6 +3765,132 @@ int perf_unregister_guest_info_callbacks(struct perf_guest_info_callbacks *cbs)
 }
 EXPORT_SYMBOL_GPL(perf_unregister_guest_info_callbacks);
 
+static void
+perf_output_sample_regs(struct perf_output_handle *handle,
+                       struct pt_regs *regs, u64 mask)
+{
+       int bit;
+
+       for_each_set_bit(bit, (const unsigned long *) &mask,
+                        sizeof(mask) * BITS_PER_BYTE) {
+               u64 val;
+
+               val = perf_reg_value(regs, bit);
+               perf_output_put(handle, val);
+       }
+}
+
+static void perf_sample_regs_user(struct perf_regs_user *regs_user,
+                                 struct pt_regs *regs)
+{
+       if (!user_mode(regs)) {
+               if (current->mm)
+                       regs = task_pt_regs(current);
+               else
+                       regs = NULL;
+       }
+
+       if (regs) {
+               regs_user->regs = regs;
+               regs_user->abi  = perf_reg_abi(current);
+       }
+}
+
+/*
+ * Get remaining task size from user stack pointer.
+ *
+ * It'd be better to take the stack vma map and limit this more
+ * precisely, but there's no way to get it safely under interrupt,
+ * so we use TASK_SIZE as the limit.
+ */
+static u64 perf_ustack_task_size(struct pt_regs *regs)
+{
+       unsigned long addr = perf_user_stack_pointer(regs);
+
+       if (!addr || addr >= TASK_SIZE)
+               return 0;
+
+       return TASK_SIZE - addr;
+}
+
+static u16
+perf_sample_ustack_size(u16 stack_size, u16 header_size,
+                       struct pt_regs *regs)
+{
+       u64 task_size;
+
+       /* No regs, no stack pointer, no dump. */
+       if (!regs)
+               return 0;
+
+       /*
+        * Check whether the requested stack size fits into:
+        * - TASK_SIZE
+        *   If it doesn't, we limit the size to TASK_SIZE.
+        *
+        * - remaining sample size
+        *   If it doesn't, we shrink the stack size to
+        *   fit into the remaining sample size.
+        */
+
+       task_size  = min((u64) USHRT_MAX, perf_ustack_task_size(regs));
+       stack_size = min(stack_size, (u16) task_size);
+
+       /* Current header size plus static size and dynamic size. */
+       header_size += 2 * sizeof(u64);
+
+       /* Do we fit in with the current stack dump size? */
+       if ((u16) (header_size + stack_size) < header_size) {
+               /*
+                * If we overflow the maximum size for the sample,
+                * we customize the stack dump size to fit in.
+                */
+               stack_size = USHRT_MAX - header_size - sizeof(u64);
+               stack_size = round_up(stack_size, sizeof(u64));
+       }
+
+       return stack_size;
+}
+
+static void
+perf_output_sample_ustack(struct perf_output_handle *handle, u64 dump_size,
+                         struct pt_regs *regs)
+{
+       /* Case of a kernel thread, nothing to dump */
+       if (!regs) {
+               u64 size = 0;
+               perf_output_put(handle, size);
+       } else {
+               unsigned long sp;
+               unsigned int rem;
+               u64 dyn_size;
+
+               /*
+                * We dump:
+                * static size
+                *   - the size requested by the user, or the largest
+                *     size that fits into the sample max size
+                * data
+                *   - user stack dump data
+                * dynamic size
+                *   - the actual dumped size
+                */
+
+               /* Static size. */
+               perf_output_put(handle, dump_size);
+
+               /* Data. */
+               sp = perf_user_stack_pointer(regs);
+               rem = __output_copy_user(handle, (void *) sp, dump_size);
+               dyn_size = dump_size - rem;
+
+               perf_output_skip(handle, rem);
+
+               /* Dynamic size. */
+               perf_output_put(handle, dyn_size);
+       }
+}
+
 static void __perf_event_header__init_id(struct perf_event_header *header,
                                         struct perf_sample_data *data,
                                         struct perf_event *event)
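
Taken together, perf_output_sample_regs() and perf_output_sample_ustack() fix the wire format of the two new sample payloads: an ABI word followed by one u64 per bit set in sample_regs_user, then (for the stack dump) a static size, the raw stack bytes, and the dynamically copied size. A rough user-space parsing sketch; the cursor type and helper names are hypothetical, not part of this patch:

	#include <stdint.h>

	/* Hypothetical cursor over the tail of one PERF_RECORD_SAMPLE. */
	struct sample_cursor {
		const uint64_t *p;
	};

	static uint64_t get_u64(struct sample_cursor *c)
	{
		return *c->p++;
	}

	/* PERF_SAMPLE_REGS_USER: ABI word, then one u64 per bit in @mask. */
	static void parse_regs_user(struct sample_cursor *c, uint64_t mask,
				    uint64_t regs[64])
	{
		uint64_t abi = get_u64(c);	/* 0 == PERF_SAMPLE_REGS_ABI_NONE */
		int bit;

		if (!abi)
			return;			/* no registers were dumped */

		for (bit = 0; bit < 64; bit++)
			if (mask & (1ULL << bit))
				regs[bit] = get_u64(c);
	}

	/* PERF_SAMPLE_STACK_USER: static size, raw bytes, dynamic size. */
	static uint64_t parse_stack_user(struct sample_cursor *c,
					 const void **data)
	{
		uint64_t size = get_u64(c);	/* 0 for kernel threads */

		if (!size) {
			*data = NULL;
			return 0;
		}

		*data = c->p;
		c->p += size / sizeof(uint64_t);	/* size is u64-aligned */
		return get_u64(c);	/* bytes that were actually copied */
	}
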
@@ -4024,6 +4151,28 @@ void perf_output_sample(struct perf_output_handle *handle,
                        perf_output_put(handle, nr);
                }
        }
+
+       if (sample_type & PERF_SAMPLE_REGS_USER) {
+               u64 abi = data->regs_user.abi;
+
+               /*
+                * If there are no regs to dump, signal it via the
+                * first u64 being zero (PERF_SAMPLE_REGS_ABI_NONE).
+                */
+               perf_output_put(handle, abi);
+
+               if (abi) {
+                       u64 mask = event->attr.sample_regs_user;
+                       perf_output_sample_regs(handle,
+                                               data->regs_user.regs,
+                                               mask);
+               }
+       }
+
+       if (sample_type & PERF_SAMPLE_STACK_USER)
+               perf_output_sample_ustack(handle,
+                                         data->stack_user_size,
+                                         data->regs_user.regs);
 }
 
 void perf_prepare_sample(struct perf_event_header *header,
@@ -4075,6 +4224,49 @@ void perf_prepare_sample(struct perf_event_header *header,
                }
                header->size += size;
        }
+
+       if (sample_type & PERF_SAMPLE_REGS_USER) {
+               /* regs dump ABI info */
+               int size = sizeof(u64);
+
+               perf_sample_regs_user(&data->regs_user, regs);
+
+               if (data->regs_user.regs) {
+                       u64 mask = event->attr.sample_regs_user;
+                       size += hweight64(mask) * sizeof(u64);
+               }
+
+               header->size += size;
+       }
+
+       if (sample_type & PERF_SAMPLE_STACK_USER) {
+               /*
+                * The PERF_SAMPLE_STACK_USER bit either needs to be
+                * processed as the last one, or an additional check
+                * must be added whenever a new sample type is added,
+                * because we could eat up the rest of the sample size.
+                */
+               struct perf_regs_user *uregs = &data->regs_user;
+               u16 stack_size = event->attr.sample_stack_user;
+               u16 size = sizeof(u64);
+
+               if (!uregs->abi)
+                       perf_sample_regs_user(uregs, regs);
+
+               stack_size = perf_sample_ustack_size(stack_size, header->size,
+                                                    uregs->regs);
+
+               /*
+                * If there is something to dump, add space for the dump
+                * itself and for the field that tells the dynamic size,
+                * which is how many bytes were actually dumped.
+                */
+               if (stack_size)
+                       size += sizeof(u64) + stack_size;
+
+               data->stack_user_size = stack_size;
+               header->size += size;
+       }
 }
 
 static void perf_event_output(struct perf_event *event,
@@ -6151,6 +6343,28 @@ static int perf_copy_attr(struct perf_event_attr __user *uattr,
                        attr->branch_sample_type = mask;
                }
        }
+
+       if (attr->sample_type & PERF_SAMPLE_REGS_USER) {
+               ret = perf_reg_validate(attr->sample_regs_user);
+               if (ret)
+                       return ret;
+       }
+
+       if (attr->sample_type & PERF_SAMPLE_STACK_USER) {
+               if (!arch_perf_have_user_stack_dump())
+                       return -ENOSYS;
+
+               /*
+                * The size field is a __u32, but so far we can
+                * only use up to a __u16 because of the __u16
+                * limit on the total sample size.
+                */
+               if (attr->sample_stack_user >= USHRT_MAX)
+                       ret = -EINVAL;
+               else if (!IS_ALIGNED(attr->sample_stack_user, sizeof(u64)))
+                       ret = -EINVAL;
+       }
+
 out:
        return ret;
 
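The checks in perf_copy_attr() above translate into concrete constraints on the opener. A hedged user-space sketch (the register mask is an illustrative x86-64 value, and the PERF_SAMPLE_* constants assume the updated uapi header from this series):

	#include <linux/perf_event.h>
	#include <sys/syscall.h>
	#include <sys/types.h>
	#include <string.h>
	#include <unistd.h>

	static int open_sampling_event(pid_t pid)
	{
		struct perf_event_attr attr;

		memset(&attr, 0, sizeof(attr));
		attr.size          = sizeof(attr);
		attr.type          = PERF_TYPE_HARDWARE;
		attr.config        = PERF_COUNT_HW_CPU_CYCLES;
		attr.sample_period = 100000;
		attr.sample_type   = PERF_SAMPLE_IP | PERF_SAMPLE_REGS_USER |
				     PERF_SAMPLE_STACK_USER;

		/* Must pass perf_reg_validate(); illustrative mask. */
		attr.sample_regs_user  = 0xff;
		/* Must be u64-aligned and below USHRT_MAX, or the kernel
		 * returns -EINVAL; -ENOSYS if the arch lacks stack dumps. */
		attr.sample_stack_user = 8192;

		return syscall(__NR_perf_event_open, &attr, pid, -1, -1, 0);
	}
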
index bb38c4d3ee129ab06c1b46dc295ca6864c39c416..9a7b487c6fe240c1a2e4f5c70ef68da6370ebf78 100644
@@ -453,7 +453,16 @@ int modify_user_hw_breakpoint(struct perf_event *bp, struct perf_event_attr *att
        int old_type = bp->attr.bp_type;
        int err = 0;
 
-       perf_event_disable(bp);
+       /*
+        * modify_user_hw_breakpoint can be invoked with IRQs disabled, in which
+        * case it is not possible to raise IPIs that invoke __perf_event_disable.
+        * So call the function directly after making sure we are targeting the
+        * current task.
+        */
+       if (irqs_disabled() && bp->ctx && bp->ctx->task == current)
+               __perf_event_disable(bp);
+       else
+               perf_event_disable(bp);
 
        bp->attr.bp_addr = attr->bp_addr;
        bp->attr.bp_type = attr->bp_type;
index a096c19f2c2a1aba50b62df4b91618cdcdb276d9..d56a64c99a8b1ccf07d3ee252d73048181dcdfd9 100644
@@ -2,6 +2,7 @@
 #define _KERNEL_EVENTS_INTERNAL_H
 
 #include <linux/hardirq.h>
+#include <linux/uaccess.h>
 
 /* Buffer handling */
 
@@ -76,30 +77,53 @@ static inline unsigned long perf_data_size(struct ring_buffer *rb)
        return rb->nr_pages << (PAGE_SHIFT + page_order(rb));
 }
 
-static inline void
-__output_copy(struct perf_output_handle *handle,
-                  const void *buf, unsigned int len)
+#define DEFINE_OUTPUT_COPY(func_name, memcpy_func)                     \
+static inline unsigned int                                             \
+func_name(struct perf_output_handle *handle,                           \
+         const void *buf, unsigned int len)                            \
+{                                                                      \
+       unsigned long size, written;                                    \
+                                                                       \
+       do {                                                            \
+               size = min_t(unsigned long, handle->size, len);         \
+                                                                       \
+               written = memcpy_func(handle->addr, buf, size);         \
+                                                                       \
+               len -= written;                                         \
+               handle->addr += written;                                \
+               buf += written;                                         \
+               handle->size -= written;                                \
+               if (!handle->size) {                                    \
+                       struct ring_buffer *rb = handle->rb;            \
+                                                                       \
+                       handle->page++;                                 \
+                       handle->page &= rb->nr_pages - 1;               \
+                       handle->addr = rb->data_pages[handle->page];    \
+                       handle->size = PAGE_SIZE << page_order(rb);     \
+               }                                                       \
+       } while (len && written == size);                               \
+                                                                       \
+       return len;                                                     \
+}
+
+static inline int memcpy_common(void *dst, const void *src, size_t n)
 {
-       do {
-               unsigned long size = min_t(unsigned long, handle->size, len);
-
-               memcpy(handle->addr, buf, size);
-
-               len -= size;
-               handle->addr += size;
-               buf += size;
-               handle->size -= size;
-               if (!handle->size) {
-                       struct ring_buffer *rb = handle->rb;
-
-                       handle->page++;
-                       handle->page &= rb->nr_pages - 1;
-                       handle->addr = rb->data_pages[handle->page];
-                       handle->size = PAGE_SIZE << page_order(rb);
-               }
-       } while (len);
+       memcpy(dst, src, n);
+       return n;
 }
 
+DEFINE_OUTPUT_COPY(__output_copy, memcpy_common)
+
+#define MEMCPY_SKIP(dst, src, n) (n)
+
+DEFINE_OUTPUT_COPY(__output_skip, MEMCPY_SKIP)
+
+#ifndef arch_perf_out_copy_user
+#define arch_perf_out_copy_user __copy_from_user_inatomic
+#endif
+
+DEFINE_OUTPUT_COPY(__output_copy_user, arch_perf_out_copy_user)
+
 /* Callchain handling */
 extern struct perf_callchain_entry *
 perf_callchain(struct perf_event *event, struct pt_regs *regs);
@@ -134,4 +158,20 @@ static inline void put_recursion_context(int *recursion, int rctx)
        recursion[rctx]--;
 }
 
+#ifdef CONFIG_HAVE_PERF_USER_STACK_DUMP
+static inline bool arch_perf_have_user_stack_dump(void)
+{
+       return true;
+}
+
+#define perf_user_stack_pointer(regs) user_stack_pointer(regs)
+#else
+static inline bool arch_perf_have_user_stack_dump(void)
+{
+       return false;
+}
+
+#define perf_user_stack_pointer(regs) 0
+#endif /* CONFIG_HAVE_PERF_USER_STACK_DUMP */
+
 #endif /* _KERNEL_EVENTS_INTERNAL_H */
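
DEFINE_OUTPUT_COPY() stamps out three variants that differ only in the copy primitive: __output_copy() (plain memcpy), __output_skip() (MEMCPY_SKIP reports every chunk as fully written, so only the handle bookkeeping advances), and __output_copy_user() (__copy_from_user_inatomic, whose short return on a fault terminates the loop and leaves the remainder in len). As a simplified restatement, not the literal macro expansion, skipping reduces to:

	/*
	 * Sketch of what __output_skip(handle, NULL, len) effectively does:
	 * every chunk counts as fully "written", nothing is copied.
	 */
	static inline unsigned int output_skip_sketch(struct perf_output_handle *handle,
						      unsigned int len)
	{
		while (len) {
			unsigned long size = min_t(unsigned long, handle->size, len);

			len -= size;		/* nothing is copied */
			handle->addr += size;
			handle->size -= size;
			if (!handle->size) {	/* wrap to the next data page */
				struct ring_buffer *rb = handle->rb;

				handle->page++;
				handle->page &= rb->nr_pages - 1;
				handle->addr = rb->data_pages[handle->page];
				handle->size = PAGE_SIZE << page_order(rb);
			}
		}
		return len;			/* always 0: skipping cannot fault */
	}
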
index 6ddaba43fb7adf1b792a97bf45baead52cee1fb5..23cb34ff3973c2dc7e4ce9cf41f744f93a36f83a 100644
@@ -182,10 +182,16 @@ out:
        return -ENOSPC;
 }
 
-void perf_output_copy(struct perf_output_handle *handle,
+unsigned int perf_output_copy(struct perf_output_handle *handle,
                      const void *buf, unsigned int len)
 {
-       __output_copy(handle, buf, len);
+       return __output_copy(handle, buf, len);
+}
+
+unsigned int perf_output_skip(struct perf_output_handle *handle,
+                             unsigned int len)
+{
+       return __output_skip(handle, NULL, len);
 }
 
 void perf_output_end(struct perf_output_handle *handle)
index c08a22d02f7268ffd5e5516fb9d67182d22e5de8..1666632e6edfcfc07c867c91d5b7039dc84ce2f6 100644
@@ -280,12 +280,10 @@ static int read_opcode(struct mm_struct *mm, unsigned long vaddr, uprobe_opcode_
        if (ret <= 0)
                return ret;
 
-       lock_page(page);
        vaddr_new = kmap_atomic(page);
        vaddr &= ~PAGE_MASK;
        memcpy(opcode, vaddr_new + vaddr, UPROBE_SWBP_INSN_SIZE);
        kunmap_atomic(vaddr_new);
-       unlock_page(page);
 
        put_page(page);
 
@@ -334,7 +332,7 @@ int __weak set_swbp(struct arch_uprobe *auprobe, struct mm_struct *mm, unsigned
         */
        result = is_swbp_at_addr(mm, vaddr);
        if (result == 1)
-               return -EEXIST;
+               return 0;
 
        if (result)
                return result;
@@ -347,24 +345,22 @@ int __weak set_swbp(struct arch_uprobe *auprobe, struct mm_struct *mm, unsigned
  * @mm: the probed process address space.
  * @auprobe: arch specific probepoint information.
  * @vaddr: the virtual address to insert the opcode.
- * @verify: if true, verify existance of breakpoint instruction.
  *
  * For mm @mm, restore the original opcode (opcode) at @vaddr.
  * Return 0 (success) or a negative errno.
  */
 int __weak
-set_orig_insn(struct arch_uprobe *auprobe, struct mm_struct *mm, unsigned long vaddr, bool verify)
+set_orig_insn(struct arch_uprobe *auprobe, struct mm_struct *mm, unsigned long vaddr)
 {
-       if (verify) {
-               int result;
+       int result;
 
-               result = is_swbp_at_addr(mm, vaddr);
-               if (!result)
-                       return -EINVAL;
+       result = is_swbp_at_addr(mm, vaddr);
+       if (!result)
+               return -EINVAL;
+
+       if (result != 1)
+               return result;
 
-               if (result != 1)
-                       return result;
-       }
        return write_opcode(auprobe, mm, vaddr, *(uprobe_opcode_t *)auprobe->insn);
 }
 
@@ -649,6 +645,7 @@ static int
 install_breakpoint(struct uprobe *uprobe, struct mm_struct *mm,
                        struct vm_area_struct *vma, unsigned long vaddr)
 {
+       bool first_uprobe;
        int ret;
 
        /*
@@ -659,7 +656,7 @@ install_breakpoint(struct uprobe *uprobe, struct mm_struct *mm,
         * Hence behave as if probe already existed.
         */
        if (!uprobe->consumers)
-               return -EEXIST;
+               return 0;
 
        if (!(uprobe->flags & UPROBE_COPY_INSN)) {
                ret = copy_insn(uprobe, vma->vm_file);
@@ -681,17 +678,16 @@ install_breakpoint(struct uprobe *uprobe, struct mm_struct *mm,
        }
 
        /*
-        * Ideally, should be updating the probe count after the breakpoint
-        * has been successfully inserted. However a thread could hit the
-        * breakpoint we just inserted even before the probe count is
-        * incremented. If this is the first breakpoint placed, breakpoint
-        * notifier might ignore uprobes and pass the trap to the thread.
-        * Hence increment before and decrement on failure.
+        * Set MMF_HAS_UPROBES in advance for uprobe_pre_sstep_notifier();
+        * the task can hit this breakpoint right after __replace_page().
         */
-       atomic_inc(&mm->uprobes_state.count);
+       first_uprobe = !test_bit(MMF_HAS_UPROBES, &mm->flags);
+       if (first_uprobe)
+               set_bit(MMF_HAS_UPROBES, &mm->flags);
+
        ret = set_swbp(&uprobe->arch, mm, vaddr);
-       if (ret)
-               atomic_dec(&mm->uprobes_state.count);
+       if (ret && first_uprobe)
+               clear_bit(MMF_HAS_UPROBES, &mm->flags);
 
        return ret;
 }
@@ -699,8 +695,7 @@ install_breakpoint(struct uprobe *uprobe, struct mm_struct *mm,
 static void
 remove_breakpoint(struct uprobe *uprobe, struct mm_struct *mm, unsigned long vaddr)
 {
-       if (!set_orig_insn(&uprobe->arch, mm, vaddr, true))
-               atomic_dec(&mm->uprobes_state.count);
+       set_orig_insn(&uprobe->arch, mm, vaddr);
 }
 
 /*
@@ -831,17 +826,11 @@ static int register_for_each_vma(struct uprobe *uprobe, bool is_register)
                    vaddr_to_offset(vma, info->vaddr) != uprobe->offset)
                        goto unlock;
 
-               if (is_register) {
+               if (is_register)
                        err = install_breakpoint(uprobe, mm, vma, info->vaddr);
-                       /*
-                        * We can race against uprobe_mmap(), see the
-                        * comment near uprobe_hash().
-                        */
-                       if (err == -EEXIST)
-                               err = 0;
-               } else {
+               else
                        remove_breakpoint(uprobe, mm, info->vaddr);
-               }
+
  unlock:
                up_write(&mm->mmap_sem);
  free:
@@ -1008,23 +997,16 @@ static void build_probe_list(struct inode *inode,
 }
 
 /*
- * Called from mmap_region.
- * called with mm->mmap_sem acquired.
- *
- * Return -ve no if we fail to insert probes and we cannot
- * bail-out.
- * Return 0 otherwise. i.e:
+ * Called from mmap_region/vma_adjust with mm->mmap_sem acquired.
  *
- *     - successful insertion of probes
- *     - (or) no possible probes to be inserted.
- *     - (or) insertion of probes failed but we can bail-out.
+ * Currently we ignore all errors and always return 0; the callers
+ * can't handle the failure anyway.
  */
 int uprobe_mmap(struct vm_area_struct *vma)
 {
        struct list_head tmp_list;
        struct uprobe *uprobe, *u;
        struct inode *inode;
-       int ret, count;
 
        if (!atomic_read(&uprobe_events) || !valid_vma(vma, true))
                return 0;
@@ -1036,44 +1018,16 @@ int uprobe_mmap(struct vm_area_struct *vma)
        mutex_lock(uprobes_mmap_hash(inode));
        build_probe_list(inode, vma, vma->vm_start, vma->vm_end, &tmp_list);
 
-       ret = 0;
-       count = 0;
-
        list_for_each_entry_safe(uprobe, u, &tmp_list, pending_list) {
-               if (!ret) {
+               if (!fatal_signal_pending(current)) {
                        unsigned long vaddr = offset_to_vaddr(vma, uprobe->offset);
-
-                       ret = install_breakpoint(uprobe, vma->vm_mm, vma, vaddr);
-                       /*
-                        * We can race against uprobe_register(), see the
-                        * comment near uprobe_hash().
-                        */
-                       if (ret == -EEXIST) {
-                               ret = 0;
-
-                               if (!is_swbp_at_addr(vma->vm_mm, vaddr))
-                                       continue;
-
-                               /*
-                                * Unable to insert a breakpoint, but
-                                * breakpoint lies underneath. Increment the
-                                * probe count.
-                                */
-                               atomic_inc(&vma->vm_mm->uprobes_state.count);
-                       }
-
-                       if (!ret)
-                               count++;
+                       install_breakpoint(uprobe, vma->vm_mm, vma, vaddr);
                }
                put_uprobe(uprobe);
        }
-
        mutex_unlock(uprobes_mmap_hash(inode));
 
-       if (ret)
-               atomic_sub(count, &vma->vm_mm->uprobes_state.count);
-
-       return ret;
+       return 0;
 }
 
 /*
@@ -1081,37 +1035,16 @@ int uprobe_mmap(struct vm_area_struct *vma)
  */
 void uprobe_munmap(struct vm_area_struct *vma, unsigned long start, unsigned long end)
 {
-       struct list_head tmp_list;
-       struct uprobe *uprobe, *u;
-       struct inode *inode;
-
        if (!atomic_read(&uprobe_events) || !valid_vma(vma, false))
                return;
 
        if (!atomic_read(&vma->vm_mm->mm_users)) /* called by mmput() ? */
                return;
 
-       if (!atomic_read(&vma->vm_mm->uprobes_state.count))
-               return;
-
-       inode = vma->vm_file->f_mapping->host;
-       if (!inode)
+       if (!test_bit(MMF_HAS_UPROBES, &vma->vm_mm->flags))
                return;
 
-       mutex_lock(uprobes_mmap_hash(inode));
-       build_probe_list(inode, vma, start, end, &tmp_list);
-
-       list_for_each_entry_safe(uprobe, u, &tmp_list, pending_list) {
-               unsigned long vaddr = offset_to_vaddr(vma, uprobe->offset);
-               /*
-                * An unregister could have removed the probe before
-                * unmap. So check before we decrement the count.
-                */
-               if (is_swbp_at_addr(vma->vm_mm, vaddr) == 1)
-                       atomic_dec(&vma->vm_mm->uprobes_state.count);
-               put_uprobe(uprobe);
-       }
-       mutex_unlock(uprobes_mmap_hash(inode));
+       /* TODO: unmapping uprobe(s) will need more work */
 }
 
 /* Slot allocation for XOL */
@@ -1213,13 +1146,12 @@ void uprobe_clear_state(struct mm_struct *mm)
        kfree(area);
 }
 
-/*
- * uprobe_reset_state - Free the area allocated for slots.
- */
-void uprobe_reset_state(struct mm_struct *mm)
+void uprobe_dup_mmap(struct mm_struct *oldmm, struct mm_struct *newmm)
 {
-       mm->uprobes_state.xol_area = NULL;
-       atomic_set(&mm->uprobes_state.count, 0);
+       newmm->uprobes_state.xol_area = NULL;
+
+       if (test_bit(MMF_HAS_UPROBES, &oldmm->flags))
+               set_bit(MMF_HAS_UPROBES, &newmm->flags);
 }
 
 /*
@@ -1518,17 +1450,15 @@ cleanup_ret:
                utask->active_uprobe = NULL;
                utask->state = UTASK_RUNNING;
        }
-       if (uprobe) {
-               if (!(uprobe->flags & UPROBE_SKIP_SSTEP))
+       if (!(uprobe->flags & UPROBE_SKIP_SSTEP))
 
-                       /*
-                        * cannot singlestep; cannot skip instruction;
-                        * re-execute the instruction.
-                        */
-                       instruction_pointer_set(regs, bp_vaddr);
+               /*
+                * cannot singlestep; cannot skip instruction;
+                * re-execute the instruction.
+                */
+               instruction_pointer_set(regs, bp_vaddr);
 
-               put_uprobe(uprobe);
-       }
+       put_uprobe(uprobe);
 }
 
 /*
@@ -1589,8 +1519,7 @@ int uprobe_pre_sstep_notifier(struct pt_regs *regs)
 {
        struct uprobe_task *utask;
 
-       if (!current->mm || !atomic_read(&current->mm->uprobes_state.count))
-               /* task is currently not uprobed */
+       if (!current->mm || !test_bit(MMF_HAS_UPROBES, &current->mm->flags))
                return 0;
 
        utask = current->utask;
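
The net effect of this file's changes is that the per-mm atomic probe counter is replaced by a single MMF_HAS_UPROBES mm flag: set by the first install_breakpoint(), copied across fork by uprobe_dup_mmap(), and tested in the breakpoint notifier's fast path. A compressed restatement of the flag protocol (the helper names here are mine, for illustration only):

	#include <linux/sched.h>
	#include <linux/mm_types.h>

	/* Writer side: the first probe in this mm sets the flag. */
	static bool mark_mm_has_uprobes(struct mm_struct *mm)
	{
		bool first = !test_bit(MMF_HAS_UPROBES, &mm->flags);

		if (first)
			set_bit(MMF_HAS_UPROBES, &mm->flags);
		return first;	/* caller clears it again if set_swbp() fails */
	}

	/* Reader side: uprobe_pre_sstep_notifier() bails out early if unset. */
	static bool mm_may_have_uprobes(struct mm_struct *mm)
	{
		return mm && test_bit(MMF_HAS_UPROBES, &mm->flags);
	}
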
index 2c8857e12855393759562b3c6eeec2d23de6f080..2343c9eaaaf4e2e69def5b04b52bd7471305acbd 100644
@@ -353,6 +353,7 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
 
        down_write(&oldmm->mmap_sem);
        flush_cache_dup_mm(oldmm);
+       uprobe_dup_mmap(oldmm, mm);
        /*
         * Not linked in yet - no deadlock potential:
         */
@@ -454,9 +455,6 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
 
                if (retval)
                        goto out;
-
-               if (file)
-                       uprobe_mmap(tmp);
        }
        /* a new mm has just been created */
        arch_dup_mmap(oldmm, mm);
@@ -839,8 +837,6 @@ struct mm_struct *dup_mm(struct task_struct *tsk)
 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
        mm->pmd_huge_pte = NULL;
 #endif
-       uprobe_reset_state(mm);
-
        if (!mm_init(mm, tsk))
                goto fail_nomem;
 
index eebd6d5cfb44ce626f669029750112a0cad15525..57d86d07221e33c7863bfe7f789d6a2f129aaa2f 100644
@@ -671,6 +671,7 @@ irq_set_chip_and_handler_name(unsigned int irq, struct irq_chip *chip,
        irq_set_chip(irq, chip);
        __irq_set_handler(irq, handle, 0, name);
 }
+EXPORT_SYMBOL_GPL(irq_set_chip_and_handler_name);
 
 void irq_modify_status(unsigned int irq, unsigned long clr, unsigned long set)
 {
index b5fcd96c7102253acc93b1a7239d5a172f927d8f..988dc58e8847f6ebdbcd78348d9f527a9e4f2dfe 100644
@@ -6,6 +6,7 @@
  */
 #include <linux/interrupt.h>
 #include <linux/irq.h>
+#include <linux/export.h>
 
 #include "internals.h"
 
@@ -57,3 +58,4 @@ struct irq_chip dummy_irq_chip = {
        .irq_mask       = noop,
        .irq_unmask     = noop,
 };
+EXPORT_SYMBOL_GPL(dummy_irq_chip);
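
These two exports let modular interrupt demultiplexers wire up descriptors themselves. A minimal, hypothetical module sketch using both newly exported symbols (the descriptor allocation is illustrative):

	#include <linux/irq.h>
	#include <linux/module.h>

	static int __init demux_demo_init(void)
	{
		int irq;

		/* Allocate a descriptor on any node; demo only. */
		irq = irq_alloc_desc(-1);
		if (irq < 0)
			return irq;

		/*
		 * dummy_irq_chip provides no-op mask/unmask callbacks,
		 * enough for a chained demultiplexer with no real
		 * per-child hardware to program.
		 */
		irq_set_chip_and_handler_name(irq, &dummy_irq_chip,
					      handle_simple_irq, "demo");
		return 0;
	}
	module_init(demux_demo_init);
	MODULE_LICENSE("GPL");
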
index c62b8546cc90e0b39de0e824b0f7ed7179d8ba92..35b4315d84f578104a0ea322861d43773c52e625 100644
@@ -561,9 +561,9 @@ static __kprobes void kprobe_optimizer(struct work_struct *work)
 {
        LIST_HEAD(free_list);
 
+       mutex_lock(&kprobe_mutex);
        /* Lock modules while optimizing kprobes */
        mutex_lock(&module_mutex);
-       mutex_lock(&kprobe_mutex);
 
        /*
         * Step 1: Unoptimize kprobes and collect cleaned (unused and disarmed)
@@ -586,8 +586,8 @@ static __kprobes void kprobe_optimizer(struct work_struct *work)
        /* Step 4: Free cleaned kprobes after quiesence period */
        do_free_cleaned_kprobes(&free_list);
 
-       mutex_unlock(&kprobe_mutex);
        mutex_unlock(&module_mutex);
+       mutex_unlock(&kprobe_mutex);
 
        /* Step 5: Kick optimizer again if needed */
        if (!list_empty(&optimizing_list) || !list_empty(&unoptimizing_list))
@@ -759,20 +759,32 @@ static __kprobes void try_to_optimize_kprobe(struct kprobe *p)
        struct kprobe *ap;
        struct optimized_kprobe *op;
 
+       /* Impossible to optimize ftrace-based kprobe */
+       if (kprobe_ftrace(p))
+               return;
+
+       /* For preparing optimization, jump_label_text_reserved() is called */
+       jump_label_lock();
+       mutex_lock(&text_mutex);
+
        ap = alloc_aggr_kprobe(p);
        if (!ap)
-               return;
+               goto out;
 
        op = container_of(ap, struct optimized_kprobe, kp);
        if (!arch_prepared_optinsn(&op->optinsn)) {
                /* If failed to setup optimizing, fallback to kprobe */
                arch_remove_optimized_kprobe(op);
                kfree(op);
-               return;
+               goto out;
        }
 
        init_aggr_kprobe(ap, p);
-       optimize_kprobe(ap);
+       optimize_kprobe(ap);    /* This just kicks optimizer thread */
+
+out:
+       mutex_unlock(&text_mutex);
+       jump_label_unlock();
 }
 
 #ifdef CONFIG_SYSCTL
@@ -907,9 +919,64 @@ static __kprobes struct kprobe *alloc_aggr_kprobe(struct kprobe *p)
 }
 #endif /* CONFIG_OPTPROBES */
 
+#ifdef KPROBES_CAN_USE_FTRACE
+static struct ftrace_ops kprobe_ftrace_ops __read_mostly = {
+       .func = kprobe_ftrace_handler,
+       .flags = FTRACE_OPS_FL_SAVE_REGS,
+};
+static int kprobe_ftrace_enabled;
+
+/* Must ensure p->addr is really on ftrace */
+static int __kprobes prepare_kprobe(struct kprobe *p)
+{
+       if (!kprobe_ftrace(p))
+               return arch_prepare_kprobe(p);
+
+       return arch_prepare_kprobe_ftrace(p);
+}
+
+/* Caller must lock kprobe_mutex */
+static void __kprobes arm_kprobe_ftrace(struct kprobe *p)
+{
+       int ret;
+
+       ret = ftrace_set_filter_ip(&kprobe_ftrace_ops,
+                                  (unsigned long)p->addr, 0, 0);
+       WARN(ret < 0, "Failed to arm kprobe-ftrace at %p (%d)\n", p->addr, ret);
+       kprobe_ftrace_enabled++;
+       if (kprobe_ftrace_enabled == 1) {
+               ret = register_ftrace_function(&kprobe_ftrace_ops);
+               WARN(ret < 0, "Failed to init kprobe-ftrace (%d)\n", ret);
+       }
+}
+
+/* Caller must lock kprobe_mutex */
+static void __kprobes disarm_kprobe_ftrace(struct kprobe *p)
+{
+       int ret;
+
+       kprobe_ftrace_enabled--;
+       if (kprobe_ftrace_enabled == 0) {
+               ret = unregister_ftrace_function(&kprobe_ftrace_ops);
+               WARN(ret < 0, "Failed to unregister kprobe-ftrace (%d)\n", ret);
+       }
+       ret = ftrace_set_filter_ip(&kprobe_ftrace_ops,
+                          (unsigned long)p->addr, 1, 0);
+       WARN(ret < 0, "Failed to disarm kprobe-ftrace at %p (%d)\n", p->addr, ret);
+}
+#else  /* !KPROBES_CAN_USE_FTRACE */
+#define prepare_kprobe(p)      arch_prepare_kprobe(p)
+#define arm_kprobe_ftrace(p)   do {} while (0)
+#define disarm_kprobe_ftrace(p)        do {} while (0)
+#endif
+
 /* Arm a kprobe with text_mutex */
 static void __kprobes arm_kprobe(struct kprobe *kp)
 {
+       if (unlikely(kprobe_ftrace(kp))) {
+               arm_kprobe_ftrace(kp);
+               return;
+       }
        /*
         * Here, since __arm_kprobe() doesn't use stop_machine(),
         * this doesn't cause deadlock on text_mutex. So, we don't
@@ -921,11 +988,15 @@ static void __kprobes arm_kprobe(struct kprobe *kp)
 }
 
 /* Disarm a kprobe with text_mutex */
-static void __kprobes disarm_kprobe(struct kprobe *kp)
+static void __kprobes disarm_kprobe(struct kprobe *kp, bool reopt)
 {
+       if (unlikely(kprobe_ftrace(kp))) {
+               disarm_kprobe_ftrace(kp);
+               return;
+       }
        /* Ditto */
        mutex_lock(&text_mutex);
-       __disarm_kprobe(kp, true);
+       __disarm_kprobe(kp, reopt);
        mutex_unlock(&text_mutex);
 }
 
@@ -1144,12 +1215,6 @@ static int __kprobes add_new_kprobe(struct kprobe *ap, struct kprobe *p)
        if (p->post_handler && !ap->post_handler)
                ap->post_handler = aggr_post_handler;
 
-       if (kprobe_disabled(ap) && !kprobe_disabled(p)) {
-               ap->flags &= ~KPROBE_FLAG_DISABLED;
-               if (!kprobes_all_disarmed)
-                       /* Arm the breakpoint again. */
-                       __arm_kprobe(ap);
-       }
        return 0;
 }
 
@@ -1189,11 +1254,22 @@ static int __kprobes register_aggr_kprobe(struct kprobe *orig_p,
        int ret = 0;
        struct kprobe *ap = orig_p;
 
+       /* For preparing optimization, jump_label_text_reserved() is called */
+       jump_label_lock();
+       /*
+        * Get online CPUs to avoid a text_mutex deadlock with stop_machine,
+        * which is invoked by unoptimize_kprobe() in add_new_kprobe().
+        */
+       get_online_cpus();
+       mutex_lock(&text_mutex);
+
        if (!kprobe_aggrprobe(orig_p)) {
                /* If orig_p is not an aggr_kprobe, create new aggr_kprobe. */
                ap = alloc_aggr_kprobe(orig_p);
-               if (!ap)
-                       return -ENOMEM;
+               if (!ap) {
+                       ret = -ENOMEM;
+                       goto out;
+               }
                init_aggr_kprobe(ap, orig_p);
        } else if (kprobe_unused(ap))
                /* This probe is going to die. Rescue it */
@@ -1213,7 +1289,7 @@ static int __kprobes register_aggr_kprobe(struct kprobe *orig_p,
                         * free aggr_probe. It will be used next time, or
                         * freed by unregister_kprobe.
                         */
-                       return ret;
+                       goto out;
 
                /* Prepare optimized instructions if possible. */
                prepare_optimized_kprobe(ap);
@@ -1228,7 +1304,20 @@ static int __kprobes register_aggr_kprobe(struct kprobe *orig_p,
 
        /* Copy ap's insn slot to p */
        copy_kprobe(ap, p);
-       return add_new_kprobe(ap, p);
+       ret = add_new_kprobe(ap, p);
+
+out:
+       mutex_unlock(&text_mutex);
+       put_online_cpus();
+       jump_label_unlock();
+
+       if (ret == 0 && kprobe_disabled(ap) && !kprobe_disabled(p)) {
+               ap->flags &= ~KPROBE_FLAG_DISABLED;
+               if (!kprobes_all_disarmed)
+                       /* Arm the breakpoint again. */
+                       arm_kprobe(ap);
+       }
+       return ret;
 }
 
 static int __kprobes in_kprobes_functions(unsigned long addr)
@@ -1313,71 +1402,99 @@ static inline int check_kprobe_rereg(struct kprobe *p)
        return ret;
 }
 
-int __kprobes register_kprobe(struct kprobe *p)
+static __kprobes int check_kprobe_address_safe(struct kprobe *p,
+                                              struct module **probed_mod)
 {
        int ret = 0;
-       struct kprobe *old_p;
-       struct module *probed_mod;
-       kprobe_opcode_t *addr;
-
-       addr = kprobe_addr(p);
-       if (IS_ERR(addr))
-               return PTR_ERR(addr);
-       p->addr = addr;
+       unsigned long ftrace_addr;
 
-       ret = check_kprobe_rereg(p);
-       if (ret)
-               return ret;
+       /*
+        * If the address is located on an ftrace nop, the probe
+        * will be armed via ftrace rather than a breakpoint.
+        */
+       ftrace_addr = ftrace_location((unsigned long)p->addr);
+       if (ftrace_addr) {
+#ifdef KPROBES_CAN_USE_FTRACE
+               /* Given address is not on the instruction boundary */
+               if ((unsigned long)p->addr != ftrace_addr)
+                       return -EILSEQ;
+               /* break_handler (jprobe) cannot work with ftrace */
+               if (p->break_handler)
+                       return -EINVAL;
+               p->flags |= KPROBE_FLAG_FTRACE;
+#else  /* !KPROBES_CAN_USE_FTRACE */
+               return -EINVAL;
+#endif
+       }
 
        jump_label_lock();
        preempt_disable();
+
+       /* Ensure the address is not in a reserved area and is in kernel text */
        if (!kernel_text_address((unsigned long) p->addr) ||
            in_kprobes_functions((unsigned long) p->addr) ||
-           ftrace_text_reserved(p->addr, p->addr) ||
            jump_label_text_reserved(p->addr, p->addr)) {
                ret = -EINVAL;
-               goto cannot_probe;
+               goto out;
        }
 
-       /* User can pass only KPROBE_FLAG_DISABLED to register_kprobe */
-       p->flags &= KPROBE_FLAG_DISABLED;
-
-       /*
-        * Check if are we probing a module.
-        */
-       probed_mod = __module_text_address((unsigned long) p->addr);
-       if (probed_mod) {
-               /* Return -ENOENT if fail. */
-               ret = -ENOENT;
+       /* Check whether we are probing a module */
+       *probed_mod = __module_text_address((unsigned long) p->addr);
+       if (*probed_mod) {
                /*
                 * We must hold a refcount of the probed module while updating
                 * its code to prohibit unexpected unloading.
                 */
-               if (unlikely(!try_module_get(probed_mod)))
-                       goto cannot_probe;
+               if (unlikely(!try_module_get(*probed_mod))) {
+                       ret = -ENOENT;
+                       goto out;
+               }
 
                /*
                 * If the module freed .init.text, we couldn't insert
                 * kprobes in there.
                 */
-               if (within_module_init((unsigned long)p->addr, probed_mod) &&
-                   probed_mod->state != MODULE_STATE_COMING) {
-                       module_put(probed_mod);
-                       goto cannot_probe;
+               if (within_module_init((unsigned long)p->addr, *probed_mod) &&
+                   (*probed_mod)->state != MODULE_STATE_COMING) {
+                       module_put(*probed_mod);
+                       *probed_mod = NULL;
+                       ret = -ENOENT;
                }
-               /* ret will be updated by following code */
        }
+out:
        preempt_enable();
        jump_label_unlock();
 
+       return ret;
+}
+
+int __kprobes register_kprobe(struct kprobe *p)
+{
+       int ret;
+       struct kprobe *old_p;
+       struct module *probed_mod;
+       kprobe_opcode_t *addr;
+
+       /* Adjust probe address from symbol */
+       addr = kprobe_addr(p);
+       if (IS_ERR(addr))
+               return PTR_ERR(addr);
+       p->addr = addr;
+
+       ret = check_kprobe_rereg(p);
+       if (ret)
+               return ret;
+
+       /* User can pass only KPROBE_FLAG_DISABLED to register_kprobe */
+       p->flags &= KPROBE_FLAG_DISABLED;
        p->nmissed = 0;
        INIT_LIST_HEAD(&p->list);
-       mutex_lock(&kprobe_mutex);
 
-       jump_label_lock(); /* needed to call jump_label_text_reserved() */
+       ret = check_kprobe_address_safe(p, &probed_mod);
+       if (ret)
+               return ret;
 
-       get_online_cpus();      /* For avoiding text_mutex deadlock. */
-       mutex_lock(&text_mutex);
+       mutex_lock(&kprobe_mutex);
 
        old_p = get_kprobe(p->addr);
        if (old_p) {
@@ -1386,7 +1503,9 @@ int __kprobes register_kprobe(struct kprobe *p)
                goto out;
        }
 
-       ret = arch_prepare_kprobe(p);
+       mutex_lock(&text_mutex);        /* Avoiding text modification */
+       ret = prepare_kprobe(p);
+       mutex_unlock(&text_mutex);
        if (ret)
                goto out;
 
@@ -1395,26 +1514,18 @@ int __kprobes register_kprobe(struct kprobe *p)
                       &kprobe_table[hash_ptr(p->addr, KPROBE_HASH_BITS)]);
 
        if (!kprobes_all_disarmed && !kprobe_disabled(p))
-               __arm_kprobe(p);
+               arm_kprobe(p);
 
        /* Try to optimize kprobe */
        try_to_optimize_kprobe(p);
 
 out:
-       mutex_unlock(&text_mutex);
-       put_online_cpus();
-       jump_label_unlock();
        mutex_unlock(&kprobe_mutex);
 
        if (probed_mod)
                module_put(probed_mod);
 
        return ret;
-
-cannot_probe:
-       preempt_enable();
-       jump_label_unlock();
-       return ret;
 }
 EXPORT_SYMBOL_GPL(register_kprobe);
 
@@ -1451,7 +1562,7 @@ static struct kprobe *__kprobes __disable_kprobe(struct kprobe *p)
 
                /* Try to disarm and disable this/parent probe */
                if (p == orig_p || aggr_kprobe_disabled(orig_p)) {
-                       disarm_kprobe(orig_p);
+                       disarm_kprobe(orig_p, true);
                        orig_p->flags |= KPROBE_FLAG_DISABLED;
                }
        }
@@ -2049,10 +2160,11 @@ static void __kprobes report_probe(struct seq_file *pi, struct kprobe *p,
 
        if (!pp)
                pp = p;
-       seq_printf(pi, "%s%s%s\n",
+       seq_printf(pi, "%s%s%s%s\n",
                (kprobe_gone(p) ? "[GONE]" : ""),
                ((kprobe_disabled(p) && !kprobe_gone(p)) ?  "[DISABLED]" : ""),
-               (kprobe_optimized(pp) ? "[OPTIMIZED]" : ""));
+               (kprobe_optimized(pp) ? "[OPTIMIZED]" : ""),
+               (kprobe_ftrace(pp) ? "[FTRACE]" : ""));
 }
 
 static void __kprobes *kprobe_seq_start(struct seq_file *f, loff_t *pos)
@@ -2131,14 +2243,12 @@ static void __kprobes arm_all_kprobes(void)
                goto already_enabled;
 
        /* Arming kprobes doesn't optimize kprobe itself */
-       mutex_lock(&text_mutex);
        for (i = 0; i < KPROBE_TABLE_SIZE; i++) {
                head = &kprobe_table[i];
                hlist_for_each_entry_rcu(p, node, head, hlist)
                        if (!kprobe_disabled(p))
-                               __arm_kprobe(p);
+                               arm_kprobe(p);
        }
-       mutex_unlock(&text_mutex);
 
        kprobes_all_disarmed = false;
        printk(KERN_INFO "Kprobes globally enabled\n");
@@ -2166,15 +2276,13 @@ static void __kprobes disarm_all_kprobes(void)
        kprobes_all_disarmed = true;
        printk(KERN_INFO "Kprobes globally disabled\n");
 
-       mutex_lock(&text_mutex);
        for (i = 0; i < KPROBE_TABLE_SIZE; i++) {
                head = &kprobe_table[i];
                hlist_for_each_entry_rcu(p, node, head, hlist) {
                        if (!arch_trampoline_kprobe(p) && !kprobe_disabled(p))
-                               __disarm_kprobe(p, false);
+                               disarm_kprobe(p, false);
                }
        }
-       mutex_unlock(&text_mutex);
        mutex_unlock(&kprobe_mutex);
 
        /* Wait for disarming all kprobes by optimizer */
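
From the caller's perspective, registration is unchanged: if the probe address sits exactly on an ftrace call site, check_kprobe_address_safe() sets KPROBE_FLAG_FTRACE and arming goes through ftrace_set_filter_ip() instead of a breakpoint. A minimal registration sketch (the target symbol and handler body are illustrative):

	#include <linux/kprobes.h>
	#include <linux/module.h>

	static int demo_pre(struct kprobe *p, struct pt_regs *regs)
	{
		pr_info("hit %s\n", p->symbol_name);
		return 0;
	}

	static struct kprobe demo_kp = {
		.symbol_name = "do_fork",	/* illustrative target */
		.pre_handler = demo_pre,
		/* no break_handler: jprobes cannot use the ftrace path */
	};

	static int __init demo_init(void)
	{
		/* If the target starts with an ftrace nop, this probe gets
		 * KPROBE_FLAG_FTRACE and is armed via arm_kprobe_ftrace(). */
		return register_kprobe(&demo_kp);
	}

	static void __exit demo_exit(void)
	{
		unregister_kprobe(&demo_kp);
	}
	module_init(demo_init);
	module_exit(demo_exit);
	MODULE_LICENSE("GPL");
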
index b579af57ea107af579e6a155730258166dd4e176..146a6fa968254fe00b4aa6a4a423113f083b3e89 100644
@@ -37,11 +37,20 @@ struct kthread_create_info
 };
 
 struct kthread {
-       int should_stop;
+       unsigned long flags;
+       unsigned int cpu;
        void *data;
+       struct completion parked;
        struct completion exited;
 };
 
+enum KTHREAD_BITS {
+       KTHREAD_IS_PER_CPU = 0,
+       KTHREAD_SHOULD_STOP,
+       KTHREAD_SHOULD_PARK,
+       KTHREAD_IS_PARKED,
+};
+
 #define to_kthread(tsk)        \
        container_of((tsk)->vfork_done, struct kthread, exited)
 
@@ -52,12 +61,28 @@ struct kthread {
  * and this will return true.  You should then return, and your return
  * value will be passed through to kthread_stop().
  */
-int kthread_should_stop(void)
+bool kthread_should_stop(void)
 {
-       return to_kthread(current)->should_stop;
+       return test_bit(KTHREAD_SHOULD_STOP, &to_kthread(current)->flags);
 }
 EXPORT_SYMBOL(kthread_should_stop);
 
+/**
+ * kthread_should_park - should this kthread park now?
+ *
+ * When someone calls kthread_park() on your kthread, it will be woken
+ * and this will return true.  You should then do the necessary
+ * cleanup and call kthread_parkme().
+ *
+ * Similar to kthread_should_stop(), but this keeps the thread alive
+ * and in a park position. kthread_unpark() "restarts" the thread and
+ * calls the thread function again.
+ */
+bool kthread_should_park(void)
+{
+       return test_bit(KTHREAD_SHOULD_PARK, &to_kthread(current)->flags);
+}
+
 /**
  * kthread_freezable_should_stop - should this freezable kthread return now?
  * @was_frozen: optional out parameter, indicates whether %current was frozen
@@ -96,6 +121,24 @@ void *kthread_data(struct task_struct *task)
        return to_kthread(task)->data;
 }
 
+static void __kthread_parkme(struct kthread *self)
+{
+       __set_current_state(TASK_INTERRUPTIBLE);
+       while (test_bit(KTHREAD_SHOULD_PARK, &self->flags)) {
+               if (!test_and_set_bit(KTHREAD_IS_PARKED, &self->flags))
+                       complete(&self->parked);
+               schedule();
+               __set_current_state(TASK_INTERRUPTIBLE);
+       }
+       clear_bit(KTHREAD_IS_PARKED, &self->flags);
+       __set_current_state(TASK_RUNNING);
+}
+
+void kthread_parkme(void)
+{
+       __kthread_parkme(to_kthread(current));
+}
+
 static int kthread(void *_create)
 {
        /* Copy data: it's on kthread's stack */
@@ -105,9 +148,10 @@ static int kthread(void *_create)
        struct kthread self;
        int ret;
 
-       self.should_stop = 0;
+       self.flags = 0;
        self.data = data;
        init_completion(&self.exited);
+       init_completion(&self.parked);
        current->vfork_done = &self.exited;
 
        /* OK, tell user we're spawned, wait for stop or wakeup */
@@ -117,9 +161,11 @@ static int kthread(void *_create)
        schedule();
 
        ret = -EINTR;
-       if (!self.should_stop)
-               ret = threadfn(data);
 
+       if (!test_bit(KTHREAD_SHOULD_STOP, &self.flags)) {
+               __kthread_parkme(&self);
+               ret = threadfn(data);
+       }
        /* we can't just return, we must preserve "self" on stack */
        do_exit(ret);
 }
@@ -172,8 +218,7 @@ static void create_kthread(struct kthread_create_info *create)
  * Returns a task_struct or ERR_PTR(-ENOMEM).
  */
 struct task_struct *kthread_create_on_node(int (*threadfn)(void *data),
-                                          void *data,
-                                          int node,
+                                          void *data, int node,
                                           const char namefmt[],
                                           ...)
 {
@@ -210,6 +255,13 @@ struct task_struct *kthread_create_on_node(int (*threadfn)(void *data),
 }
 EXPORT_SYMBOL(kthread_create_on_node);
 
+static void __kthread_bind(struct task_struct *p, unsigned int cpu)
+{
+       /* It's safe because the task is inactive. */
+       do_set_cpus_allowed(p, cpumask_of(cpu));
+       p->flags |= PF_THREAD_BOUND;
+}
+
 /**
  * kthread_bind - bind a just-created kthread to a cpu.
  * @p: thread created by kthread_create().
@@ -226,13 +278,111 @@ void kthread_bind(struct task_struct *p, unsigned int cpu)
                WARN_ON(1);
                return;
        }
-
-       /* It's safe because the task is inactive. */
-       do_set_cpus_allowed(p, cpumask_of(cpu));
-       p->flags |= PF_THREAD_BOUND;
+       __kthread_bind(p, cpu);
 }
 EXPORT_SYMBOL(kthread_bind);
 
+/**
+ * kthread_create_on_cpu - Create a cpu bound kthread
+ * @threadfn: the function to run until signal_pending(current).
+ * @data: data ptr for @threadfn.
+ * @cpu: The cpu on which the thread should be bound.
+ * @namefmt: printf-style name for the thread. Format is restricted
+ *          to "name.*%u". Code fills in cpu number.
+ *
+ * Description: This helper function creates and names a kernel thread.
+ * The thread will be woken and put into park mode.
+ */
+struct task_struct *kthread_create_on_cpu(int (*threadfn)(void *data),
+                                         void *data, unsigned int cpu,
+                                         const char *namefmt)
+{
+       struct task_struct *p;
+
+       p = kthread_create_on_node(threadfn, data, cpu_to_node(cpu), namefmt,
+                                  cpu);
+       if (IS_ERR(p))
+               return p;
+       set_bit(KTHREAD_IS_PER_CPU, &to_kthread(p)->flags);
+       to_kthread(p)->cpu = cpu;
+       /* Park the thread to get it out of TASK_UNINTERRUPTIBLE state */
+       kthread_park(p);
+       return p;
+}
+
+static struct kthread *task_get_live_kthread(struct task_struct *k)
+{
+       struct kthread *kthread;
+
+       get_task_struct(k);
+       kthread = to_kthread(k);
+       /* It might have exited */
+       barrier();
+       if (k->vfork_done != NULL)
+               return kthread;
+       return NULL;
+}
+
+/**
+ * kthread_unpark - unpark a thread created by kthread_create().
+ * @k:         thread created by kthread_create().
+ *
+ * Sets kthread_should_park() for @k to return false, wakes it, and
+ * waits for it to return. If the thread is marked percpu then it is
+ * bound to the cpu again.
+ */
+void kthread_unpark(struct task_struct *k)
+{
+       struct kthread *kthread = task_get_live_kthread(k);
+
+       if (kthread) {
+               clear_bit(KTHREAD_SHOULD_PARK, &kthread->flags);
+               /*
+                * We clear the IS_PARKED bit here as we don't wait
+                * until the task has left the park code. So if we
+                * parked again before that happened, we'd see the
+                * IS_PARKED bit which might be about to be cleared.
+                */
+               if (test_and_clear_bit(KTHREAD_IS_PARKED, &kthread->flags)) {
+                       if (test_bit(KTHREAD_IS_PER_CPU, &kthread->flags))
+                               __kthread_bind(k, kthread->cpu);
+                       wake_up_process(k);
+               }
+       }
+       put_task_struct(k);
+}
+
+/**
+ * kthread_park - park a thread created by kthread_create().
+ * @k: thread created by kthread_create().
+ *
+ * Sets kthread_should_park() for @k to return true, wakes it, and
+ * waits for it to return. This can also be called after kthread_create()
+ * instead of calling wake_up_process(): the thread will park without
+ * calling threadfn().
+ *
+ * Returns 0 if the thread is parked, -ENOSYS if the thread exited.
+ * If called by the kthread itself, just the park bit is set.
+ */
+int kthread_park(struct task_struct *k)
+{
+       struct kthread *kthread = task_get_live_kthread(k);
+       int ret = -ENOSYS;
+
+       if (kthread) {
+               if (!test_bit(KTHREAD_IS_PARKED, &kthread->flags)) {
+                       set_bit(KTHREAD_SHOULD_PARK, &kthread->flags);
+                       if (k != current) {
+                               wake_up_process(k);
+                               wait_for_completion(&kthread->parked);
+                       }
+               }
+               ret = 0;
+       }
+       put_task_struct(k);
+       return ret;
+}
+
 /**
  * kthread_stop - stop a thread created by kthread_create().
  * @k: thread created by kthread_create().
@@ -250,16 +400,13 @@ EXPORT_SYMBOL(kthread_bind);
  */
 int kthread_stop(struct task_struct *k)
 {
-       struct kthread *kthread;
+       struct kthread *kthread = task_get_live_kthread(k);
        int ret;
 
        trace_sched_kthread_stop(k);
-       get_task_struct(k);
-
-       kthread = to_kthread(k);
-       barrier(); /* it might have exited */
-       if (k->vfork_done != NULL) {
-               kthread->should_stop = 1;
+       if (kthread) {
+               set_bit(KTHREAD_SHOULD_STOP, &kthread->flags);
+               clear_bit(KTHREAD_SHOULD_PARK, &kthread->flags);
                wake_up_process(k);
                wait_for_completion(&kthread->exited);
        }
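
The parking API gives per-cpu threads a way to survive CPU hotplug without being torn down. A condensed, hypothetical thread-function sketch showing the intended should_stop/should_park loop:

	#include <linux/kthread.h>
	#include <linux/sched.h>

	static int demo_percpu_thread(void *data)
	{
		while (!kthread_should_stop()) {
			if (kthread_should_park()) {
				/* Acknowledge via ->parked and sleep until
				 * kthread_unpark() rebinds and wakes us. */
				kthread_parkme();
				continue;
			}
			/* ... do the per-cpu work here ... */
			schedule_timeout_interruptible(HZ);
		}
		return 0;
	}

	/*
	 * Creation side (illustrative): the thread is created, bound to
	 * @cpu and immediately parked, so it does not run threadfn()
	 * until the first kthread_unpark():
	 *
	 *	struct task_struct *t;
	 *	t = kthread_create_on_cpu(demo_percpu_thread, NULL, cpu,
	 *				  "demo/%u");
	 *	if (!IS_ERR(t))
	 *		kthread_unpark(t);
	 */
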
index f280e542e3e9f531df83b03d1e1fce6cf8ebaaa2..11a4fdca1df747e4a21303b33b12df6785ea1ee6 100644
@@ -133,13 +133,12 @@ static int rcu_scheduler_fully_active __read_mostly;
  */
 static DEFINE_PER_CPU(struct task_struct *, rcu_cpu_kthread_task);
 DEFINE_PER_CPU(unsigned int, rcu_cpu_kthread_status);
-DEFINE_PER_CPU(int, rcu_cpu_kthread_cpu);
 DEFINE_PER_CPU(unsigned int, rcu_cpu_kthread_loops);
 DEFINE_PER_CPU(char, rcu_cpu_has_work);
 
 #endif /* #ifdef CONFIG_RCU_BOOST */
 
-static void rcu_node_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu);
+static void rcu_boost_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu);
 static void invoke_rcu_core(void);
 static void invoke_rcu_callbacks(struct rcu_state *rsp, struct rcu_data *rdp);
 
@@ -1468,8 +1467,7 @@ static void rcu_cleanup_dead_cpu(int cpu, struct rcu_state *rsp)
        struct rcu_node *rnp = rdp->mynode;  /* Outgoing CPU's rdp & rnp. */
 
        /* Adjust any no-longer-needed kthreads. */
-       rcu_stop_cpu_kthread(cpu);
-       rcu_node_kthread_setaffinity(rnp, -1);
+       rcu_boost_kthread_setaffinity(rnp, -1);
 
        /* Remove the dead CPU from the bitmasks in the rcu_node hierarchy. */
 
@@ -2594,12 +2592,10 @@ static int __cpuinit rcu_cpu_notify(struct notifier_block *self,
                break;
        case CPU_ONLINE:
        case CPU_DOWN_FAILED:
-               rcu_node_kthread_setaffinity(rnp, -1);
-               rcu_cpu_kthread_setrt(cpu, 1);
+               rcu_boost_kthread_setaffinity(rnp, -1);
                break;
        case CPU_DOWN_PREPARE:
-               rcu_node_kthread_setaffinity(rnp, cpu);
-               rcu_cpu_kthread_setrt(cpu, 0);
+               rcu_boost_kthread_setaffinity(rnp, cpu);
                break;
        case CPU_DYING:
        case CPU_DYING_FROZEN:
index 4d29169f212468bdc6f8dd17311ecf8bdd6850a8..1224d4c053823fe7ecdbc2bab9590779c5cc9b7c 100644
@@ -196,12 +196,6 @@ struct rcu_node {
                                /* Refused to boost: not sure why, though. */
                                /*  This can happen due to race conditions. */
 #endif /* #ifdef CONFIG_RCU_BOOST */
-       struct task_struct *node_kthread_task;
-                               /* kthread that takes care of this rcu_node */
-                               /*  structure, for example, awakening the */
-                               /*  per-CPU kthreads as needed. */
-       unsigned int node_kthread_status;
-                               /* State of node_kthread_task for tracing. */
 } ____cacheline_internodealigned_in_smp;
 
 /*
@@ -468,7 +462,6 @@ static int rcu_preempt_blocked_readers_cgp(struct rcu_node *rnp);
 #ifdef CONFIG_HOTPLUG_CPU
 static void rcu_report_unblock_qs_rnp(struct rcu_node *rnp,
                                      unsigned long flags);
-static void rcu_stop_cpu_kthread(int cpu);
 #endif /* #ifdef CONFIG_HOTPLUG_CPU */
 static void rcu_print_detail_task_stall(struct rcu_state *rsp);
 static int rcu_print_task_stall(struct rcu_node *rnp);
@@ -491,15 +484,9 @@ static void invoke_rcu_callbacks_kthread(void);
 static bool rcu_is_callbacks_kthread(void);
 #ifdef CONFIG_RCU_BOOST
 static void rcu_preempt_do_callbacks(void);
-static void rcu_boost_kthread_setaffinity(struct rcu_node *rnp,
-                                         cpumask_var_t cm);
 static int __cpuinit rcu_spawn_one_boost_kthread(struct rcu_state *rsp,
-                                                struct rcu_node *rnp,
-                                                int rnp_index);
-static void invoke_rcu_node_kthread(struct rcu_node *rnp);
-static void rcu_yield(void (*f)(unsigned long), unsigned long arg);
+                                                struct rcu_node *rnp);
 #endif /* #ifdef CONFIG_RCU_BOOST */
-static void rcu_cpu_kthread_setrt(int cpu, int to_rt);
 static void __cpuinit rcu_prepare_kthreads(int cpu);
 static void rcu_prepare_for_idle_init(int cpu);
 static void rcu_cleanup_after_idle(int cpu);
index 7f3244c0df014a7673831159ae1dbc25ceff55c8..c1961aed12138a39e04ee352d1a070dd238dceb1 100644
@@ -25,6 +25,7 @@
  */
 
 #include <linux/delay.h>
+#include <linux/smpboot.h>
 
 #define RCU_KTHREAD_PRIO 1
 
@@ -1069,6 +1070,16 @@ static void rcu_initiate_boost_trace(struct rcu_node *rnp)
 
 #endif /* #else #ifdef CONFIG_RCU_TRACE */
 
+static void rcu_wake_cond(struct task_struct *t, int status)
+{
+       /*
+        * If the thread is yielding, only wake it when this
+        * is invoked from idle.
+        */
+       if (status != RCU_KTHREAD_YIELDING || is_idle_task(current))
+               wake_up_process(t);
+}
+
 /*
  * Carry out RCU priority boosting on the task indicated by ->exp_tasks
  * or ->boost_tasks, advancing the pointer to the next task in the
@@ -1140,17 +1151,6 @@ static int rcu_boost(struct rcu_node *rnp)
               ACCESS_ONCE(rnp->boost_tasks) != NULL;
 }
 
-/*
- * Timer handler to initiate waking up of boost kthreads that
- * have yielded the CPU due to excessive numbers of tasks to
- * boost.  We wake up the per-rcu_node kthread, which in turn
- * will wake up the booster kthread.
- */
-static void rcu_boost_kthread_timer(unsigned long arg)
-{
-       invoke_rcu_node_kthread((struct rcu_node *)arg);
-}
-
 /*
  * Priority-boosting kthread.  One per leaf rcu_node and one for the
  * root rcu_node.
@@ -1174,8 +1174,9 @@ static int rcu_boost_kthread(void *arg)
                else
                        spincnt = 0;
                if (spincnt > 10) {
+                       rnp->boost_kthread_status = RCU_KTHREAD_YIELDING;
                        trace_rcu_utilization("End boost kthread@rcu_yield");
-                       rcu_yield(rcu_boost_kthread_timer, (unsigned long)rnp);
+                       schedule_timeout_interruptible(2);
                        trace_rcu_utilization("Start boost kthread@rcu_yield");
                        spincnt = 0;
                }
@@ -1213,8 +1214,8 @@ static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags)
                        rnp->boost_tasks = rnp->gp_tasks;
                raw_spin_unlock_irqrestore(&rnp->lock, flags);
                t = rnp->boost_kthread_task;
-               if (t != NULL)
-                       wake_up_process(t);
+               if (t)
+                       rcu_wake_cond(t, rnp->boost_kthread_status);
        } else {
                rcu_initiate_boost_trace(rnp);
                raw_spin_unlock_irqrestore(&rnp->lock, flags);
@@ -1231,8 +1232,10 @@ static void invoke_rcu_callbacks_kthread(void)
        local_irq_save(flags);
        __this_cpu_write(rcu_cpu_has_work, 1);
        if (__this_cpu_read(rcu_cpu_kthread_task) != NULL &&
-           current != __this_cpu_read(rcu_cpu_kthread_task))
-               wake_up_process(__this_cpu_read(rcu_cpu_kthread_task));
+           current != __this_cpu_read(rcu_cpu_kthread_task)) {
+               rcu_wake_cond(__this_cpu_read(rcu_cpu_kthread_task),
+                             __this_cpu_read(rcu_cpu_kthread_status));
+       }
        local_irq_restore(flags);
 }
 
@@ -1245,21 +1248,6 @@ static bool rcu_is_callbacks_kthread(void)
        return __get_cpu_var(rcu_cpu_kthread_task) == current;
 }
 
-/*
- * Set the affinity of the boost kthread.  The CPU-hotplug locks are
- * held, so no one should be messing with the existence of the boost
- * kthread.
- */
-static void rcu_boost_kthread_setaffinity(struct rcu_node *rnp,
-                                         cpumask_var_t cm)
-{
-       struct task_struct *t;
-
-       t = rnp->boost_kthread_task;
-       if (t != NULL)
-               set_cpus_allowed_ptr(rnp->boost_kthread_task, cm);
-}
-
 #define RCU_BOOST_DELAY_JIFFIES DIV_ROUND_UP(CONFIG_RCU_BOOST_DELAY * HZ, 1000)
 
 /*
@@ -1276,15 +1264,19 @@ static void rcu_preempt_boost_start_gp(struct rcu_node *rnp)
  * Returns zero if all is well, a negated errno otherwise.
  */
 static int __cpuinit rcu_spawn_one_boost_kthread(struct rcu_state *rsp,
-                                                struct rcu_node *rnp,
-                                                int rnp_index)
+                                                struct rcu_node *rnp)
 {
+       int rnp_index = rnp - &rsp->node[0];
        unsigned long flags;
        struct sched_param sp;
        struct task_struct *t;
 
        if (&rcu_preempt_state != rsp)
                return 0;
+
+       if (!rcu_scheduler_fully_active || rnp->qsmaskinit == 0)
+               return 0;
+
        rsp->boost = 1;
        if (rnp->boost_kthread_task != NULL)
                return 0;
@@ -1301,25 +1293,6 @@ static int __cpuinit rcu_spawn_one_boost_kthread(struct rcu_state *rsp,
        return 0;
 }
 
-#ifdef CONFIG_HOTPLUG_CPU
-
-/*
- * Stop the RCU's per-CPU kthread when its CPU goes offline,.
- */
-static void rcu_stop_cpu_kthread(int cpu)
-{
-       struct task_struct *t;
-
-       /* Stop the CPU's kthread. */
-       t = per_cpu(rcu_cpu_kthread_task, cpu);
-       if (t != NULL) {
-               per_cpu(rcu_cpu_kthread_task, cpu) = NULL;
-               kthread_stop(t);
-       }
-}
-
-#endif /* #ifdef CONFIG_HOTPLUG_CPU */
-
 static void rcu_kthread_do_work(void)
 {
        rcu_do_batch(&rcu_sched_state, &__get_cpu_var(rcu_sched_data));
@@ -1327,112 +1300,22 @@ static void rcu_kthread_do_work(void)
        rcu_preempt_do_callbacks();
 }
 
-/*
- * Wake up the specified per-rcu_node-structure kthread.
- * Because the per-rcu_node kthreads are immortal, we don't need
- * to do anything to keep them alive.
- */
-static void invoke_rcu_node_kthread(struct rcu_node *rnp)
+static void rcu_cpu_kthread_setup(unsigned int cpu)
 {
-       struct task_struct *t;
-
-       t = rnp->node_kthread_task;
-       if (t != NULL)
-               wake_up_process(t);
-}
-
-/*
- * Set the specified CPU's kthread to run RT or not, as specified by
- * the to_rt argument.  The CPU-hotplug locks are held, so the task
- * is not going away.
- */
-static void rcu_cpu_kthread_setrt(int cpu, int to_rt)
-{
-       int policy;
        struct sched_param sp;
-       struct task_struct *t;
-
-       t = per_cpu(rcu_cpu_kthread_task, cpu);
-       if (t == NULL)
-               return;
-       if (to_rt) {
-               policy = SCHED_FIFO;
-               sp.sched_priority = RCU_KTHREAD_PRIO;
-       } else {
-               policy = SCHED_NORMAL;
-               sp.sched_priority = 0;
-       }
-       sched_setscheduler_nocheck(t, policy, &sp);
-}
-
-/*
- * Timer handler to initiate the waking up of per-CPU kthreads that
- * have yielded the CPU due to excess numbers of RCU callbacks.
- * We wake up the per-rcu_node kthread, which in turn will wake up
- * the booster kthread.
- */
-static void rcu_cpu_kthread_timer(unsigned long arg)
-{
-       struct rcu_data *rdp = per_cpu_ptr(rcu_state->rda, arg);
-       struct rcu_node *rnp = rdp->mynode;
 
-       atomic_or(rdp->grpmask, &rnp->wakemask);
-       invoke_rcu_node_kthread(rnp);
+       sp.sched_priority = RCU_KTHREAD_PRIO;
+       sched_setscheduler_nocheck(current, SCHED_FIFO, &sp);
 }
 
-/*
- * Drop to non-real-time priority and yield, but only after posting a
- * timer that will cause us to regain our real-time priority if we
- * remain preempted.  Either way, we restore our real-time priority
- * before returning.
- */
-static void rcu_yield(void (*f)(unsigned long), unsigned long arg)
+static void rcu_cpu_kthread_park(unsigned int cpu)
 {
-       struct sched_param sp;
-       struct timer_list yield_timer;
-       int prio = current->rt_priority;
-
-       setup_timer_on_stack(&yield_timer, f, arg);
-       mod_timer(&yield_timer, jiffies + 2);
-       sp.sched_priority = 0;
-       sched_setscheduler_nocheck(current, SCHED_NORMAL, &sp);
-       set_user_nice(current, 19);
-       schedule();
-       set_user_nice(current, 0);
-       sp.sched_priority = prio;
-       sched_setscheduler_nocheck(current, SCHED_FIFO, &sp);
-       del_timer(&yield_timer);
+       per_cpu(rcu_cpu_kthread_status, cpu) = RCU_KTHREAD_OFFCPU;
 }
 
-/*
- * Handle cases where the rcu_cpu_kthread() ends up on the wrong CPU.
- * This can happen while the corresponding CPU is either coming online
- * or going offline.  We cannot wait until the CPU is fully online
- * before starting the kthread, because the various notifier functions
- * can wait for RCU grace periods.  So we park rcu_cpu_kthread() until
- * the corresponding CPU is online.
- *
- * Return 1 if the kthread needs to stop, 0 otherwise.
- *
- * Caller must disable bh.  This function can momentarily enable it.
- */
-static int rcu_cpu_kthread_should_stop(int cpu)
+static int rcu_cpu_kthread_should_run(unsigned int cpu)
 {
-       while (cpu_is_offline(cpu) ||
-              !cpumask_equal(&current->cpus_allowed, cpumask_of(cpu)) ||
-              smp_processor_id() != cpu) {
-               if (kthread_should_stop())
-                       return 1;
-               per_cpu(rcu_cpu_kthread_status, cpu) = RCU_KTHREAD_OFFCPU;
-               per_cpu(rcu_cpu_kthread_cpu, cpu) = raw_smp_processor_id();
-               local_bh_enable();
-               schedule_timeout_uninterruptible(1);
-               if (!cpumask_equal(&current->cpus_allowed, cpumask_of(cpu)))
-                       set_cpus_allowed_ptr(current, cpumask_of(cpu));
-               local_bh_disable();
-       }
-       per_cpu(rcu_cpu_kthread_cpu, cpu) = cpu;
-       return 0;
+       return __get_cpu_var(rcu_cpu_has_work);
 }
 
 /*
@@ -1440,138 +1323,35 @@ static int rcu_cpu_kthread_should_stop(int cpu)
  * RCU softirq used in flavors and configurations of RCU that do not
  * support RCU priority boosting.
  */
-static int rcu_cpu_kthread(void *arg)
+static void rcu_cpu_kthread(unsigned int cpu)
 {
-       int cpu = (int)(long)arg;
-       unsigned long flags;
-       int spincnt = 0;
-       unsigned int *statusp = &per_cpu(rcu_cpu_kthread_status, cpu);
-       char work;
-       char *workp = &per_cpu(rcu_cpu_has_work, cpu);
+       unsigned int *statusp = &__get_cpu_var(rcu_cpu_kthread_status);
+       char work, *workp = &__get_cpu_var(rcu_cpu_has_work);
+       int spincnt;
 
-       trace_rcu_utilization("Start CPU kthread@init");
-       for (;;) {
-               *statusp = RCU_KTHREAD_WAITING;
-               trace_rcu_utilization("End CPU kthread@rcu_wait");
-               rcu_wait(*workp != 0 || kthread_should_stop());
+       for (spincnt = 0; spincnt < 10; spincnt++) {
                trace_rcu_utilization("Start CPU kthread@rcu_wait");
                local_bh_disable();
-               if (rcu_cpu_kthread_should_stop(cpu)) {
-                       local_bh_enable();
-                       break;
-               }
                *statusp = RCU_KTHREAD_RUNNING;
-               per_cpu(rcu_cpu_kthread_loops, cpu)++;
-               local_irq_save(flags);
+               this_cpu_inc(rcu_cpu_kthread_loops);
+               local_irq_disable();
                work = *workp;
                *workp = 0;
-               local_irq_restore(flags);
+               local_irq_enable();
                if (work)
                        rcu_kthread_do_work();
                local_bh_enable();
-               if (*workp != 0)
-                       spincnt++;
-               else
-                       spincnt = 0;
-               if (spincnt > 10) {
-                       *statusp = RCU_KTHREAD_YIELDING;
-                       trace_rcu_utilization("End CPU kthread@rcu_yield");
-                       rcu_yield(rcu_cpu_kthread_timer, (unsigned long)cpu);
-                       trace_rcu_utilization("Start CPU kthread@rcu_yield");
-                       spincnt = 0;
-               }
-       }
-       *statusp = RCU_KTHREAD_STOPPED;
-       trace_rcu_utilization("End CPU kthread@term");
-       return 0;
-}
-
-/*
- * Spawn a per-CPU kthread, setting up affinity and priority.
- * Because the CPU hotplug lock is held, no other CPU will be attempting
- * to manipulate rcu_cpu_kthread_task.  There might be another CPU
- * attempting to access it during boot, but the locking in kthread_bind()
- * will enforce sufficient ordering.
- *
- * Please note that we cannot simply refuse to wake up the per-CPU
- * kthread because kthreads are created in TASK_UNINTERRUPTIBLE state,
- * which can result in softlockup complaints if the task ends up being
- * idle for more than a couple of minutes.
- *
- * However, please note also that we cannot bind the per-CPU kthread to its
- * CPU until that CPU is fully online.  We also cannot wait until the
- * CPU is fully online before we create its per-CPU kthread, as this would
- * deadlock the system when CPU notifiers tried waiting for grace
- * periods.  So we bind the per-CPU kthread to its CPU only if the CPU
- * is online.  If its CPU is not yet fully online, then the code in
- * rcu_cpu_kthread() will wait until it is fully online, and then do
- * the binding.
- */
-static int __cpuinit rcu_spawn_one_cpu_kthread(int cpu)
-{
-       struct sched_param sp;
-       struct task_struct *t;
-
-       if (!rcu_scheduler_fully_active ||
-           per_cpu(rcu_cpu_kthread_task, cpu) != NULL)
-               return 0;
-       t = kthread_create_on_node(rcu_cpu_kthread,
-                                  (void *)(long)cpu,
-                                  cpu_to_node(cpu),
-                                  "rcuc/%d", cpu);
-       if (IS_ERR(t))
-               return PTR_ERR(t);
-       if (cpu_online(cpu))
-               kthread_bind(t, cpu);
-       per_cpu(rcu_cpu_kthread_cpu, cpu) = cpu;
-       WARN_ON_ONCE(per_cpu(rcu_cpu_kthread_task, cpu) != NULL);
-       sp.sched_priority = RCU_KTHREAD_PRIO;
-       sched_setscheduler_nocheck(t, SCHED_FIFO, &sp);
-       per_cpu(rcu_cpu_kthread_task, cpu) = t;
-       wake_up_process(t); /* Get to TASK_INTERRUPTIBLE quickly. */
-       return 0;
-}
-
-/*
- * Per-rcu_node kthread, which is in charge of waking up the per-CPU
- * kthreads when needed.  We ignore requests to wake up kthreads
- * for offline CPUs, which is OK because force_quiescent_state()
- * takes care of this case.
- */
-static int rcu_node_kthread(void *arg)
-{
-       int cpu;
-       unsigned long flags;
-       unsigned long mask;
-       struct rcu_node *rnp = (struct rcu_node *)arg;
-       struct sched_param sp;
-       struct task_struct *t;
-
-       for (;;) {
-               rnp->node_kthread_status = RCU_KTHREAD_WAITING;
-               rcu_wait(atomic_read(&rnp->wakemask) != 0);
-               rnp->node_kthread_status = RCU_KTHREAD_RUNNING;
-               raw_spin_lock_irqsave(&rnp->lock, flags);
-               mask = atomic_xchg(&rnp->wakemask, 0);
-               rcu_initiate_boost(rnp, flags); /* releases rnp->lock. */
-               for (cpu = rnp->grplo; cpu <= rnp->grphi; cpu++, mask >>= 1) {
-                       if ((mask & 0x1) == 0)
-                               continue;
-                       preempt_disable();
-                       t = per_cpu(rcu_cpu_kthread_task, cpu);
-                       if (!cpu_online(cpu) || t == NULL) {
-                               preempt_enable();
-                               continue;
-                       }
-                       per_cpu(rcu_cpu_has_work, cpu) = 1;
-                       sp.sched_priority = RCU_KTHREAD_PRIO;
-                       sched_setscheduler_nocheck(t, SCHED_FIFO, &sp);
-                       preempt_enable();
+               if (*workp == 0) {
+                       trace_rcu_utilization("End CPU kthread@rcu_wait");
+                       *statusp = RCU_KTHREAD_WAITING;
+                       return;
                }
        }
-       /* NOTREACHED */
-       rnp->node_kthread_status = RCU_KTHREAD_STOPPED;
-       return 0;
+       *statusp = RCU_KTHREAD_YIELDING;
+       trace_rcu_utilization("Start CPU kthread@rcu_yield");
+       schedule_timeout_interruptible(2);
+       trace_rcu_utilization("End CPU kthread@rcu_yield");
+       *statusp = RCU_KTHREAD_WAITING;
 }
 
 /*
@@ -1583,17 +1363,17 @@ static int rcu_node_kthread(void *arg)
  * no outgoing CPU.  If there are no CPUs left in the affinity set,
  * this function allows the kthread to execute on any CPU.
  */
-static void rcu_node_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu)
+static void rcu_boost_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu)
 {
+       struct task_struct *t = rnp->boost_kthread_task;
+       unsigned long mask = rnp->qsmaskinit;
        cpumask_var_t cm;
        int cpu;
-       unsigned long mask = rnp->qsmaskinit;
 
-       if (rnp->node_kthread_task == NULL)
+       if (!t)
                return;
-       if (!alloc_cpumask_var(&cm, GFP_KERNEL))
+       if (!zalloc_cpumask_var(&cm, GFP_KERNEL))
                return;
-       cpumask_clear(cm);
        for (cpu = rnp->grplo; cpu <= rnp->grphi; cpu++, mask >>= 1)
                if ((mask & 0x1) && cpu != outgoingcpu)
                        cpumask_set_cpu(cpu, cm);
@@ -1603,62 +1383,36 @@ static void rcu_node_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu)
                        cpumask_clear_cpu(cpu, cm);
                WARN_ON_ONCE(cpumask_weight(cm) == 0);
        }
-       set_cpus_allowed_ptr(rnp->node_kthread_task, cm);
-       rcu_boost_kthread_setaffinity(rnp, cm);
+       set_cpus_allowed_ptr(t, cm);
        free_cpumask_var(cm);
 }
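
The affinity helper above now uses zalloc_cpumask_var(), which returns an already-zeroed mask and so replaces the old alloc_cpumask_var() + cpumask_clear() pair. A generic usage sketch of that cpumask pattern (t and cpu are placeholders):

cpumask_var_t cm;

if (!zalloc_cpumask_var(&cm, GFP_KERNEL))       /* allocate zeroed mask */
        return;
cpumask_set_cpu(cpu, cm);                       /* add one CPU to the mask */
set_cpus_allowed_ptr(t, cm);                    /* restrict task t to it */
free_cpumask_var(cm);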
 
-/*
- * Spawn a per-rcu_node kthread, setting priority and affinity.
- * Called during boot before online/offline can happen, or, if
- * during runtime, with the main CPU-hotplug locks held.  So only
- * one of these can be executing at a time.
- */
-static int __cpuinit rcu_spawn_one_node_kthread(struct rcu_state *rsp,
-                                               struct rcu_node *rnp)
-{
-       unsigned long flags;
-       int rnp_index = rnp - &rsp->node[0];
-       struct sched_param sp;
-       struct task_struct *t;
-
-       if (!rcu_scheduler_fully_active ||
-           rnp->qsmaskinit == 0)
-               return 0;
-       if (rnp->node_kthread_task == NULL) {
-               t = kthread_create(rcu_node_kthread, (void *)rnp,
-                                  "rcun/%d", rnp_index);
-               if (IS_ERR(t))
-                       return PTR_ERR(t);
-               raw_spin_lock_irqsave(&rnp->lock, flags);
-               rnp->node_kthread_task = t;
-               raw_spin_unlock_irqrestore(&rnp->lock, flags);
-               sp.sched_priority = 99;
-               sched_setscheduler_nocheck(t, SCHED_FIFO, &sp);
-               wake_up_process(t); /* get to TASK_INTERRUPTIBLE quickly. */
-       }
-       return rcu_spawn_one_boost_kthread(rsp, rnp, rnp_index);
-}
+static struct smp_hotplug_thread rcu_cpu_thread_spec = {
+       .store                  = &rcu_cpu_kthread_task,
+       .thread_should_run      = rcu_cpu_kthread_should_run,
+       .thread_fn              = rcu_cpu_kthread,
+       .thread_comm            = "rcuc/%u",
+       .setup                  = rcu_cpu_kthread_setup,
+       .park                   = rcu_cpu_kthread_park,
+};
 
 /*
  * Spawn all kthreads -- called as soon as the scheduler is running.
  */
 static int __init rcu_spawn_kthreads(void)
 {
-       int cpu;
        struct rcu_node *rnp;
+       int cpu;
 
        rcu_scheduler_fully_active = 1;
-       for_each_possible_cpu(cpu) {
+       for_each_possible_cpu(cpu)
                per_cpu(rcu_cpu_has_work, cpu) = 0;
-               if (cpu_online(cpu))
-                       (void)rcu_spawn_one_cpu_kthread(cpu);
-       }
+       BUG_ON(smpboot_register_percpu_thread(&rcu_cpu_thread_spec));
        rnp = rcu_get_root(rcu_state);
-       (void)rcu_spawn_one_node_kthread(rcu_state, rnp);
+       (void)rcu_spawn_one_boost_kthread(rcu_state, rnp);
        if (NUM_RCU_NODES > 1) {
                rcu_for_each_leaf_node(rcu_state, rnp)
-                       (void)rcu_spawn_one_node_kthread(rcu_state, rnp);
+                       (void)rcu_spawn_one_boost_kthread(rcu_state, rnp);
        }
        return 0;
 }
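
rcu_spawn_kthreads() now delegates per-CPU thread lifetime management to the generic smpboot infrastructure through the rcu_cpu_thread_spec above. A minimal hypothetical client of the same API (all demo_* names are illustrative): the smpboot core creates, parks, and unparks one thread per CPU, while the client supplies only the work test and the work function.

static DEFINE_PER_CPU(struct task_struct *, demo_task);
static DEFINE_PER_CPU(int, demo_has_work);

static int demo_should_run(unsigned int cpu)
{
        /* Nonzero return: the core calls demo_thread_fn() on this CPU. */
        return __get_cpu_var(demo_has_work);
}

static void demo_thread_fn(unsigned int cpu)
{
        __get_cpu_var(demo_has_work) = 0;       /* consume the pending work */
        /* ... do the actual per-CPU work here ... */
}

static struct smp_hotplug_thread demo_threads = {
        .store                  = &demo_task,
        .thread_should_run      = demo_should_run,
        .thread_fn              = demo_thread_fn,
        .thread_comm            = "demo/%u",
};

static int __init demo_init(void)
{
        BUG_ON(smpboot_register_percpu_thread(&demo_threads));
        return 0;
}
early_initcall(demo_init);
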
@@ -1670,11 +1424,8 @@ static void __cpuinit rcu_prepare_kthreads(int cpu)
        struct rcu_node *rnp = rdp->mynode;
 
        /* Fire up the incoming CPU's kthread and leaf rcu_node kthread. */
-       if (rcu_scheduler_fully_active) {
-               (void)rcu_spawn_one_cpu_kthread(cpu);
-               if (rnp->node_kthread_task == NULL)
-                       (void)rcu_spawn_one_node_kthread(rcu_state, rnp);
-       }
+       if (rcu_scheduler_fully_active)
+               (void)rcu_spawn_one_boost_kthread(rcu_state, rnp);
 }
 
 #else /* #ifdef CONFIG_RCU_BOOST */
@@ -1698,19 +1449,7 @@ static void rcu_preempt_boost_start_gp(struct rcu_node *rnp)
 {
 }
 
-#ifdef CONFIG_HOTPLUG_CPU
-
-static void rcu_stop_cpu_kthread(int cpu)
-{
-}
-
-#endif /* #ifdef CONFIG_HOTPLUG_CPU */
-
-static void rcu_node_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu)
-{
-}
-
-static void rcu_cpu_kthread_setrt(int cpu, int to_rt)
+static void rcu_boost_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu)
 {
 }
 
index abffb486e94ed7ea581b54861f253ca8567bddda..31968931f14647c9d60250bf63ae3271875f10c5 100644 (file)
@@ -108,11 +108,10 @@ static void print_one_rcu_data(struct seq_file *m, struct rcu_data *rdp)
                        rdp->nxttail[RCU_WAIT_TAIL]],
                   ".D"[&rdp->nxtlist != rdp->nxttail[RCU_DONE_TAIL]]);
 #ifdef CONFIG_RCU_BOOST
-       seq_printf(m, " kt=%d/%c/%d ktl=%x",
+       seq_printf(m, " kt=%d/%c ktl=%x",
                   per_cpu(rcu_cpu_has_work, rdp->cpu),
                   convert_kthread_status(per_cpu(rcu_cpu_kthread_status,
                                          rdp->cpu)),
-                  per_cpu(rcu_cpu_kthread_cpu, rdp->cpu),
                   per_cpu(rcu_cpu_kthread_loops, rdp->cpu) & 0xffff);
 #endif /* #ifdef CONFIG_RCU_BOOST */
        seq_printf(m, " b=%ld", rdp->blimit);
index 173ea52f3af0ba450c371e69bf93c8c42ee66eaa..f06d249e103b6587d78859a48354f5542d98176e 100644 (file)
@@ -11,7 +11,7 @@ ifneq ($(CONFIG_SCHED_OMIT_FRAME_POINTER),y)
 CFLAGS_core.o := $(PROFILING) -fno-omit-frame-pointer
 endif
 
-obj-y += core.o clock.o idle_task.o fair.o rt.o stop_task.o
+obj-y += core.o clock.o cputime.o idle_task.o fair.o rt.o stop_task.o
 obj-$(CONFIG_SMP) += cpupri.o
 obj-$(CONFIG_SCHED_AUTOGROUP) += auto_group.o
 obj-$(CONFIG_SCHEDSTATS) += stats.o
index fbf1fd098dc6cca687f0e9296a931aa0425c6fee..c46a011ce5db89c76a0d466a0f7a965204a86aec 100644 (file)
@@ -740,126 +740,6 @@ void deactivate_task(struct rq *rq, struct task_struct *p, int flags)
        dequeue_task(rq, p, flags);
 }
 
-#ifdef CONFIG_IRQ_TIME_ACCOUNTING
-
-/*
- * There are no locks covering percpu hardirq/softirq time.
- * They are only modified in account_system_vtime, on corresponding CPU
- * with interrupts disabled. So, writes are safe.
- * They are read and saved off onto struct rq in update_rq_clock().
- * This may result in other CPU reading this CPU's irq time and can
- * race with irq/account_system_vtime on this CPU. We would either get old
- * or new value with a side effect of accounting a slice of irq time to wrong
- * task when irq is in progress while we read rq->clock. That is a worthy
- * compromise in place of having locks on each irq in account_system_time.
- */
-static DEFINE_PER_CPU(u64, cpu_hardirq_time);
-static DEFINE_PER_CPU(u64, cpu_softirq_time);
-
-static DEFINE_PER_CPU(u64, irq_start_time);
-static int sched_clock_irqtime;
-
-void enable_sched_clock_irqtime(void)
-{
-       sched_clock_irqtime = 1;
-}
-
-void disable_sched_clock_irqtime(void)
-{
-       sched_clock_irqtime = 0;
-}
-
-#ifndef CONFIG_64BIT
-static DEFINE_PER_CPU(seqcount_t, irq_time_seq);
-
-static inline void irq_time_write_begin(void)
-{
-       __this_cpu_inc(irq_time_seq.sequence);
-       smp_wmb();
-}
-
-static inline void irq_time_write_end(void)
-{
-       smp_wmb();
-       __this_cpu_inc(irq_time_seq.sequence);
-}
-
-static inline u64 irq_time_read(int cpu)
-{
-       u64 irq_time;
-       unsigned seq;
-
-       do {
-               seq = read_seqcount_begin(&per_cpu(irq_time_seq, cpu));
-               irq_time = per_cpu(cpu_softirq_time, cpu) +
-                          per_cpu(cpu_hardirq_time, cpu);
-       } while (read_seqcount_retry(&per_cpu(irq_time_seq, cpu), seq));
-
-       return irq_time;
-}
-#else /* CONFIG_64BIT */
-static inline void irq_time_write_begin(void)
-{
-}
-
-static inline void irq_time_write_end(void)
-{
-}
-
-static inline u64 irq_time_read(int cpu)
-{
-       return per_cpu(cpu_softirq_time, cpu) + per_cpu(cpu_hardirq_time, cpu);
-}
-#endif /* CONFIG_64BIT */
-
-/*
- * Called before incrementing preempt_count on {soft,}irq_enter
- * and before decrementing preempt_count on {soft,}irq_exit.
- */
-void account_system_vtime(struct task_struct *curr)
-{
-       unsigned long flags;
-       s64 delta;
-       int cpu;
-
-       if (!sched_clock_irqtime)
-               return;
-
-       local_irq_save(flags);
-
-       cpu = smp_processor_id();
-       delta = sched_clock_cpu(cpu) - __this_cpu_read(irq_start_time);
-       __this_cpu_add(irq_start_time, delta);
-
-       irq_time_write_begin();
-       /*
-        * We do not account for softirq time from ksoftirqd here.
-        * We want to continue accounting softirq time to ksoftirqd thread
-        * in that case, so as not to confuse the scheduler with a special
-        * task that does not consume any time, but still wants to run.
-        */
-       if (hardirq_count())
-               __this_cpu_add(cpu_hardirq_time, delta);
-       else if (in_serving_softirq() && curr != this_cpu_ksoftirqd())
-               __this_cpu_add(cpu_softirq_time, delta);
-
-       irq_time_write_end();
-       local_irq_restore(flags);
-}
-EXPORT_SYMBOL_GPL(account_system_vtime);
-
-#endif /* CONFIG_IRQ_TIME_ACCOUNTING */
-
-#ifdef CONFIG_PARAVIRT
-static inline u64 steal_ticks(u64 steal)
-{
-       if (unlikely(steal > NSEC_PER_SEC))
-               return div_u64(steal, TICK_NSEC);
-
-       return __iter_div_u64_rem(steal, TICK_NSEC, &steal);
-}
-#endif
-
 static void update_rq_clock_task(struct rq *rq, s64 delta)
 {
 /*
@@ -920,43 +800,6 @@ static void update_rq_clock_task(struct rq *rq, s64 delta)
 #endif
 }
 
-#ifdef CONFIG_IRQ_TIME_ACCOUNTING
-static int irqtime_account_hi_update(void)
-{
-       u64 *cpustat = kcpustat_this_cpu->cpustat;
-       unsigned long flags;
-       u64 latest_ns;
-       int ret = 0;
-
-       local_irq_save(flags);
-       latest_ns = this_cpu_read(cpu_hardirq_time);
-       if (nsecs_to_cputime64(latest_ns) > cpustat[CPUTIME_IRQ])
-               ret = 1;
-       local_irq_restore(flags);
-       return ret;
-}
-
-static int irqtime_account_si_update(void)
-{
-       u64 *cpustat = kcpustat_this_cpu->cpustat;
-       unsigned long flags;
-       u64 latest_ns;
-       int ret = 0;
-
-       local_irq_save(flags);
-       latest_ns = this_cpu_read(cpu_softirq_time);
-       if (nsecs_to_cputime64(latest_ns) > cpustat[CPUTIME_SOFTIRQ])
-               ret = 1;
-       local_irq_restore(flags);
-       return ret;
-}
-
-#else /* CONFIG_IRQ_TIME_ACCOUNTING */
-
-#define sched_clock_irqtime    (0)
-
-#endif
-
 void sched_set_stop_task(int cpu, struct task_struct *stop)
 {
        struct sched_param param = { .sched_priority = MAX_RT_PRIO - 1 };
@@ -1953,6 +1796,7 @@ static void finish_task_switch(struct rq *rq, struct task_struct *prev)
         *              Manfred Spraul <manfred@colorfullife.com>
         */
        prev_state = prev->state;
+       account_switch_vtime(prev);
        finish_arch_switch(prev);
 #ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW
        local_irq_disable();
@@ -2809,404 +2653,6 @@ unsigned long long task_sched_runtime(struct task_struct *p)
        return ns;
 }
 
-#ifdef CONFIG_CGROUP_CPUACCT
-struct cgroup_subsys cpuacct_subsys;
-struct cpuacct root_cpuacct;
-#endif
-
-static inline void task_group_account_field(struct task_struct *p, int index,
-                                           u64 tmp)
-{
-#ifdef CONFIG_CGROUP_CPUACCT
-       struct kernel_cpustat *kcpustat;
-       struct cpuacct *ca;
-#endif
-       /*
-        * Since all updates are sure to touch the root cgroup, we
-        * get ourselves ahead and touch it first. If the root cgroup
-        * is the only cgroup, then nothing else should be necessary.
-        *
-        */
-       __get_cpu_var(kernel_cpustat).cpustat[index] += tmp;
-
-#ifdef CONFIG_CGROUP_CPUACCT
-       if (unlikely(!cpuacct_subsys.active))
-               return;
-
-       rcu_read_lock();
-       ca = task_ca(p);
-       while (ca && (ca != &root_cpuacct)) {
-               kcpustat = this_cpu_ptr(ca->cpustat);
-               kcpustat->cpustat[index] += tmp;
-               ca = parent_ca(ca);
-       }
-       rcu_read_unlock();
-#endif
-}
-
-
-/*
- * Account user cpu time to a process.
- * @p: the process that the cpu time gets accounted to
- * @cputime: the cpu time spent in user space since the last update
- * @cputime_scaled: cputime scaled by cpu frequency
- */
-void account_user_time(struct task_struct *p, cputime_t cputime,
-                      cputime_t cputime_scaled)
-{
-       int index;
-
-       /* Add user time to process. */
-       p->utime += cputime;
-       p->utimescaled += cputime_scaled;
-       account_group_user_time(p, cputime);
-
-       index = (TASK_NICE(p) > 0) ? CPUTIME_NICE : CPUTIME_USER;
-
-       /* Add user time to cpustat. */
-       task_group_account_field(p, index, (__force u64) cputime);
-
-       /* Account for user time used */
-       acct_update_integrals(p);
-}
-
-/*
- * Account guest cpu time to a process.
- * @p: the process that the cpu time gets accounted to
- * @cputime: the cpu time spent in virtual machine since the last update
- * @cputime_scaled: cputime scaled by cpu frequency
- */
-static void account_guest_time(struct task_struct *p, cputime_t cputime,
-                              cputime_t cputime_scaled)
-{
-       u64 *cpustat = kcpustat_this_cpu->cpustat;
-
-       /* Add guest time to process. */
-       p->utime += cputime;
-       p->utimescaled += cputime_scaled;
-       account_group_user_time(p, cputime);
-       p->gtime += cputime;
-
-       /* Add guest time to cpustat. */
-       if (TASK_NICE(p) > 0) {
-               cpustat[CPUTIME_NICE] += (__force u64) cputime;
-               cpustat[CPUTIME_GUEST_NICE] += (__force u64) cputime;
-       } else {
-               cpustat[CPUTIME_USER] += (__force u64) cputime;
-               cpustat[CPUTIME_GUEST] += (__force u64) cputime;
-       }
-}
-
-/*
- * Account system cpu time to a process and desired cpustat field
- * @p: the process that the cpu time gets accounted to
- * @cputime: the cpu time spent in kernel space since the last update
- * @cputime_scaled: cputime scaled by cpu frequency
- * @index: index of the cpustat field to be updated
- */
-static inline
-void __account_system_time(struct task_struct *p, cputime_t cputime,
-                       cputime_t cputime_scaled, int index)
-{
-       /* Add system time to process. */
-       p->stime += cputime;
-       p->stimescaled += cputime_scaled;
-       account_group_system_time(p, cputime);
-
-       /* Add system time to cpustat. */
-       task_group_account_field(p, index, (__force u64) cputime);
-
-       /* Account for system time used */
-       acct_update_integrals(p);
-}
-
-/*
- * Account system cpu time to a process.
- * @p: the process that the cpu time gets accounted to
- * @hardirq_offset: the offset to subtract from hardirq_count()
- * @cputime: the cpu time spent in kernel space since the last update
- * @cputime_scaled: cputime scaled by cpu frequency
- */
-void account_system_time(struct task_struct *p, int hardirq_offset,
-                        cputime_t cputime, cputime_t cputime_scaled)
-{
-       int index;
-
-       if ((p->flags & PF_VCPU) && (irq_count() - hardirq_offset == 0)) {
-               account_guest_time(p, cputime, cputime_scaled);
-               return;
-       }
-
-       if (hardirq_count() - hardirq_offset)
-               index = CPUTIME_IRQ;
-       else if (in_serving_softirq())
-               index = CPUTIME_SOFTIRQ;
-       else
-               index = CPUTIME_SYSTEM;
-
-       __account_system_time(p, cputime, cputime_scaled, index);
-}
-
-/*
- * Account for involuntary wait time.
- * @cputime: the cpu time spent in involuntary wait
- */
-void account_steal_time(cputime_t cputime)
-{
-       u64 *cpustat = kcpustat_this_cpu->cpustat;
-
-       cpustat[CPUTIME_STEAL] += (__force u64) cputime;
-}
-
-/*
- * Account for idle time.
- * @cputime: the cpu time spent in idle wait
- */
-void account_idle_time(cputime_t cputime)
-{
-       u64 *cpustat = kcpustat_this_cpu->cpustat;
-       struct rq *rq = this_rq();
-
-       if (atomic_read(&rq->nr_iowait) > 0)
-               cpustat[CPUTIME_IOWAIT] += (__force u64) cputime;
-       else
-               cpustat[CPUTIME_IDLE] += (__force u64) cputime;
-}
-
-static __always_inline bool steal_account_process_tick(void)
-{
-#ifdef CONFIG_PARAVIRT
-       if (static_key_false(&paravirt_steal_enabled)) {
-               u64 steal, st = 0;
-
-               steal = paravirt_steal_clock(smp_processor_id());
-               steal -= this_rq()->prev_steal_time;
-
-               st = steal_ticks(steal);
-               this_rq()->prev_steal_time += st * TICK_NSEC;
-
-               account_steal_time(st);
-               return st;
-       }
-#endif
-       return false;
-}
-
-#ifndef CONFIG_VIRT_CPU_ACCOUNTING
-
-#ifdef CONFIG_IRQ_TIME_ACCOUNTING
-/*
- * Account a tick to a process and cpustat
- * @p: the process that the cpu time gets accounted to
- * @user_tick: is the tick from userspace
- * @rq: the pointer to rq
- *
- * Tick demultiplexing follows the order
- * - pending hardirq update
- * - pending softirq update
- * - user_time
- * - idle_time
- * - system time
- *   - check for guest_time
- *   - else account as system_time
- *
- * The check for hardirq is done for both system and user time, as there is
- * no timer going off while we are on hardirq and hence we may never get an
- * opportunity to update it solely in system time.
- * p->stime and friends are only updated on system time and not on irq or
- * softirq time, as those no longer count in task exec_runtime.
- */
-static void irqtime_account_process_tick(struct task_struct *p, int user_tick,
-                                               struct rq *rq)
-{
-       cputime_t one_jiffy_scaled = cputime_to_scaled(cputime_one_jiffy);
-       u64 *cpustat = kcpustat_this_cpu->cpustat;
-
-       if (steal_account_process_tick())
-               return;
-
-       if (irqtime_account_hi_update()) {
-               cpustat[CPUTIME_IRQ] += (__force u64) cputime_one_jiffy;
-       } else if (irqtime_account_si_update()) {
-               cpustat[CPUTIME_SOFTIRQ] += (__force u64) cputime_one_jiffy;
-       } else if (this_cpu_ksoftirqd() == p) {
-               /*
-                * ksoftirqd time does not get accounted in cpu_softirq_time.
-                * So, we have to handle it separately here.
-                * Also, p->stime needs to be updated for ksoftirqd.
-                */
-               __account_system_time(p, cputime_one_jiffy, one_jiffy_scaled,
-                                       CPUTIME_SOFTIRQ);
-       } else if (user_tick) {
-               account_user_time(p, cputime_one_jiffy, one_jiffy_scaled);
-       } else if (p == rq->idle) {
-               account_idle_time(cputime_one_jiffy);
-       } else if (p->flags & PF_VCPU) { /* System time or guest time */
-               account_guest_time(p, cputime_one_jiffy, one_jiffy_scaled);
-       } else {
-               __account_system_time(p, cputime_one_jiffy, one_jiffy_scaled,
-                                       CPUTIME_SYSTEM);
-       }
-}
-
-static void irqtime_account_idle_ticks(int ticks)
-{
-       int i;
-       struct rq *rq = this_rq();
-
-       for (i = 0; i < ticks; i++)
-               irqtime_account_process_tick(current, 0, rq);
-}
-#else /* CONFIG_IRQ_TIME_ACCOUNTING */
-static void irqtime_account_idle_ticks(int ticks) {}
-static void irqtime_account_process_tick(struct task_struct *p, int user_tick,
-                                               struct rq *rq) {}
-#endif /* CONFIG_IRQ_TIME_ACCOUNTING */
-
-/*
- * Account a single tick of cpu time.
- * @p: the process that the cpu time gets accounted to
- * @user_tick: indicates if the tick is a user or a system tick
- */
-void account_process_tick(struct task_struct *p, int user_tick)
-{
-       cputime_t one_jiffy_scaled = cputime_to_scaled(cputime_one_jiffy);
-       struct rq *rq = this_rq();
-
-       if (sched_clock_irqtime) {
-               irqtime_account_process_tick(p, user_tick, rq);
-               return;
-       }
-
-       if (steal_account_process_tick())
-               return;
-
-       if (user_tick)
-               account_user_time(p, cputime_one_jiffy, one_jiffy_scaled);
-       else if ((p != rq->idle) || (irq_count() != HARDIRQ_OFFSET))
-               account_system_time(p, HARDIRQ_OFFSET, cputime_one_jiffy,
-                                   one_jiffy_scaled);
-       else
-               account_idle_time(cputime_one_jiffy);
-}
-
-/*
- * Account multiple ticks of steal time.
- * @ticks: number of stolen ticks
- */
-void account_steal_ticks(unsigned long ticks)
-{
-       account_steal_time(jiffies_to_cputime(ticks));
-}
-
-/*
- * Account multiple ticks of idle time.
- * @ticks: number of stolen ticks
- */
-void account_idle_ticks(unsigned long ticks)
-{
-
-       if (sched_clock_irqtime) {
-               irqtime_account_idle_ticks(ticks);
-               return;
-       }
-
-       account_idle_time(jiffies_to_cputime(ticks));
-}
-
-#endif
-
-/*
- * Use precise platform statistics if available:
- */
-#ifdef CONFIG_VIRT_CPU_ACCOUNTING
-void task_times(struct task_struct *p, cputime_t *ut, cputime_t *st)
-{
-       *ut = p->utime;
-       *st = p->stime;
-}
-
-void thread_group_times(struct task_struct *p, cputime_t *ut, cputime_t *st)
-{
-       struct task_cputime cputime;
-
-       thread_group_cputime(p, &cputime);
-
-       *ut = cputime.utime;
-       *st = cputime.stime;
-}
-#else
-
-#ifndef nsecs_to_cputime
-# define nsecs_to_cputime(__nsecs)     nsecs_to_jiffies(__nsecs)
-#endif
-
-static cputime_t scale_utime(cputime_t utime, cputime_t rtime, cputime_t total)
-{
-       u64 temp = (__force u64) rtime;
-
-       temp *= (__force u64) utime;
-
-       if (sizeof(cputime_t) == 4)
-               temp = div_u64(temp, (__force u32) total);
-       else
-               temp = div64_u64(temp, (__force u64) total);
-
-       return (__force cputime_t) temp;
-}
-
-void task_times(struct task_struct *p, cputime_t *ut, cputime_t *st)
-{
-       cputime_t rtime, utime = p->utime, total = utime + p->stime;
-
-       /*
-        * Use CFS's precise accounting:
-        */
-       rtime = nsecs_to_cputime(p->se.sum_exec_runtime);
-
-       if (total)
-               utime = scale_utime(utime, rtime, total);
-       else
-               utime = rtime;
-
-       /*
-        * Compare with previous values, to keep monotonicity:
-        */
-       p->prev_utime = max(p->prev_utime, utime);
-       p->prev_stime = max(p->prev_stime, rtime - p->prev_utime);
-
-       *ut = p->prev_utime;
-       *st = p->prev_stime;
-}
-
-/*
- * Must be called with siglock held.
- */
-void thread_group_times(struct task_struct *p, cputime_t *ut, cputime_t *st)
-{
-       struct signal_struct *sig = p->signal;
-       struct task_cputime cputime;
-       cputime_t rtime, utime, total;
-
-       thread_group_cputime(p, &cputime);
-
-       total = cputime.utime + cputime.stime;
-       rtime = nsecs_to_cputime(cputime.sum_exec_runtime);
-
-       if (total)
-               utime = scale_utime(cputime.utime, rtime, total);
-       else
-               utime = rtime;
-
-       sig->prev_utime = max(sig->prev_utime, utime);
-       sig->prev_stime = max(sig->prev_stime, rtime - sig->prev_utime);
-
-       *ut = sig->prev_utime;
-       *st = sig->prev_stime;
-}
-#endif
-
 /*
  * This function gets called by the timer code, with HZ frequency.
  * We call it with interrupts disabled.
@@ -3367,6 +2813,40 @@ pick_next_task(struct rq *rq)
 
 /*
  * __schedule() is the main scheduler function.
+ *
+ * The main means of driving the scheduler and thus entering this function are:
+ *
+ *   1. Explicit blocking: mutex, semaphore, waitqueue, etc.
+ *
+ *   2. TIF_NEED_RESCHED flag is checked on interrupt and userspace return
+ *      paths. For example, see arch/x86/entry_64.S.
+ *
+ *      To drive preemption between tasks, the scheduler sets the flag in the
+ *      timer interrupt handler, scheduler_tick().
+ *
+ *   3. Wakeups don't really cause entry into schedule(). They add a
+ *      task to the run-queue and that's it.
+ *
+ *      Now, if the new task added to the run-queue preempts the current
+ *      task, then the wakeup sets TIF_NEED_RESCHED and schedule() gets
+ *      called on the nearest possible occasion:
+ *
+ *       - If the kernel is preemptible (CONFIG_PREEMPT=y):
+ *
+ *         - in syscall or exception context, at the next outermost
+ *           preempt_enable(). (this might be as soon as the wake_up()'s
+ *           spin_unlock()!)
+ *
+ *         - in IRQ context, return from interrupt-handler to
+ *           preemptible context
+ *
+ *       - If the kernel is not preemptible (CONFIG_PREEMPT is not set)
+ *         then at the next:
+ *
+ *          - cond_resched() call
+ *          - explicit schedule() call
+ *          - return from syscall or exception to user-space
+ *          - return from interrupt-handler to user-space
  */
 static void __sched __schedule(void)
 {
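
A concrete illustration of the non-preemptible cases above: a long-running kernel loop inserts explicit preemption points, each of which may enter __schedule() (generic pattern, not part of this patch):

while (!kthread_should_stop()) {
        do_unit_of_work();      /* hypothetical work item */
        cond_resched();         /* schedules iff TIF_NEED_RESCHED is set */
}
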
@@ -4868,13 +4348,6 @@ again:
                 */
                if (preempt && rq != p_rq)
                        resched_task(p_rq->curr);
-       } else {
-               /*
-                * We might have set it in task_yield_fair(), but are
-                * not going to schedule(), so don't want to skip
-                * the next update.
-                */
-               rq->skip_clock_update = 0;
        }
 
 out:
@@ -5304,27 +4777,17 @@ void idle_task_exit(void)
 }
 
 /*
- * While a dead CPU has no uninterruptible tasks queued at this point,
- * it might still have a nonzero ->nr_uninterruptible counter, because
- * for performance reasons the counter is not strictly tracking tasks to
- * their home CPUs. So we just add the counter to another CPU's counter,
- * to keep the global sum constant after CPU-down:
- */
-static void migrate_nr_uninterruptible(struct rq *rq_src)
-{
-       struct rq *rq_dest = cpu_rq(cpumask_any(cpu_active_mask));
-
-       rq_dest->nr_uninterruptible += rq_src->nr_uninterruptible;
-       rq_src->nr_uninterruptible = 0;
-}
-
-/*
- * remove the tasks which were accounted by rq from calc_load_tasks.
+ * Since this CPU is going 'away' for a while, fold any nr_active delta
+ * we might have. Assumes we're called after migrate_tasks() so that the
+ * nr_active count is stable.
+ *
+ * Also see the comment "Global load-average calculations".
  */
-static void calc_global_load_remove(struct rq *rq)
+static void calc_load_migrate(struct rq *rq)
 {
-       atomic_long_sub(rq->calc_load_active, &calc_load_tasks);
-       rq->calc_load_active = 0;
+       long delta = calc_load_fold_active(rq);
+       if (delta)
+               atomic_long_add(delta, &calc_load_tasks);
 }
 
 /*
@@ -5352,9 +4815,6 @@ static void migrate_tasks(unsigned int dead_cpu)
         */
        rq->stop = NULL;
 
-       /* Ensure any throttled groups are reachable by pick_next_task */
-       unthrottle_offline_cfs_rqs(rq);
-
        for ( ; ; ) {
                /*
                 * There's this thread running, bail when that's the only
@@ -5429,16 +4889,25 @@ static void sd_free_ctl_entry(struct ctl_table **tablep)
        *tablep = NULL;
 }
 
+static int min_load_idx = 0;
+static int max_load_idx = CPU_LOAD_IDX_MAX;
+
 static void
 set_table_entry(struct ctl_table *entry,
                const char *procname, void *data, int maxlen,
-               umode_t mode, proc_handler *proc_handler)
+               umode_t mode, proc_handler *proc_handler,
+               bool load_idx)
 {
        entry->procname = procname;
        entry->data = data;
        entry->maxlen = maxlen;
        entry->mode = mode;
        entry->proc_handler = proc_handler;
+
+       if (load_idx) {
+               entry->extra1 = &min_load_idx;
+               entry->extra2 = &max_load_idx;
+       }
 }
 
 static struct ctl_table *
@@ -5450,30 +4919,30 @@ sd_alloc_ctl_domain_table(struct sched_domain *sd)
                return NULL;
 
        set_table_entry(&table[0], "min_interval", &sd->min_interval,
-               sizeof(long), 0644, proc_doulongvec_minmax);
+               sizeof(long), 0644, proc_doulongvec_minmax, false);
        set_table_entry(&table[1], "max_interval", &sd->max_interval,
-               sizeof(long), 0644, proc_doulongvec_minmax);
+               sizeof(long), 0644, proc_doulongvec_minmax, false);
        set_table_entry(&table[2], "busy_idx", &sd->busy_idx,
-               sizeof(int), 0644, proc_dointvec_minmax);
+               sizeof(int), 0644, proc_dointvec_minmax, true);
        set_table_entry(&table[3], "idle_idx", &sd->idle_idx,
-               sizeof(int), 0644, proc_dointvec_minmax);
+               sizeof(int), 0644, proc_dointvec_minmax, true);
        set_table_entry(&table[4], "newidle_idx", &sd->newidle_idx,
-               sizeof(int), 0644, proc_dointvec_minmax);
+               sizeof(int), 0644, proc_dointvec_minmax, true);
        set_table_entry(&table[5], "wake_idx", &sd->wake_idx,
-               sizeof(int), 0644, proc_dointvec_minmax);
+               sizeof(int), 0644, proc_dointvec_minmax, true);
        set_table_entry(&table[6], "forkexec_idx", &sd->forkexec_idx,
-               sizeof(int), 0644, proc_dointvec_minmax);
+               sizeof(int), 0644, proc_dointvec_minmax, true);
        set_table_entry(&table[7], "busy_factor", &sd->busy_factor,
-               sizeof(int), 0644, proc_dointvec_minmax);
+               sizeof(int), 0644, proc_dointvec_minmax, false);
        set_table_entry(&table[8], "imbalance_pct", &sd->imbalance_pct,
-               sizeof(int), 0644, proc_dointvec_minmax);
+               sizeof(int), 0644, proc_dointvec_minmax, false);
        set_table_entry(&table[9], "cache_nice_tries",
                &sd->cache_nice_tries,
-               sizeof(int), 0644, proc_dointvec_minmax);
+               sizeof(int), 0644, proc_dointvec_minmax, false);
        set_table_entry(&table[10], "flags", &sd->flags,
-               sizeof(int), 0644, proc_dointvec_minmax);
+               sizeof(int), 0644, proc_dointvec_minmax, false);
        set_table_entry(&table[11], "name", sd->name,
-               CORENAME_MAX_SIZE, 0444, proc_dostring);
+               CORENAME_MAX_SIZE, 0444, proc_dostring, false);
        /* &table[12] is terminator */
 
        return table;
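
The new boolean parameter wires min_load_idx/max_load_idx into ctl_table.extra1/extra2, which proc_dointvec_minmax() checks so that out-of-range writes to the *_idx entries are rejected. A standalone sketch of the same sysctl pattern (demo_* names are hypothetical):

static int demo_value;
static int demo_min = 0;
static int demo_max = CPU_LOAD_IDX_MAX;

static struct ctl_table demo_table[] = {
        {
                .procname       = "demo_idx",
                .data           = &demo_value,
                .maxlen         = sizeof(int),
                .mode           = 0644,
                .proc_handler   = proc_dointvec_minmax,
                .extra1         = &demo_min,    /* reject writes below this */
                .extra2         = &demo_max,    /* reject writes above this */
        },
        {}
};
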
@@ -5618,8 +5087,7 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
                BUG_ON(rq->nr_running != 1); /* the migration thread */
                raw_spin_unlock_irqrestore(&rq->lock, flags);
 
-               migrate_nr_uninterruptible(rq);
-               calc_global_load_remove(rq);
+               calc_load_migrate(rq);
                break;
 #endif
        }
@@ -6588,7 +6056,6 @@ sd_numa_init(struct sched_domain_topology_level *tl, int cpu)
                                        | 0*SD_BALANCE_FORK
                                        | 0*SD_BALANCE_WAKE
                                        | 0*SD_WAKE_AFFINE
-                                       | 0*SD_PREFER_LOCAL
                                        | 0*SD_SHARE_CPUPOWER
                                        | 0*SD_SHARE_PKG_RESOURCES
                                        | 1*SD_SERIALIZE
@@ -8386,6 +7853,8 @@ struct cgroup_subsys cpu_cgroup_subsys = {
  * (balbir@in.ibm.com).
  */
 
+struct cpuacct root_cpuacct;
+
 /* create a new cpu accounting group */
 static struct cgroup_subsys_state *cpuacct_create(struct cgroup *cgrp)
 {
diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c
new file mode 100644 (file)
index 0000000..372692b
--- /dev/null
@@ -0,0 +1,504 @@
+#include <linux/export.h>
+#include <linux/sched.h>
+#include <linux/tsacct_kern.h>
+#include <linux/kernel_stat.h>
+#include <linux/static_key.h>
+#include "sched.h"
+
+
+#ifdef CONFIG_IRQ_TIME_ACCOUNTING
+
+/*
+ * There are no locks covering percpu hardirq/softirq time.
+ * They are only modified in account_system_vtime, on corresponding CPU
+ * with interrupts disabled. So, writes are safe.
+ * They are read and saved off onto struct rq in update_rq_clock().
+ * This may result in other CPU reading this CPU's irq time and can
+ * race with irq/account_system_vtime on this CPU. We would either get old
+ * or new value with a side effect of accounting a slice of irq time to wrong
+ * task when irq is in progress while we read rq->clock. That is a worthy
+ * compromise in place of having locks on each irq in account_system_time.
+ */
+DEFINE_PER_CPU(u64, cpu_hardirq_time);
+DEFINE_PER_CPU(u64, cpu_softirq_time);
+
+static DEFINE_PER_CPU(u64, irq_start_time);
+static int sched_clock_irqtime;
+
+void enable_sched_clock_irqtime(void)
+{
+       sched_clock_irqtime = 1;
+}
+
+void disable_sched_clock_irqtime(void)
+{
+       sched_clock_irqtime = 0;
+}
+
+#ifndef CONFIG_64BIT
+DEFINE_PER_CPU(seqcount_t, irq_time_seq);
+#endif /* CONFIG_64BIT */
+
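On 32-bit kernels a u64 load can tear, so readers of cpu_hardirq_time/cpu_softirq_time must retry under irq_time_seq; the read/write helpers themselves are no longer in this file (presumably moved to a shared header such as kernel/sched/sched.h). For reference, the read side is the standard seqcount retry loop, exactly as in the irq_time_read() removed from core.c above:

static inline u64 irq_time_read(int cpu)
{
        u64 irq_time;
        unsigned seq;

        do {
                seq = read_seqcount_begin(&per_cpu(irq_time_seq, cpu));
                irq_time = per_cpu(cpu_softirq_time, cpu) +
                           per_cpu(cpu_hardirq_time, cpu);
        } while (read_seqcount_retry(&per_cpu(irq_time_seq, cpu), seq));

        return irq_time;
}
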
+/*
+ * Called before incrementing preempt_count on {soft,}irq_enter
+ * and before decrementing preempt_count on {soft,}irq_exit.
+ */
+void account_system_vtime(struct task_struct *curr)
+{
+       unsigned long flags;
+       s64 delta;
+       int cpu;
+
+       if (!sched_clock_irqtime)
+               return;
+
+       local_irq_save(flags);
+
+       cpu = smp_processor_id();
+       delta = sched_clock_cpu(cpu) - __this_cpu_read(irq_start_time);
+       __this_cpu_add(irq_start_time, delta);
+
+       irq_time_write_begin();
+       /*
+        * We do not account for softirq time from ksoftirqd here.
+        * We want to continue accounting softirq time to ksoftirqd thread
+        * in that case, so as not to confuse the scheduler with a special
+        * task that does not consume any time, but still wants to run.
+        */
+       if (hardirq_count())
+               __this_cpu_add(cpu_hardirq_time, delta);
+       else if (in_serving_softirq() && curr != this_cpu_ksoftirqd())
+               __this_cpu_add(cpu_softirq_time, delta);
+
+       irq_time_write_end();
+       local_irq_restore(flags);
+}
+EXPORT_SYMBOL_GPL(account_system_vtime);
+
+static int irqtime_account_hi_update(void)
+{
+       u64 *cpustat = kcpustat_this_cpu->cpustat;
+       unsigned long flags;
+       u64 latest_ns;
+       int ret = 0;
+
+       local_irq_save(flags);
+       latest_ns = this_cpu_read(cpu_hardirq_time);
+       if (nsecs_to_cputime64(latest_ns) > cpustat[CPUTIME_IRQ])
+               ret = 1;
+       local_irq_restore(flags);
+       return ret;
+}
+
+static int irqtime_account_si_update(void)
+{
+       u64 *cpustat = kcpustat_this_cpu->cpustat;
+       unsigned long flags;
+       u64 latest_ns;
+       int ret = 0;
+
+       local_irq_save(flags);
+       latest_ns = this_cpu_read(cpu_softirq_time);
+       if (nsecs_to_cputime64(latest_ns) > cpustat[CPUTIME_SOFTIRQ])
+               ret = 1;
+       local_irq_restore(flags);
+       return ret;
+}
+
+#else /* CONFIG_IRQ_TIME_ACCOUNTING */
+
+#define sched_clock_irqtime    (0)
+
+#endif /* !CONFIG_IRQ_TIME_ACCOUNTING */
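
Defining sched_clock_irqtime as the constant 0 when CONFIG_IRQ_TIME_ACCOUNTING is off lets callers keep an unconditional-looking test that the compiler removes as dead code, as in account_process_tick() below:

if (sched_clock_irqtime) {      /* reads as if (0) when disabled */
        irqtime_account_process_tick(p, user_tick, rq);
        return;
}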
+
+static inline void task_group_account_field(struct task_struct *p, int index,
+                                           u64 tmp)
+{
+#ifdef CONFIG_CGROUP_CPUACCT
+       struct kernel_cpustat *kcpustat;
+       struct cpuacct *ca;
+#endif
+       /*
+        * Since all updates are sure to touch the root cgroup, we
+        * get ourselves ahead and touch it first. If the root cgroup
+        * is the only cgroup, then nothing else should be necessary.
+        *
+        */
+       __get_cpu_var(kernel_cpustat).cpustat[index] += tmp;
+
+#ifdef CONFIG_CGROUP_CPUACCT
+       if (unlikely(!cpuacct_subsys.active))
+               return;
+
+       rcu_read_lock();
+       ca = task_ca(p);
+       while (ca && (ca != &root_cpuacct)) {
+               kcpustat = this_cpu_ptr(ca->cpustat);
+               kcpustat->cpustat[index] += tmp;
+               ca = parent_ca(ca);
+       }
+       rcu_read_unlock();
+#endif
+}
+
+/*
+ * Account user cpu time to a process.
+ * @p: the process that the cpu time gets accounted to
+ * @cputime: the cpu time spent in user space since the last update
+ * @cputime_scaled: cputime scaled by cpu frequency
+ */
+void account_user_time(struct task_struct *p, cputime_t cputime,
+                      cputime_t cputime_scaled)
+{
+       int index;
+
+       /* Add user time to process. */
+       p->utime += cputime;
+       p->utimescaled += cputime_scaled;
+       account_group_user_time(p, cputime);
+
+       index = (TASK_NICE(p) > 0) ? CPUTIME_NICE : CPUTIME_USER;
+
+       /* Add user time to cpustat. */
+       task_group_account_field(p, index, (__force u64) cputime);
+
+       /* Account for user time used */
+       acct_update_integrals(p);
+}
+
+/*
+ * Account guest cpu time to a process.
+ * @p: the process that the cpu time gets accounted to
+ * @cputime: the cpu time spent in virtual machine since the last update
+ * @cputime_scaled: cputime scaled by cpu frequency
+ */
+static void account_guest_time(struct task_struct *p, cputime_t cputime,
+                              cputime_t cputime_scaled)
+{
+       u64 *cpustat = kcpustat_this_cpu->cpustat;
+
+       /* Add guest time to process. */
+       p->utime += cputime;
+       p->utimescaled += cputime_scaled;
+       account_group_user_time(p, cputime);
+       p->gtime += cputime;
+
+       /* Add guest time to cpustat. */
+       if (TASK_NICE(p) > 0) {
+               cpustat[CPUTIME_NICE] += (__force u64) cputime;
+               cpustat[CPUTIME_GUEST_NICE] += (__force u64) cputime;
+       } else {
+               cpustat[CPUTIME_USER] += (__force u64) cputime;
+               cpustat[CPUTIME_GUEST] += (__force u64) cputime;
+       }
+}
+
+/*
+ * Account system cpu time to a process and desired cpustat field
+ * @p: the process that the cpu time gets accounted to
+ * @cputime: the cpu time spent in kernel space since the last update
+ * @cputime_scaled: cputime scaled by cpu frequency
+ * @index: index of the cpustat field to be updated
+ */
+static inline
+void __account_system_time(struct task_struct *p, cputime_t cputime,
+                       cputime_t cputime_scaled, int index)
+{
+       /* Add system time to process. */
+       p->stime += cputime;
+       p->stimescaled += cputime_scaled;
+       account_group_system_time(p, cputime);
+
+       /* Add system time to cpustat. */
+       task_group_account_field(p, index, (__force u64) cputime);
+
+       /* Account for system time used */
+       acct_update_integrals(p);
+}
+
+/*
+ * Account system cpu time to a process.
+ * @p: the process that the cpu time gets accounted to
+ * @hardirq_offset: the offset to subtract from hardirq_count()
+ * @cputime: the cpu time spent in kernel space since the last update
+ * @cputime_scaled: cputime scaled by cpu frequency
+ */
+void account_system_time(struct task_struct *p, int hardirq_offset,
+                        cputime_t cputime, cputime_t cputime_scaled)
+{
+       int index;
+
+       if ((p->flags & PF_VCPU) && (irq_count() - hardirq_offset == 0)) {
+               account_guest_time(p, cputime, cputime_scaled);
+               return;
+       }
+
+       if (hardirq_count() - hardirq_offset)
+               index = CPUTIME_IRQ;
+       else if (in_serving_softirq())
+               index = CPUTIME_SOFTIRQ;
+       else
+               index = CPUTIME_SYSTEM;
+
+       __account_system_time(p, cputime, cputime_scaled, index);
+}
+
+/*
+ * Account for involuntary wait time.
+ * @cputime: the cpu time spent in involuntary wait
+ */
+void account_steal_time(cputime_t cputime)
+{
+       u64 *cpustat = kcpustat_this_cpu->cpustat;
+
+       cpustat[CPUTIME_STEAL] += (__force u64) cputime;
+}
+
+/*
+ * Account for idle time.
+ * @cputime: the cpu time spent in idle wait
+ */
+void account_idle_time(cputime_t cputime)
+{
+       u64 *cpustat = kcpustat_this_cpu->cpustat;
+       struct rq *rq = this_rq();
+
+       if (atomic_read(&rq->nr_iowait) > 0)
+               cpustat[CPUTIME_IOWAIT] += (__force u64) cputime;
+       else
+               cpustat[CPUTIME_IDLE] += (__force u64) cputime;
+}
+
+static __always_inline bool steal_account_process_tick(void)
+{
+#ifdef CONFIG_PARAVIRT
+       if (static_key_false(&paravirt_steal_enabled)) {
+               u64 steal, st = 0;
+
+               steal = paravirt_steal_clock(smp_processor_id());
+               steal -= this_rq()->prev_steal_time;
+
+               st = steal_ticks(steal);
+               this_rq()->prev_steal_time += st * TICK_NSEC;
+
+               account_steal_time(st);
+               return st;
+       }
+#endif
+       return false;
+}
+
+#ifndef CONFIG_VIRT_CPU_ACCOUNTING
+
+#ifdef CONFIG_IRQ_TIME_ACCOUNTING
+/*
+ * Account a tick to a process and cpustat
+ * @p: the process that the cpu time gets accounted to
+ * @user_tick: is the tick from userspace
+ * @rq: the pointer to rq
+ *
+ * Tick demultiplexing follows the order
+ * - pending hardirq update
+ * - pending softirq update
+ * - user_time
+ * - idle_time
+ * - system time
+ *   - check for guest_time
+ *   - else account as system_time
+ *
+ * The check for hardirq is done for both system and user time, as there is
+ * no timer going off while we are on hardirq and hence we may never get an
+ * opportunity to update it solely in system time.
+ * p->stime and friends are only updated on system time and not on irq or
+ * softirq time, as those no longer count in task exec_runtime.
+ */
+static void irqtime_account_process_tick(struct task_struct *p, int user_tick,
+                                               struct rq *rq)
+{
+       cputime_t one_jiffy_scaled = cputime_to_scaled(cputime_one_jiffy);
+       u64 *cpustat = kcpustat_this_cpu->cpustat;
+
+       if (steal_account_process_tick())
+               return;
+
+       if (irqtime_account_hi_update()) {
+               cpustat[CPUTIME_IRQ] += (__force u64) cputime_one_jiffy;
+       } else if (irqtime_account_si_update()) {
+               cpustat[CPUTIME_SOFTIRQ] += (__force u64) cputime_one_jiffy;
+       } else if (this_cpu_ksoftirqd() == p) {
+               /*
+                * ksoftirqd time does not get accounted in cpu_softirq_time.
+                * So, we have to handle it separately here.
+                * Also, p->stime needs to be updated for ksoftirqd.
+                */
+               __account_system_time(p, cputime_one_jiffy, one_jiffy_scaled,
+                                       CPUTIME_SOFTIRQ);
+       } else if (user_tick) {
+               account_user_time(p, cputime_one_jiffy, one_jiffy_scaled);
+       } else if (p == rq->idle) {
+               account_idle_time(cputime_one_jiffy);
+       } else if (p->flags & PF_VCPU) { /* System time or guest time */
+               account_guest_time(p, cputime_one_jiffy, one_jiffy_scaled);
+       } else {
+               __account_system_time(p, cputime_one_jiffy, one_jiffy_scaled,
+                                       CPUTIME_SYSTEM);
+       }
+}
+
+static void irqtime_account_idle_ticks(int ticks)
+{
+       int i;
+       struct rq *rq = this_rq();
+
+       for (i = 0; i < ticks; i++)
+               irqtime_account_process_tick(current, 0, rq);
+}
+#else /* CONFIG_IRQ_TIME_ACCOUNTING */
+static void irqtime_account_idle_ticks(int ticks) {}
+static void irqtime_account_process_tick(struct task_struct *p, int user_tick,
+                                               struct rq *rq) {}
+#endif /* CONFIG_IRQ_TIME_ACCOUNTING */
+
+/*
+ * Account a single tick of cpu time.
+ * @p: the process that the cpu time gets accounted to
+ * @user_tick: indicates if the tick is a user or a system tick
+ */
+void account_process_tick(struct task_struct *p, int user_tick)
+{
+       cputime_t one_jiffy_scaled = cputime_to_scaled(cputime_one_jiffy);
+       struct rq *rq = this_rq();
+
+       if (sched_clock_irqtime) {
+               irqtime_account_process_tick(p, user_tick, rq);
+               return;
+       }
+
+       if (steal_account_process_tick())
+               return;
+
+       if (user_tick)
+               account_user_time(p, cputime_one_jiffy, one_jiffy_scaled);
+       else if ((p != rq->idle) || (irq_count() != HARDIRQ_OFFSET))
+               account_system_time(p, HARDIRQ_OFFSET, cputime_one_jiffy,
+                                   one_jiffy_scaled);
+       else
+               account_idle_time(cputime_one_jiffy);
+}
+
+/*
+ * Account multiple ticks of steal time.
+ * @ticks: number of stolen ticks
+ */
+void account_steal_ticks(unsigned long ticks)
+{
+       account_steal_time(jiffies_to_cputime(ticks));
+}
+
+/*
+ * Account multiple ticks of idle time.
+ * @ticks: number of idle ticks
+ */
+void account_idle_ticks(unsigned long ticks)
+{
+       if (sched_clock_irqtime) {
+               irqtime_account_idle_ticks(ticks);
+               return;
+       }
+
+       account_idle_time(jiffies_to_cputime(ticks));
+}
+
+#endif
+
+/*
+ * Use precise platform statistics if available:
+ */
+#ifdef CONFIG_VIRT_CPU_ACCOUNTING
+void task_times(struct task_struct *p, cputime_t *ut, cputime_t *st)
+{
+       *ut = p->utime;
+       *st = p->stime;
+}
+
+void thread_group_times(struct task_struct *p, cputime_t *ut, cputime_t *st)
+{
+       struct task_cputime cputime;
+
+       thread_group_cputime(p, &cputime);
+
+       *ut = cputime.utime;
+       *st = cputime.stime;
+}
+#else
+
+#ifndef nsecs_to_cputime
+# define nsecs_to_cputime(__nsecs)     nsecs_to_jiffies(__nsecs)
+#endif
+
+static cputime_t scale_utime(cputime_t utime, cputime_t rtime, cputime_t total)
+{
+       u64 temp = (__force u64) rtime;
+
+       temp *= (__force u64) utime;
+
+       if (sizeof(cputime_t) == 4)
+               temp = div_u64(temp, (__force u32) total);
+       else
+               temp = div64_u64(temp, (__force u64) total);
+
+       return (__force cputime_t) temp;
+}
+
+void task_times(struct task_struct *p, cputime_t *ut, cputime_t *st)
+{
+       cputime_t rtime, utime = p->utime, total = utime + p->stime;
+
+       /*
+        * Use CFS's precise accounting:
+        */
+       rtime = nsecs_to_cputime(p->se.sum_exec_runtime);
+
+       if (total)
+               utime = scale_utime(utime, rtime, total);
+       else
+               utime = rtime;
+
+       /*
+        * Compare with previous values, to keep monotonicity:
+        */
+       p->prev_utime = max(p->prev_utime, utime);
+       p->prev_stime = max(p->prev_stime, rtime - p->prev_utime);
+
+       *ut = p->prev_utime;
+       *st = p->prev_stime;
+}
+
+/*
+ * Must be called with siglock held.
+ */
+void thread_group_times(struct task_struct *p, cputime_t *ut, cputime_t *st)
+{
+       struct signal_struct *sig = p->signal;
+       struct task_cputime cputime;
+       cputime_t rtime, utime, total;
+
+       thread_group_cputime(p, &cputime);
+
+       total = cputime.utime + cputime.stime;
+       rtime = nsecs_to_cputime(cputime.sum_exec_runtime);
+
+       if (total)
+               utime = scale_utime(cputime.utime, rtime, total);
+       else
+               utime = rtime;
+
+       sig->prev_utime = max(sig->prev_utime, utime);
+       sig->prev_stime = max(sig->prev_stime, rtime - sig->prev_utime);
+
+       *ut = sig->prev_utime;
+       *st = sig->prev_stime;
+}
+#endif
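
To make the scaling concrete, here is a minimal user-space sketch of the task_times() logic above. The values are hypothetical, cputime_t is modelled as u64, and the 32-bit div_u64 special case is folded into a plain division:

/* Sketch of scale_utime() + the prev_utime/prev_stime monotonicity clamp. */
#include <stdio.h>
#include <stdint.h>

typedef uint64_t u64;

static u64 scale_utime(u64 utime, u64 rtime, u64 total)
{
	/* widen before multiplying, as the kernel helper does */
	return total ? (utime * rtime) / total : rtime;
}

int main(void)
{
	u64 utime = 70, stime = 30;          /* tick-sampled values  */
	u64 rtime = 120;                     /* precise CFS runtime  */
	u64 prev_utime = 0, prev_stime = 0;  /* last reported values */

	u64 ut = scale_utime(utime, rtime, utime + stime);  /* 84 */

	/* reported values only ever move forward */
	prev_utime = prev_utime > ut ? prev_utime : ut;
	prev_stime = prev_stime > rtime - prev_utime ? prev_stime
						     : rtime - prev_utime;

	printf("utime=%llu stime=%llu\n",
	       (unsigned long long)prev_utime,
	       (unsigned long long)prev_stime);   /* utime=84 stime=36 */
	return 0;
}
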
index c219bf8d704c5460291abee8416264ee32d36e22..1ca4fe4235289392dbfeaf66ccaf98609ddc4414 100644 (file)
@@ -597,7 +597,7 @@ calc_delta_fair(unsigned long delta, struct sched_entity *se)
 /*
  * The idea is to set a period in which each task runs once.
  *
- * When there are too many tasks (sysctl_sched_nr_latency) we have to stretch
+ * When there are too many tasks (sched_nr_latency) we have to stretch
  * this period because otherwise the slices get too small.
  *
  * p = (nr <= nl) ? l : l*nr/nl
@@ -2052,7 +2052,7 @@ static void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
        hrtimer_cancel(&cfs_b->slack_timer);
 }
 
-void unthrottle_offline_cfs_rqs(struct rq *rq)
+static void unthrottle_offline_cfs_rqs(struct rq *rq)
 {
        struct cfs_rq *cfs_rq;
 
@@ -2106,7 +2106,7 @@ static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg)
        return NULL;
 }
 static inline void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b) {}
-void unthrottle_offline_cfs_rqs(struct rq *rq) {}
+static inline void unthrottle_offline_cfs_rqs(struct rq *rq) {}
 
 #endif /* CONFIG_CFS_BANDWIDTH */
 
@@ -2686,7 +2686,6 @@ select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flags)
        int prev_cpu = task_cpu(p);
        int new_cpu = cpu;
        int want_affine = 0;
-       int want_sd = 1;
        int sync = wake_flags & WF_SYNC;
 
        if (p->nr_cpus_allowed == 1)
@@ -2703,27 +2702,6 @@ select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flags)
                if (!(tmp->flags & SD_LOAD_BALANCE))
                        continue;
 
-               /*
-                * If power savings logic is enabled for a domain, see if we
-                * are not overloaded, if so, don't balance wider.
-                */
-               if (tmp->flags & (SD_PREFER_LOCAL)) {
-                       unsigned long power = 0;
-                       unsigned long nr_running = 0;
-                       unsigned long capacity;
-                       int i;
-
-                       for_each_cpu(i, sched_domain_span(tmp)) {
-                               power += power_of(i);
-                               nr_running += cpu_rq(i)->cfs.nr_running;
-                       }
-
-                       capacity = DIV_ROUND_CLOSEST(power, SCHED_POWER_SCALE);
-
-                       if (nr_running < capacity)
-                               want_sd = 0;
-               }
-
                /*
                 * If both cpu and prev_cpu are part of this domain,
                 * cpu is a valid SD_WAKE_AFFINE target.
@@ -2731,21 +2709,15 @@ select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flags)
                if (want_affine && (tmp->flags & SD_WAKE_AFFINE) &&
                    cpumask_test_cpu(prev_cpu, sched_domain_span(tmp))) {
                        affine_sd = tmp;
-                       want_affine = 0;
-               }
-
-               if (!want_sd && !want_affine)
                        break;
+               }
 
-               if (!(tmp->flags & sd_flag))
-                       continue;
-
-               if (want_sd)
+               if (tmp->flags & sd_flag)
                        sd = tmp;
        }
 
        if (affine_sd) {
-               if (cpu == prev_cpu || wake_affine(affine_sd, p, sync))
+               if (cpu != prev_cpu && wake_affine(affine_sd, p, sync))
                        prev_cpu = cpu;
 
                new_cpu = select_idle_sibling(p, prev_cpu);
@@ -3658,7 +3630,6 @@ fix_small_capacity(struct sched_domain *sd, struct sched_group *group)
  * @group: sched_group whose statistics are to be updated.
  * @load_idx: Load index of sched_domain of this_cpu for load calc.
  * @local_group: Does group contain this_cpu.
- * @cpus: Set of cpus considered for load balancing.
  * @balance: Should we balance.
  * @sgs: variable to hold the statistics for this group.
  */
@@ -3805,7 +3776,6 @@ static bool update_sd_pick_busiest(struct lb_env *env,
 /**
  * update_sd_lb_stats - Update sched_domain's statistics for load balancing.
  * @env: The load balancing environment.
- * @cpus: Set of cpus considered for load balancing.
  * @balance: Should we balance.
  * @sds: variable to hold the statistics for this sched_domain.
  */
@@ -4283,7 +4253,7 @@ redo:
                goto out_balanced;
        }
 
-       BUG_ON(busiest == this_rq);
+       BUG_ON(busiest == env.dst_rq);
 
        schedstat_add(sd, lb_imbalance[idle], env.imbalance);
 
@@ -4304,7 +4274,7 @@ redo:
                update_h_load(env.src_cpu);
 more_balance:
                local_irq_save(flags);
-               double_rq_lock(this_rq, busiest);
+               double_rq_lock(env.dst_rq, busiest);
 
                /*
                 * cur_ld_moved - load moved in current iteration
@@ -4312,7 +4282,7 @@ more_balance:
                 */
                cur_ld_moved = move_tasks(&env);
                ld_moved += cur_ld_moved;
-               double_rq_unlock(this_rq, busiest);
+               double_rq_unlock(env.dst_rq, busiest);
                local_irq_restore(flags);
 
                if (env.flags & LBF_NEED_BREAK) {
@@ -4348,8 +4318,7 @@ more_balance:
                if ((env.flags & LBF_SOME_PINNED) && env.imbalance > 0 &&
                                lb_iterations++ < max_lb_iterations) {
 
-                       this_rq          = cpu_rq(env.new_dst_cpu);
-                       env.dst_rq       = this_rq;
+                       env.dst_rq       = cpu_rq(env.new_dst_cpu);
                        env.dst_cpu      = env.new_dst_cpu;
                        env.flags       &= ~LBF_SOME_PINNED;
                        env.loop         = 0;
@@ -4956,6 +4925,9 @@ static void rq_online_fair(struct rq *rq)
 static void rq_offline_fair(struct rq *rq)
 {
        update_sysctl();
+
+       /* Ensure any throttled groups are reachable by pick_next_task */
+       unthrottle_offline_cfs_rqs(rq);
 }
 
 #endif /* CONFIG_SMP */
index de00a486c5c693ac7038fa3c5dd139e642a1c307..c38f52ea53dd665825beb35cf05ea2e7c74ae572 100644 (file)
@@ -11,14 +11,6 @@ SCHED_FEAT(GENTLE_FAIR_SLEEPERS, true)
  */
 SCHED_FEAT(START_DEBIT, true)
 
-/*
- * Based on load and program behaviour, see if it makes sense to place
- * a newly woken task on the same cpu as the task that woke it --
- * improve cache locality. Typically used with SYNC wakeups as
- * generated by pipes and the like, see also SYNC_WAKEUPS.
- */
-SCHED_FEAT(AFFINE_WAKEUPS, true)
-
 /*
  * Prefer to schedule the task we woke last (assuming it failed
  * wakeup-preemption), since its likely going to consume data we
index 944cb68420e957cbde71f9cacaaa4e81c4b1de20..e0b7ba9c040f74b22bb63e0d957b672dac4adce0 100644 (file)
@@ -691,6 +691,7 @@ balanced:
                 * runtime - in which case borrowing doesn't make sense.
                 */
                rt_rq->rt_runtime = RUNTIME_INF;
+               rt_rq->rt_throttled = 0;
                raw_spin_unlock(&rt_rq->rt_runtime_lock);
                raw_spin_unlock(&rt_b->rt_runtime_lock);
        }
index f6714d009e779a225ef295d33ceff7f2f8573be8..09871698e80c26d7a7b9c41739198957916bf574 100644 (file)
@@ -891,6 +891,9 @@ struct cpuacct {
        struct kernel_cpustat __percpu *cpustat;
 };
 
+extern struct cgroup_subsys cpuacct_subsys;
+extern struct cpuacct root_cpuacct;
+
 /* return cpu accounting group corresponding to this container */
 static inline struct cpuacct *cgroup_ca(struct cgroup *cgrp)
 {
@@ -917,6 +920,16 @@ extern void cpuacct_charge(struct task_struct *tsk, u64 cputime);
 static inline void cpuacct_charge(struct task_struct *tsk, u64 cputime) {}
 #endif
 
+#ifdef CONFIG_PARAVIRT
+static inline u64 steal_ticks(u64 steal)
+{
+       if (unlikely(steal > NSEC_PER_SEC))
+               return div_u64(steal, TICK_NSEC);
+
+       return __iter_div_u64_rem(steal, TICK_NSEC, &steal);
+}
+#endif
+
 static inline void inc_nr_running(struct rq *rq)
 {
        rq->nr_running++;
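
The steal_ticks() helper above turns a stolen-time delta in nanoseconds into ticks, taking a cheap path when less than a second was stolen. A user-space sketch, assuming HZ=1000 so TICK_NSEC is 1,000,000:

/* Sketch of steal_ticks(); TICK_NSEC value assumes HZ == 1000. */
#include <stdio.h>
#include <stdint.h>

#define NSEC_PER_SEC 1000000000ULL
#define TICK_NSEC    (NSEC_PER_SEC / 1000)

static uint64_t steal_ticks(uint64_t steal)
{
	/* rare slow path: a full 64/64 divide */
	if (steal > NSEC_PER_SEC)
		return steal / TICK_NSEC;

	/* common fast path: the kernel's __iter_div_u64_rem()
	 * subtracts in a loop instead of dividing */
	uint64_t ticks = 0;
	while (steal >= TICK_NSEC) {
		steal -= TICK_NSEC;
		ticks++;
	}
	return ticks;
}

int main(void)
{
	printf("%llu\n", (unsigned long long)steal_ticks(2500000ULL)); /* 2 */
	return 0;
}
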
@@ -1144,7 +1157,6 @@ extern void print_rt_stats(struct seq_file *m, int cpu);
 
 extern void init_cfs_rq(struct cfs_rq *cfs_rq);
 extern void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq);
-extern void unthrottle_offline_cfs_rqs(struct rq *rq);
 
 extern void account_cfs_bandwidth_used(int enabled, int was_enabled);
 
@@ -1157,3 +1169,53 @@ enum rq_nohz_flag_bits {
 
 #define nohz_flags(cpu)        (&cpu_rq(cpu)->nohz_flags)
 #endif
+
+#ifdef CONFIG_IRQ_TIME_ACCOUNTING
+
+DECLARE_PER_CPU(u64, cpu_hardirq_time);
+DECLARE_PER_CPU(u64, cpu_softirq_time);
+
+#ifndef CONFIG_64BIT
+DECLARE_PER_CPU(seqcount_t, irq_time_seq);
+
+static inline void irq_time_write_begin(void)
+{
+       __this_cpu_inc(irq_time_seq.sequence);
+       smp_wmb();
+}
+
+static inline void irq_time_write_end(void)
+{
+       smp_wmb();
+       __this_cpu_inc(irq_time_seq.sequence);
+}
+
+static inline u64 irq_time_read(int cpu)
+{
+       u64 irq_time;
+       unsigned seq;
+
+       do {
+               seq = read_seqcount_begin(&per_cpu(irq_time_seq, cpu));
+               irq_time = per_cpu(cpu_softirq_time, cpu) +
+                          per_cpu(cpu_hardirq_time, cpu);
+       } while (read_seqcount_retry(&per_cpu(irq_time_seq, cpu), seq));
+
+       return irq_time;
+}
+#else /* CONFIG_64BIT */
+static inline void irq_time_write_begin(void)
+{
+}
+
+static inline void irq_time_write_end(void)
+{
+}
+
+static inline u64 irq_time_read(int cpu)
+{
+       return per_cpu(cpu_softirq_time, cpu) + per_cpu(cpu_hardirq_time, cpu);
+}
+#endif /* CONFIG_64BIT */
+#endif /* CONFIG_IRQ_TIME_ACCOUNTING */
+
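
The irq_time_read() retry loop above guards against torn 64-bit reads on 32-bit kernels. A minimal single-threaded sketch of the same protocol, with compiler barriers standing in for smp_wmb()/smp_rmb() and all names invented:

/* Sketch of the seqcount retry protocol for a 64-bit value on 32-bit. */
#include <stdio.h>
#include <stdint.h>

static unsigned seq;
static uint64_t softirq_time, hardirq_time;

#define barrier() __asm__ __volatile__("" ::: "memory")

static void irq_time_write(uint64_t hi, uint64_t si)
{
	seq++;			/* odd: write in progress */
	barrier();
	hardirq_time = hi;
	softirq_time = si;
	barrier();
	seq++;			/* even again: write done */
}

static uint64_t irq_time_read(void)
{
	unsigned start;
	uint64_t sum;

	do {
		start = seq;
		barrier();
		sum = softirq_time + hardirq_time;
		barrier();
	} while (seq != start || (start & 1));	/* retry if racing a write */

	return sum;
}

int main(void)
{
	irq_time_write(100, 200);
	printf("%llu\n", (unsigned long long)irq_time_read());	/* 300 */
	return 0;
}
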
index 98f60c5caa1bef91733a4c0bcc646da4fa0b7a24..d6c5fc0542428dae4fffc7546e06b0f6dd4ac4f5 100644 (file)
@@ -1,14 +1,22 @@
 /*
  * Common SMP CPU bringup/teardown functions
  */
+#include <linux/cpu.h>
 #include <linux/err.h>
 #include <linux/smp.h>
 #include <linux/init.h>
+#include <linux/list.h>
+#include <linux/slab.h>
 #include <linux/sched.h>
+#include <linux/export.h>
 #include <linux/percpu.h>
+#include <linux/kthread.h>
+#include <linux/smpboot.h>
 
 #include "smpboot.h"
 
+#ifdef CONFIG_SMP
+
 #ifdef CONFIG_GENERIC_SMP_IDLE_THREAD
 /*
  * For the hotplug case we keep the task structs around and reuse
@@ -65,3 +73,228 @@ void __init idle_threads_init(void)
        }
 }
 #endif
+
+#endif /* #ifdef CONFIG_SMP */
+
+static LIST_HEAD(hotplug_threads);
+static DEFINE_MUTEX(smpboot_threads_lock);
+
+struct smpboot_thread_data {
+       unsigned int                    cpu;
+       unsigned int                    status;
+       struct smp_hotplug_thread       *ht;
+};
+
+enum {
+       HP_THREAD_NONE = 0,
+       HP_THREAD_ACTIVE,
+       HP_THREAD_PARKED,
+};
+
+/**
+ * smpboot_thread_fn - percpu hotplug thread loop function
+ * @data:      thread data pointer
+ *
+ * Checks for thread stop and park conditions. Calls the necessary
+ * setup, cleanup, park and unpark functions for the registered
+ * thread.
+ *
+ * Returns 0 when the thread is stopped; otherwise it loops and does not
+ * return.
+ */
+static int smpboot_thread_fn(void *data)
+{
+       struct smpboot_thread_data *td = data;
+       struct smp_hotplug_thread *ht = td->ht;
+
+       while (1) {
+               set_current_state(TASK_INTERRUPTIBLE);
+               preempt_disable();
+               if (kthread_should_stop()) {
+                       set_current_state(TASK_RUNNING);
+                       preempt_enable();
+                       if (ht->cleanup)
+                               ht->cleanup(td->cpu, cpu_online(td->cpu));
+                       kfree(td);
+                       return 0;
+               }
+
+               if (kthread_should_park()) {
+                       __set_current_state(TASK_RUNNING);
+                       preempt_enable();
+                       if (ht->park && td->status == HP_THREAD_ACTIVE) {
+                               BUG_ON(td->cpu != smp_processor_id());
+                               ht->park(td->cpu);
+                               td->status = HP_THREAD_PARKED;
+                       }
+                       kthread_parkme();
+                       /* We might have been woken for stop */
+                       continue;
+               }
+
+               BUG_ON(td->cpu != smp_processor_id());
+
+               /* Check for state change setup */
+               switch (td->status) {
+               case HP_THREAD_NONE:
+                       preempt_enable();
+                       if (ht->setup)
+                               ht->setup(td->cpu);
+                       td->status = HP_THREAD_ACTIVE;
+                       preempt_disable();
+                       break;
+               case HP_THREAD_PARKED:
+                       preempt_enable();
+                       if (ht->unpark)
+                               ht->unpark(td->cpu);
+                       td->status = HP_THREAD_ACTIVE;
+                       preempt_disable();
+                       break;
+               }
+
+               if (!ht->thread_should_run(td->cpu)) {
+                       preempt_enable();
+                       schedule();
+               } else {
+                       set_current_state(TASK_RUNNING);
+                       preempt_enable();
+                       ht->thread_fn(td->cpu);
+               }
+       }
+}
+
+static int
+__smpboot_create_thread(struct smp_hotplug_thread *ht, unsigned int cpu)
+{
+       struct task_struct *tsk = *per_cpu_ptr(ht->store, cpu);
+       struct smpboot_thread_data *td;
+
+       if (tsk)
+               return 0;
+
+       td = kzalloc_node(sizeof(*td), GFP_KERNEL, cpu_to_node(cpu));
+       if (!td)
+               return -ENOMEM;
+       td->cpu = cpu;
+       td->ht = ht;
+
+       tsk = kthread_create_on_cpu(smpboot_thread_fn, td, cpu,
+                                   ht->thread_comm);
+       if (IS_ERR(tsk)) {
+               kfree(td);
+               return PTR_ERR(tsk);
+       }
+
+       get_task_struct(tsk);
+       *per_cpu_ptr(ht->store, cpu) = tsk;
+       return 0;
+}
+
+int smpboot_create_threads(unsigned int cpu)
+{
+       struct smp_hotplug_thread *cur;
+       int ret = 0;
+
+       mutex_lock(&smpboot_threads_lock);
+       list_for_each_entry(cur, &hotplug_threads, list) {
+               ret = __smpboot_create_thread(cur, cpu);
+               if (ret)
+                       break;
+       }
+       mutex_unlock(&smpboot_threads_lock);
+       return ret;
+}
+
+static void smpboot_unpark_thread(struct smp_hotplug_thread *ht, unsigned int cpu)
+{
+       struct task_struct *tsk = *per_cpu_ptr(ht->store, cpu);
+
+       kthread_unpark(tsk);
+}
+
+void smpboot_unpark_threads(unsigned int cpu)
+{
+       struct smp_hotplug_thread *cur;
+
+       mutex_lock(&smpboot_threads_lock);
+       list_for_each_entry(cur, &hotplug_threads, list)
+               smpboot_unpark_thread(cur, cpu);
+       mutex_unlock(&smpboot_threads_lock);
+}
+
+static void smpboot_park_thread(struct smp_hotplug_thread *ht, unsigned int cpu)
+{
+       struct task_struct *tsk = *per_cpu_ptr(ht->store, cpu);
+
+       if (tsk)
+               kthread_park(tsk);
+}
+
+void smpboot_park_threads(unsigned int cpu)
+{
+       struct smp_hotplug_thread *cur;
+
+       mutex_lock(&smpboot_threads_lock);
+       list_for_each_entry_reverse(cur, &hotplug_threads, list)
+               smpboot_park_thread(cur, cpu);
+       mutex_unlock(&smpboot_threads_lock);
+}
+
+static void smpboot_destroy_threads(struct smp_hotplug_thread *ht)
+{
+       unsigned int cpu;
+
+       /* We also need to destroy the parked threads of offline cpus */
+       for_each_possible_cpu(cpu) {
+               struct task_struct *tsk = *per_cpu_ptr(ht->store, cpu);
+
+               if (tsk) {
+                       kthread_stop(tsk);
+                       put_task_struct(tsk);
+                       *per_cpu_ptr(ht->store, cpu) = NULL;
+               }
+       }
+}
+
+/**
+ * smpboot_register_percpu_thread - Register a per_cpu thread related to hotplug
+ * @plug_thread:       Hotplug thread descriptor
+ *
+ * Creates and starts the threads on all online cpus.
+ */
+int smpboot_register_percpu_thread(struct smp_hotplug_thread *plug_thread)
+{
+       unsigned int cpu;
+       int ret = 0;
+
+       mutex_lock(&smpboot_threads_lock);
+       for_each_online_cpu(cpu) {
+               ret = __smpboot_create_thread(plug_thread, cpu);
+               if (ret) {
+                       smpboot_destroy_threads(plug_thread);
+                       goto out;
+               }
+               smpboot_unpark_thread(plug_thread, cpu);
+       }
+       list_add(&plug_thread->list, &hotplug_threads);
+out:
+       mutex_unlock(&smpboot_threads_lock);
+       return ret;
+}
+EXPORT_SYMBOL_GPL(smpboot_register_percpu_thread);
+
+/**
+ * smpboot_unregister_percpu_thread - Unregister a per_cpu thread related to hotplug
+ * @plug_thread:       Hotplug thread descriptor
+ *
+ * Stops all threads on all possible cpus.
+ */
+void smpboot_unregister_percpu_thread(struct smp_hotplug_thread *plug_thread)
+{
+       get_online_cpus();
+       mutex_lock(&smpboot_threads_lock);
+       list_del(&plug_thread->list);
+       smpboot_destroy_threads(plug_thread);
+       mutex_unlock(&smpboot_threads_lock);
+       put_online_cpus();
+}
+EXPORT_SYMBOL_GPL(smpboot_unregister_percpu_thread);
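
For context, a hedged sketch of how a client would use this new API; every name below is invented for illustration, and the first real user is the ksoftirqd conversion in the softirq.c hunk further down:

/* Hypothetical per-cpu hotplug thread registered via the new API. */
#include <linux/init.h>
#include <linux/percpu.h>
#include <linux/smpboot.h>

static DEFINE_PER_CPU(struct task_struct *, example_thread);

static int example_should_run(unsigned int cpu)
{
	return 0;	/* return non-zero when there is per-cpu work */
}

static void example_thread_fn(unsigned int cpu)
{
	/* do one unit of per-cpu work, then return to the loop */
}

static struct smp_hotplug_thread example_threads = {
	.store			= &example_thread,
	.thread_should_run	= example_should_run,
	.thread_fn		= example_thread_fn,
	.thread_comm		= "example/%u",
};

static int __init example_init(void)
{
	/* creates, binds and unparks one thread per online cpu */
	return smpboot_register_percpu_thread(&example_threads);
}
early_initcall(example_init);

Park/unpark, CPU binding and cleanup are then handled centrally by smpboot_thread_fn() instead of being open-coded in each subsystem's hotplug notifier.
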
index 6ef9433e1c7001ff0d00799379de7e71ef1a0113..72415a0eb955cd84130d2753c7704e8af7f30960 100644 (file)
@@ -13,4 +13,8 @@ static inline void idle_thread_set_boot_cpu(void) { }
 static inline void idle_threads_init(void) { }
 #endif
 
+int smpboot_create_threads(unsigned int cpu);
+void smpboot_park_threads(unsigned int cpu);
+void smpboot_unpark_threads(unsigned int cpu);
+
 #endif
index b73e681df09ea23e951b04672ca41227e9e0787f..5c6a5bd8462fb33fa8737d0ba1b6dbd7e2144dec 100644 (file)
@@ -23,6 +23,7 @@
 #include <linux/rcupdate.h>
 #include <linux/ftrace.h>
 #include <linux/smp.h>
+#include <linux/smpboot.h>
 #include <linux/tick.h>
 
 #define CREATE_TRACE_POINTS
@@ -742,49 +743,22 @@ void __init softirq_init(void)
        open_softirq(HI_SOFTIRQ, tasklet_hi_action);
 }
 
-static int run_ksoftirqd(void * __bind_cpu)
+static int ksoftirqd_should_run(unsigned int cpu)
 {
-       set_current_state(TASK_INTERRUPTIBLE);
-
-       while (!kthread_should_stop()) {
-               preempt_disable();
-               if (!local_softirq_pending()) {
-                       schedule_preempt_disabled();
-               }
-
-               __set_current_state(TASK_RUNNING);
-
-               while (local_softirq_pending()) {
-                       /* Preempt disable stops cpu going offline.
-                          If already offline, we'll be on wrong CPU:
-                          don't process */
-                       if (cpu_is_offline((long)__bind_cpu))
-                               goto wait_to_die;
-                       local_irq_disable();
-                       if (local_softirq_pending())
-                               __do_softirq();
-                       local_irq_enable();
-                       sched_preempt_enable_no_resched();
-                       cond_resched();
-                       preempt_disable();
-                       rcu_note_context_switch((long)__bind_cpu);
-               }
-               preempt_enable();
-               set_current_state(TASK_INTERRUPTIBLE);
-       }
-       __set_current_state(TASK_RUNNING);
-       return 0;
+       return local_softirq_pending();
+}
 
-wait_to_die:
-       preempt_enable();
-       /* Wait for kthread_stop */
-       set_current_state(TASK_INTERRUPTIBLE);
-       while (!kthread_should_stop()) {
-               schedule();
-               set_current_state(TASK_INTERRUPTIBLE);
+static void run_ksoftirqd(unsigned int cpu)
+{
+       local_irq_disable();
+       if (local_softirq_pending()) {
+               __do_softirq();
+               rcu_note_context_switch(cpu);
+               local_irq_enable();
+               cond_resched();
+               return;
        }
-       __set_current_state(TASK_RUNNING);
-       return 0;
+       local_irq_enable();
 }
 
 #ifdef CONFIG_HOTPLUG_CPU
@@ -850,50 +824,14 @@ static int __cpuinit cpu_callback(struct notifier_block *nfb,
                                  unsigned long action,
                                  void *hcpu)
 {
-       int hotcpu = (unsigned long)hcpu;
-       struct task_struct *p;
-
        switch (action) {
-       case CPU_UP_PREPARE:
-       case CPU_UP_PREPARE_FROZEN:
-               p = kthread_create_on_node(run_ksoftirqd,
-                                          hcpu,
-                                          cpu_to_node(hotcpu),
-                                          "ksoftirqd/%d", hotcpu);
-               if (IS_ERR(p)) {
-                       printk("ksoftirqd for %i failed\n", hotcpu);
-                       return notifier_from_errno(PTR_ERR(p));
-               }
-               kthread_bind(p, hotcpu);
-               per_cpu(ksoftirqd, hotcpu) = p;
-               break;
-       case CPU_ONLINE:
-       case CPU_ONLINE_FROZEN:
-               wake_up_process(per_cpu(ksoftirqd, hotcpu));
-               break;
 #ifdef CONFIG_HOTPLUG_CPU
-       case CPU_UP_CANCELED:
-       case CPU_UP_CANCELED_FROZEN:
-               if (!per_cpu(ksoftirqd, hotcpu))
-                       break;
-               /* Unbind so it can run.  Fall thru. */
-               kthread_bind(per_cpu(ksoftirqd, hotcpu),
-                            cpumask_any(cpu_online_mask));
        case CPU_DEAD:
-       case CPU_DEAD_FROZEN: {
-               static const struct sched_param param = {
-                       .sched_priority = MAX_RT_PRIO-1
-               };
-
-               p = per_cpu(ksoftirqd, hotcpu);
-               per_cpu(ksoftirqd, hotcpu) = NULL;
-               sched_setscheduler_nocheck(p, SCHED_FIFO, &param);
-               kthread_stop(p);
-               takeover_tasklets(hotcpu);
+       case CPU_DEAD_FROZEN:
+               takeover_tasklets((unsigned long)hcpu);
                break;
-       }
 #endif /* CONFIG_HOTPLUG_CPU */
-       }
+       }
        return NOTIFY_OK;
 }
 
@@ -901,14 +839,19 @@ static struct notifier_block __cpuinitdata cpu_nfb = {
        .notifier_call = cpu_callback
 };
 
+static struct smp_hotplug_thread softirq_threads = {
+       .store                  = &ksoftirqd,
+       .thread_should_run      = ksoftirqd_should_run,
+       .thread_fn              = run_ksoftirqd,
+       .thread_comm            = "ksoftirqd/%u",
+};
+
 static __init int spawn_ksoftirqd(void)
 {
-       void *cpu = (void *)(long)smp_processor_id();
-       int err = cpu_callback(&cpu_nfb, CPU_UP_PREPARE, cpu);
-
-       BUG_ON(err != NOTIFY_OK);
-       cpu_callback(&cpu_nfb, CPU_ONLINE, cpu);
        register_cpu_notifier(&cpu_nfb);
+
+       BUG_ON(smpboot_register_percpu_thread(&softirq_threads));
+
        return 0;
 }
 early_initcall(spawn_ksoftirqd);
index 87174ef59161eb32ce298f5ef9c2b0e9feb1b3f1..81c7b1a1a30745b9bf108b84c4ff116a483552e5 100644 (file)
@@ -307,7 +307,7 @@ static struct ctl_table kern_table[] = {
                .extra2         = &max_sched_tunable_scaling,
        },
        {
-               .procname       = "sched_migration_cost",
+               .procname       = "sched_migration_cost_ns",
                .data           = &sysctl_sched_migration_cost,
                .maxlen         = sizeof(unsigned int),
                .mode           = 0644,
@@ -321,14 +321,14 @@ static struct ctl_table kern_table[] = {
                .proc_handler   = proc_dointvec,
        },
        {
-               .procname       = "sched_time_avg",
+               .procname       = "sched_time_avg_ms",
                .data           = &sysctl_sched_time_avg,
                .maxlen         = sizeof(unsigned int),
                .mode           = 0644,
                .proc_handler   = proc_dointvec,
        },
        {
-               .procname       = "sched_shares_window",
+               .procname       = "sched_shares_window_ns",
                .data           = &sysctl_sched_shares_window,
                .maxlen         = sizeof(unsigned int),
                .mode           = 0644,
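
The renames only append the unit (ns/ms) to the tunable names. A small user-space sketch reading one of the renamed files, with the path assumed from the usual /proc/sys mapping:

/* Read one of the renamed scheduler tunables from procfs. */
#include <stdio.h>

int main(void)
{
	const char *path = "/proc/sys/kernel/sched_migration_cost_ns";
	unsigned long cost_ns;
	FILE *f = fopen(path, "r");

	if (!f) {
		perror(path);
		return 1;
	}
	if (fscanf(f, "%lu", &cost_ns) == 1)
		printf("migration cost: %lu ns\n", cost_ns);
	fclose(f);
	return 0;
}
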
index 024540f97f74c3e94205826f33d3968ea765f626..3a9e5d5c10916a7e67c131df489617a485a39bfc 100644 (file)
@@ -573,6 +573,7 @@ static void tick_nohz_restart_sched_tick(struct tick_sched *ts, ktime_t now)
        tick_do_update_jiffies64(now);
        update_cpu_load_nohz();
 
+       calc_load_exit_idle();
        touch_softlockup_watchdog();
        /*
         * Cancel the scheduled timer and restore the tick
index 8c5e7b908c6814ed5342e63c65b69d4acb9ea037..d5de1b2292aad651da76bfbd2c5d037ff9852237 100644 (file)
@@ -92,24 +92,25 @@ static DEFINE_PER_CPU(struct tvec_base *, tvec_bases) = &boot_tvec_bases;
 /* Functions below help us manage 'deferrable' flag */
 static inline unsigned int tbase_get_deferrable(struct tvec_base *base)
 {
-       return ((unsigned int)(unsigned long)base & TBASE_DEFERRABLE_FLAG);
+       return ((unsigned int)(unsigned long)base & TIMER_DEFERRABLE);
 }
 
-static inline struct tvec_base *tbase_get_base(struct tvec_base *base)
+static inline unsigned int tbase_get_irqsafe(struct tvec_base *base)
 {
-       return ((struct tvec_base *)((unsigned long)base & ~TBASE_DEFERRABLE_FLAG));
+       return ((unsigned int)(unsigned long)base & TIMER_IRQSAFE);
 }
 
-static inline void timer_set_deferrable(struct timer_list *timer)
+static inline struct tvec_base *tbase_get_base(struct tvec_base *base)
 {
-       timer->base = TBASE_MAKE_DEFERRED(timer->base);
+       return ((struct tvec_base *)((unsigned long)base & ~TIMER_FLAG_MASK));
 }
 
 static inline void
 timer_set_base(struct timer_list *timer, struct tvec_base *new_base)
 {
-       timer->base = (struct tvec_base *)((unsigned long)(new_base) |
-                                     tbase_get_deferrable(timer->base));
+       unsigned long flags = (unsigned long)timer->base & TIMER_FLAG_MASK;
+
+       timer->base = (struct tvec_base *)((unsigned long)(new_base) | flags);
 }
 
 static unsigned long round_jiffies_common(unsigned long j, int cpu,
@@ -563,16 +564,14 @@ static inline void debug_timer_assert_init(struct timer_list *timer)
        debug_object_assert_init(timer, &timer_debug_descr);
 }
 
-static void __init_timer(struct timer_list *timer,
-                        const char *name,
-                        struct lock_class_key *key);
+static void do_init_timer(struct timer_list *timer, unsigned int flags,
+                         const char *name, struct lock_class_key *key);
 
-void init_timer_on_stack_key(struct timer_list *timer,
-                            const char *name,
-                            struct lock_class_key *key)
+void init_timer_on_stack_key(struct timer_list *timer, unsigned int flags,
+                            const char *name, struct lock_class_key *key)
 {
        debug_object_init_on_stack(timer, &timer_debug_descr);
-       __init_timer(timer, name, key);
+       do_init_timer(timer, flags, name, key);
 }
 EXPORT_SYMBOL_GPL(init_timer_on_stack_key);
 
@@ -613,12 +612,13 @@ static inline void debug_assert_init(struct timer_list *timer)
        debug_timer_assert_init(timer);
 }
 
-static void __init_timer(struct timer_list *timer,
-                        const char *name,
-                        struct lock_class_key *key)
+static void do_init_timer(struct timer_list *timer, unsigned int flags,
+                         const char *name, struct lock_class_key *key)
 {
+       struct tvec_base *base = __raw_get_cpu_var(tvec_bases);
+
        timer->entry.next = NULL;
-       timer->base = __raw_get_cpu_var(tvec_bases);
+       timer->base = (void *)((unsigned long)base | flags);
        timer->slack = -1;
 #ifdef CONFIG_TIMER_STATS
        timer->start_site = NULL;
@@ -628,22 +628,10 @@ static void __init_timer(struct timer_list *timer,
        lockdep_init_map(&timer->lockdep_map, name, key, 0);
 }
 
-void setup_deferrable_timer_on_stack_key(struct timer_list *timer,
-                                        const char *name,
-                                        struct lock_class_key *key,
-                                        void (*function)(unsigned long),
-                                        unsigned long data)
-{
-       timer->function = function;
-       timer->data = data;
-       init_timer_on_stack_key(timer, name, key);
-       timer_set_deferrable(timer);
-}
-EXPORT_SYMBOL_GPL(setup_deferrable_timer_on_stack_key);
-
 /**
  * init_timer_key - initialize a timer
  * @timer: the timer to be initialized
+ * @flags: timer flags
  * @name: name of the timer
  * @key: lockdep class key of the fake lock used for tracking timer
  *       sync lock dependencies
@@ -651,24 +639,14 @@ EXPORT_SYMBOL_GPL(setup_deferrable_timer_on_stack_key);
  * init_timer_key() must be done to a timer prior calling *any* of the
  * other timer functions.
  */
-void init_timer_key(struct timer_list *timer,
-                   const char *name,
-                   struct lock_class_key *key)
+void init_timer_key(struct timer_list *timer, unsigned int flags,
+                   const char *name, struct lock_class_key *key)
 {
        debug_init(timer);
-       __init_timer(timer, name, key);
+       do_init_timer(timer, flags, name, key);
 }
 EXPORT_SYMBOL(init_timer_key);
 
-void init_timer_deferrable_key(struct timer_list *timer,
-                              const char *name,
-                              struct lock_class_key *key)
-{
-       init_timer_key(timer, name, key);
-       timer_set_deferrable(timer);
-}
-EXPORT_SYMBOL(init_timer_deferrable_key);
-
 static inline void detach_timer(struct timer_list *timer, bool clear_pending)
 {
        struct list_head *entry = &timer->entry;
@@ -686,7 +664,7 @@ detach_expired_timer(struct timer_list *timer, struct tvec_base *base)
 {
        detach_timer(timer, true);
        if (!tbase_get_deferrable(timer->base))
-               timer->base->active_timers--;
+               base->active_timers--;
 }
 
 static int detach_if_pending(struct timer_list *timer, struct tvec_base *base,
@@ -697,7 +675,7 @@ static int detach_if_pending(struct timer_list *timer, struct tvec_base *base,
 
        detach_timer(timer, clear_pending);
        if (!tbase_get_deferrable(timer->base)) {
-               timer->base->active_timers--;
+               base->active_timers--;
                if (timer->expires == base->next_timer)
                        base->next_timer = base->timer_jiffies;
        }
@@ -1029,14 +1007,14 @@ EXPORT_SYMBOL(try_to_del_timer_sync);
  *
  * Synchronization rules: Callers must prevent restarting of the timer,
  * otherwise this function is meaningless. It must not be called from
- * interrupt contexts. The caller must not hold locks which would prevent
- * completion of the timer's handler. The timer's handler must not call
- * add_timer_on(). Upon exit the timer is not queued and the handler is
- * not running on any CPU.
+ * interrupt contexts unless the timer is an irqsafe one. The caller must
+ * not hold locks which would prevent completion of the timer's
+ * handler. The timer's handler must not call add_timer_on(). Upon exit the
+ * timer is not queued and the handler is not running on any CPU.
  *
- * Note: You must not hold locks that are held in interrupt context
- *   while calling this function. Even if the lock has nothing to do
- *   with the timer in question.  Here's why:
+ * Note: For !irqsafe timers, you must not hold locks that are held in
+ *   interrupt context while calling this function. Even if the lock has
+ *   nothing to do with the timer in question.  Here's why:
  *
  *    CPU0                             CPU1
  *    ----                             ----
@@ -1073,7 +1051,7 @@ int del_timer_sync(struct timer_list *timer)
         * don't use it in hardirq context, because it
         * could lead to deadlock.
         */
-       WARN_ON(in_irq());
+       WARN_ON(in_irq() && !tbase_get_irqsafe(timer->base));
        for (;;) {
                int ret = try_to_del_timer_sync(timer);
                if (ret >= 0)
@@ -1180,19 +1158,27 @@ static inline void __run_timers(struct tvec_base *base)
                while (!list_empty(head)) {
                        void (*fn)(unsigned long);
                        unsigned long data;
+                       bool irqsafe;
 
                        timer = list_first_entry(head, struct timer_list,entry);
                        fn = timer->function;
                        data = timer->data;
+                       irqsafe = tbase_get_irqsafe(timer->base);
 
                        timer_stats_account_timer(timer);
 
                        base->running_timer = timer;
                        detach_expired_timer(timer, base);
 
-                       spin_unlock_irq(&base->lock);
-                       call_timer_fn(timer, fn, data);
-                       spin_lock_irq(&base->lock);
+                       if (irqsafe) {
+                               spin_unlock(&base->lock);
+                               call_timer_fn(timer, fn, data);
+                               spin_lock(&base->lock);
+                       } else {
+                               spin_unlock_irq(&base->lock);
+                               call_timer_fn(timer, fn, data);
+                               spin_lock_irq(&base->lock);
+                       }
                }
        }
        base->running_timer = NULL;
@@ -1791,9 +1777,13 @@ static struct notifier_block __cpuinitdata timers_nb = {
 
 void __init init_timers(void)
 {
-       int err = timer_cpu_notify(&timers_nb, (unsigned long)CPU_UP_PREPARE,
-                               (void *)(long)smp_processor_id());
+       int err;
+
+       /* ensure there are enough low bits for flags in timer->base pointer */
+       BUILD_BUG_ON(__alignof__(struct tvec_base) & TIMER_FLAG_MASK);
 
+       err = timer_cpu_notify(&timers_nb, (unsigned long)CPU_UP_PREPARE,
+                              (void *)(long)smp_processor_id());
        init_timer_stats();
 
        BUG_ON(err != NOTIFY_OK);
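
The timer changes above fold TIMER_DEFERRABLE and TIMER_IRQSAFE into the low bits of the tvec_base pointer, relying on the base's alignment (hence the new BUILD_BUG_ON). A standalone sketch of the tagging trick, with the flag values assumed to match include/linux/timer.h:

/* Pointer tagging as used for timer->base; flag values assumed. */
#include <stdio.h>
#include <stdint.h>
#include <assert.h>

#define TIMER_DEFERRABLE 0x1UL
#define TIMER_IRQSAFE    0x2UL
#define TIMER_FLAG_MASK  0x3UL

struct tvec_base { int dummy; } __attribute__((aligned(4)));

static struct tvec_base *tbase_get_base(void *tagged)
{
	return (struct tvec_base *)((uintptr_t)tagged & ~TIMER_FLAG_MASK);
}

static unsigned int tbase_get_irqsafe(void *tagged)
{
	return (uintptr_t)tagged & TIMER_IRQSAFE;
}

int main(void)
{
	static struct tvec_base base;

	/* the BUILD_BUG_ON in the diff checks exactly this alignment */
	assert(((uintptr_t)&base & TIMER_FLAG_MASK) == 0);

	void *tagged = (void *)((uintptr_t)&base | TIMER_IRQSAFE);
	printf("base=%p irqsafe=%u\n", (void *)tbase_get_base(tagged),
	       tbase_get_irqsafe(tagged) ? 1 : 0);
	return 0;
}
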
index 8c4c07071cc5c3bfd014036689e450529b52dc29..9301a0e35e0cf857af0664cfc16951d8125e0b0e 100644 (file)
@@ -49,6 +49,11 @@ config HAVE_SYSCALL_TRACEPOINTS
        help
          See Documentation/trace/ftrace-design.txt
 
+config HAVE_FENTRY
+       bool
+       help
+         Arch supports the gcc -pg option together with -mfentry
+
 config HAVE_C_RECORDMCOUNT
        bool
        help
index b831087c8200c9548bb5334ea7a2cbd301410ac4..837090808aac03c3632a00e586e6a2a20b74a76a 100644 (file)
@@ -5,10 +5,12 @@ ifdef CONFIG_FUNCTION_TRACER
 ORIG_CFLAGS := $(KBUILD_CFLAGS)
 KBUILD_CFLAGS = $(subst -pg,,$(ORIG_CFLAGS))
 
+ifdef CONFIG_FTRACE_SELFTEST
 # selftest needs instrumentation
 CFLAGS_trace_selftest_dynamic.o = -pg
 obj-y += trace_selftest_dynamic.o
 endif
+endif
 
 # If unlikely tracing is enabled, do not trace these files
 ifdef CONFIG_TRACING_BRANCHES
index b4f20fba09fcc77dc571bdf718bfd04adfb29897..9dcf15d38380356a2288ec97b098c002d9bca01d 100644 (file)
 
 #define FL_GLOBAL_CONTROL_MASK (FTRACE_OPS_FL_GLOBAL | FTRACE_OPS_FL_CONTROL)
 
+static struct ftrace_ops ftrace_list_end __read_mostly = {
+       .func           = ftrace_stub,
+       .flags          = FTRACE_OPS_FL_RECURSION_SAFE,
+};
+
 /* ftrace_enabled is a method to turn ftrace on or off */
 int ftrace_enabled __read_mostly;
 static int last_ftrace_enabled;
 
 /* Quick disabling of function tracer. */
-int function_trace_stop;
+int function_trace_stop __read_mostly;
+
+/* Current function tracing op */
+struct ftrace_ops *function_trace_op __read_mostly = &ftrace_list_end;
 
 /* List for set_ftrace_pid's pids. */
 LIST_HEAD(ftrace_pids);
@@ -86,22 +94,43 @@ static int ftrace_disabled __read_mostly;
 
 static DEFINE_MUTEX(ftrace_lock);
 
-static struct ftrace_ops ftrace_list_end __read_mostly = {
-       .func           = ftrace_stub,
-};
-
 static struct ftrace_ops *ftrace_global_list __read_mostly = &ftrace_list_end;
 static struct ftrace_ops *ftrace_control_list __read_mostly = &ftrace_list_end;
 static struct ftrace_ops *ftrace_ops_list __read_mostly = &ftrace_list_end;
 ftrace_func_t ftrace_trace_function __read_mostly = ftrace_stub;
-static ftrace_func_t __ftrace_trace_function_delay __read_mostly = ftrace_stub;
-ftrace_func_t __ftrace_trace_function __read_mostly = ftrace_stub;
 ftrace_func_t ftrace_pid_function __read_mostly = ftrace_stub;
 static struct ftrace_ops global_ops;
 static struct ftrace_ops control_ops;
 
-static void
-ftrace_ops_list_func(unsigned long ip, unsigned long parent_ip);
+#if ARCH_SUPPORTS_FTRACE_OPS
+static void ftrace_ops_list_func(unsigned long ip, unsigned long parent_ip,
+                                struct ftrace_ops *op, struct pt_regs *regs);
+#else
+/* See comment below, where ftrace_ops_list_func is defined */
+static void ftrace_ops_no_ops(unsigned long ip, unsigned long parent_ip);
+#define ftrace_ops_list_func ((ftrace_func_t)ftrace_ops_no_ops)
+#endif
+
+/**
+ * ftrace_nr_registered_ops - return number of ops registered
+ *
+ * Returns the number of ftrace_ops registered and tracing functions
+ */
+int ftrace_nr_registered_ops(void)
+{
+       struct ftrace_ops *ops;
+       int cnt = 0;
+
+       mutex_lock(&ftrace_lock);
+
+       for (ops = ftrace_ops_list;
+            ops != &ftrace_list_end; ops = ops->next)
+               cnt++;
+
+       mutex_unlock(&ftrace_lock);
+
+       return cnt;
+}
 
 /*
  * Traverse the ftrace_global_list, invoking all entries.  The reason that we
@@ -112,29 +141,29 @@ ftrace_ops_list_func(unsigned long ip, unsigned long parent_ip);
  *
  * Silly Alpha and silly pointer-speculation compiler optimizations!
  */
-static void ftrace_global_list_func(unsigned long ip,
-                                   unsigned long parent_ip)
+static void
+ftrace_global_list_func(unsigned long ip, unsigned long parent_ip,
+                       struct ftrace_ops *op, struct pt_regs *regs)
 {
-       struct ftrace_ops *op;
-
        if (unlikely(trace_recursion_test(TRACE_GLOBAL_BIT)))
                return;
 
        trace_recursion_set(TRACE_GLOBAL_BIT);
        op = rcu_dereference_raw(ftrace_global_list); /*see above*/
        while (op != &ftrace_list_end) {
-               op->func(ip, parent_ip);
+               op->func(ip, parent_ip, op, regs);
                op = rcu_dereference_raw(op->next); /*see above*/
        };
        trace_recursion_clear(TRACE_GLOBAL_BIT);
 }
 
-static void ftrace_pid_func(unsigned long ip, unsigned long parent_ip)
+static void ftrace_pid_func(unsigned long ip, unsigned long parent_ip,
+                           struct ftrace_ops *op, struct pt_regs *regs)
 {
        if (!test_tsk_trace_trace(current))
                return;
 
-       ftrace_pid_function(ip, parent_ip);
+       ftrace_pid_function(ip, parent_ip, op, regs);
 }
 
 static void set_ftrace_pid_function(ftrace_func_t func)
@@ -153,25 +182,9 @@ static void set_ftrace_pid_function(ftrace_func_t func)
 void clear_ftrace_function(void)
 {
        ftrace_trace_function = ftrace_stub;
-       __ftrace_trace_function = ftrace_stub;
-       __ftrace_trace_function_delay = ftrace_stub;
        ftrace_pid_function = ftrace_stub;
 }
 
-#ifndef CONFIG_HAVE_FUNCTION_TRACE_MCOUNT_TEST
-/*
- * For those archs that do not test ftrace_trace_stop in their
- * mcount call site, we need to do it from C.
- */
-static void ftrace_test_stop_func(unsigned long ip, unsigned long parent_ip)
-{
-       if (function_trace_stop)
-               return;
-
-       __ftrace_trace_function(ip, parent_ip);
-}
-#endif
-
 static void control_ops_disable_all(struct ftrace_ops *ops)
 {
        int cpu;
@@ -230,28 +243,27 @@ static void update_ftrace_function(void)
 
        /*
         * If we are at the end of the list and this ops is
-        * not dynamic, then have the mcount trampoline call
-        * the function directly
+        * recursion safe and not dynamic and the arch supports passing ops,
+        * then have the mcount trampoline call the function directly.
         */
        if (ftrace_ops_list == &ftrace_list_end ||
            (ftrace_ops_list->next == &ftrace_list_end &&
-            !(ftrace_ops_list->flags & FTRACE_OPS_FL_DYNAMIC)))
+            !(ftrace_ops_list->flags & FTRACE_OPS_FL_DYNAMIC) &&
+            (ftrace_ops_list->flags & FTRACE_OPS_FL_RECURSION_SAFE) &&
+            !FTRACE_FORCE_LIST_FUNC)) {
+               /* Set the ftrace_ops that the arch callback uses */
+               if (ftrace_ops_list == &global_ops)
+                       function_trace_op = ftrace_global_list;
+               else
+                       function_trace_op = ftrace_ops_list;
                func = ftrace_ops_list->func;
-       else
+       } else {
+               /* Just use the default ftrace_ops */
+               function_trace_op = &ftrace_list_end;
                func = ftrace_ops_list_func;
+       }
 
-#ifdef CONFIG_HAVE_FUNCTION_TRACE_MCOUNT_TEST
        ftrace_trace_function = func;
-#else
-#ifdef CONFIG_DYNAMIC_FTRACE
-       /* do not update till all functions have been modified */
-       __ftrace_trace_function_delay = func;
-#else
-       __ftrace_trace_function = func;
-#endif
-       ftrace_trace_function =
-               (func == ftrace_stub) ? func : ftrace_test_stop_func;
-#endif
 }
 
 static void add_ftrace_ops(struct ftrace_ops **list, struct ftrace_ops *ops)
@@ -325,6 +337,20 @@ static int __register_ftrace_function(struct ftrace_ops *ops)
        if ((ops->flags & FL_GLOBAL_CONTROL_MASK) == FL_GLOBAL_CONTROL_MASK)
                return -EINVAL;
 
+#ifndef ARCH_SUPPORTS_FTRACE_SAVE_REGS
+       /*
+        * If the ftrace_ops specifies SAVE_REGS, then it only can be used
+        * if the arch supports it, or SAVE_REGS_IF_SUPPORTED is also set.
+        * Setting SAVE_REGS_IF_SUPPORTED makes SAVE_REGS irrelevant.
+        */
+       if (ops->flags & FTRACE_OPS_FL_SAVE_REGS &&
+           !(ops->flags & FTRACE_OPS_FL_SAVE_REGS_IF_SUPPORTED))
+               return -EINVAL;
+
+       if (ops->flags & FTRACE_OPS_FL_SAVE_REGS_IF_SUPPORTED)
+               ops->flags |= FTRACE_OPS_FL_SAVE_REGS;
+#endif
+
        if (!core_kernel_data((unsigned long)ops))
                ops->flags |= FTRACE_OPS_FL_DYNAMIC;
 
@@ -773,7 +799,8 @@ ftrace_profile_alloc(struct ftrace_profile_stat *stat, unsigned long ip)
 }
 
 static void
-function_profile_call(unsigned long ip, unsigned long parent_ip)
+function_profile_call(unsigned long ip, unsigned long parent_ip,
+                     struct ftrace_ops *ops, struct pt_regs *regs)
 {
        struct ftrace_profile_stat *stat;
        struct ftrace_profile *rec;
@@ -803,7 +830,7 @@ function_profile_call(unsigned long ip, unsigned long parent_ip)
 #ifdef CONFIG_FUNCTION_GRAPH_TRACER
 static int profile_graph_entry(struct ftrace_graph_ent *trace)
 {
-       function_profile_call(trace->func, 0);
+       function_profile_call(trace->func, 0, NULL, NULL);
        return 1;
 }
 
@@ -863,6 +890,7 @@ static void unregister_ftrace_profiler(void)
 #else
 static struct ftrace_ops ftrace_profile_ops __read_mostly = {
        .func           = function_profile_call,
+       .flags          = FTRACE_OPS_FL_RECURSION_SAFE,
 };
 
 static int register_ftrace_profiler(void)
@@ -1045,6 +1073,7 @@ static struct ftrace_ops global_ops = {
        .func                   = ftrace_stub,
        .notrace_hash           = EMPTY_HASH,
        .filter_hash            = EMPTY_HASH,
+       .flags                  = FTRACE_OPS_FL_RECURSION_SAFE,
 };
 
 static DEFINE_MUTEX(ftrace_regex_lock);
@@ -1525,6 +1554,12 @@ static void __ftrace_hash_rec_update(struct ftrace_ops *ops,
                        rec->flags++;
                        if (FTRACE_WARN_ON((rec->flags & ~FTRACE_FL_MASK) == FTRACE_REF_MAX))
                                return;
+                       /*
+                        * If any ops wants regs saved for this function
+                        * then all ops will get saved regs.
+                        */
+                       if (ops->flags & FTRACE_OPS_FL_SAVE_REGS)
+                               rec->flags |= FTRACE_FL_REGS;
                } else {
                        if (FTRACE_WARN_ON((rec->flags & ~FTRACE_FL_MASK) == 0))
                                return;
@@ -1616,18 +1651,59 @@ static int ftrace_check_record(struct dyn_ftrace *rec, int enable, int update)
        if (enable && (rec->flags & ~FTRACE_FL_MASK))
                flag = FTRACE_FL_ENABLED;
 
+       /*
+        * If enabling and the REGS flag does not match the REGS_EN, then
+        * do not ignore this record. Set flags to fail the compare against
+        * ENABLED.
+        */
+       if (flag &&
+           (!(rec->flags & FTRACE_FL_REGS) != !(rec->flags & FTRACE_FL_REGS_EN)))
+               flag |= FTRACE_FL_REGS;
+
        /* If the state of this record hasn't changed, then do nothing */
        if ((rec->flags & FTRACE_FL_ENABLED) == flag)
                return FTRACE_UPDATE_IGNORE;
 
        if (flag) {
-               if (update)
+               /* Save off if rec is being enabled (for return value) */
+               flag ^= rec->flags & FTRACE_FL_ENABLED;
+
+               if (update) {
                        rec->flags |= FTRACE_FL_ENABLED;
-               return FTRACE_UPDATE_MAKE_CALL;
+                       if (flag & FTRACE_FL_REGS) {
+                               if (rec->flags & FTRACE_FL_REGS)
+                                       rec->flags |= FTRACE_FL_REGS_EN;
+                               else
+                                       rec->flags &= ~FTRACE_FL_REGS_EN;
+                       }
+               }
+
+               /*
+                * If this record is being updated from a nop, then
+                *   return UPDATE_MAKE_CALL.
+                * Otherwise, if the EN flag is set, then return
+                *   UPDATE_MODIFY_CALL_REGS to tell the caller to convert
+                *   from the non-save regs, to a save regs function.
+                * Otherwise,
+                *   return UPDATE_MODIFY_CALL to tell the caller to convert
+                *   from the save regs, to a non-save regs function.
+                */
+               if (flag & FTRACE_FL_ENABLED)
+                       return FTRACE_UPDATE_MAKE_CALL;
+               else if (rec->flags & FTRACE_FL_REGS_EN)
+                       return FTRACE_UPDATE_MODIFY_CALL_REGS;
+               else
+                       return FTRACE_UPDATE_MODIFY_CALL;
        }
 
-       if (update)
-               rec->flags &= ~FTRACE_FL_ENABLED;
+       if (update) {
+               /* If there's no more users, clear all flags */
+               if (!(rec->flags & ~FTRACE_FL_MASK))
+                       rec->flags = 0;
+               else
+                       /* Just disable the record (keep REGS state) */
+                       rec->flags &= ~FTRACE_FL_ENABLED;
+       }
 
        return FTRACE_UPDATE_MAKE_NOP;
 }
@@ -1662,13 +1738,17 @@ int ftrace_test_record(struct dyn_ftrace *rec, int enable)
 static int
 __ftrace_replace_code(struct dyn_ftrace *rec, int enable)
 {
+       unsigned long ftrace_old_addr;
        unsigned long ftrace_addr;
        int ret;
 
-       ftrace_addr = (unsigned long)FTRACE_ADDR;
-
        ret = ftrace_update_record(rec, enable);
 
+       if (rec->flags & FTRACE_FL_REGS)
+               ftrace_addr = (unsigned long)FTRACE_REGS_ADDR;
+       else
+               ftrace_addr = (unsigned long)FTRACE_ADDR;
+
        switch (ret) {
        case FTRACE_UPDATE_IGNORE:
                return 0;
@@ -1678,6 +1758,15 @@ __ftrace_replace_code(struct dyn_ftrace *rec, int enable)
 
        case FTRACE_UPDATE_MAKE_NOP:
                return ftrace_make_nop(NULL, rec, ftrace_addr);
+
+       case FTRACE_UPDATE_MODIFY_CALL_REGS:
+       case FTRACE_UPDATE_MODIFY_CALL:
+               if (rec->flags & FTRACE_FL_REGS)
+                       ftrace_old_addr = (unsigned long)FTRACE_ADDR;
+               else
+                       ftrace_old_addr = (unsigned long)FTRACE_REGS_ADDR;
+
+               return ftrace_modify_call(rec, ftrace_old_addr, ftrace_addr);
        }
 
        return -1; /* unknown ftrace bug */
@@ -1882,16 +1971,6 @@ static void ftrace_run_update_code(int command)
         */
        arch_ftrace_update_code(command);
 
-#ifndef CONFIG_HAVE_FUNCTION_TRACE_MCOUNT_TEST
-       /*
-        * For archs that call ftrace_test_stop_func(), we must
-        * wait till after we update all the function callers
-        * before we update the callback. This keeps different
-        * ops that record different functions from corrupting
-        * each other.
-        */
-       __ftrace_trace_function = __ftrace_trace_function_delay;
-#endif
        function_trace_stop--;
 
        ret = ftrace_arch_code_modify_post_process();
@@ -2441,8 +2520,9 @@ static int t_show(struct seq_file *m, void *v)
 
        seq_printf(m, "%ps", (void *)rec->ip);
        if (iter->flags & FTRACE_ITER_ENABLED)
-               seq_printf(m, " (%ld)",
-                          rec->flags & ~FTRACE_FL_MASK);
+               seq_printf(m, " (%ld)%s",
+                          rec->flags & ~FTRACE_FL_MASK,
+                          rec->flags & FTRACE_FL_REGS ? " R" : "");
        seq_printf(m, "\n");
 
        return 0;
@@ -2790,8 +2870,8 @@ static int __init ftrace_mod_cmd_init(void)
 }
 device_initcall(ftrace_mod_cmd_init);
 
-static void
-function_trace_probe_call(unsigned long ip, unsigned long parent_ip)
+static void function_trace_probe_call(unsigned long ip, unsigned long parent_ip,
+                                     struct ftrace_ops *op, struct pt_regs *pt_regs)
 {
        struct ftrace_func_probe *entry;
        struct hlist_head *hhd;
@@ -3162,8 +3242,27 @@ ftrace_notrace_write(struct file *file, const char __user *ubuf,
 }
 
 static int
-ftrace_set_regex(struct ftrace_ops *ops, unsigned char *buf, int len,
-                int reset, int enable)
+ftrace_match_addr(struct ftrace_hash *hash, unsigned long ip, int remove)
+{
+       struct ftrace_func_entry *entry;
+
+       if (!ftrace_location(ip))
+               return -EINVAL;
+
+       if (remove) {
+               entry = ftrace_lookup_ip(hash, ip);
+               if (!entry)
+                       return -ENOENT;
+               free_hash_entry(hash, entry);
+               return 0;
+       }
+
+       return add_hash_entry(hash, ip);
+}
+
+static int
+ftrace_set_hash(struct ftrace_ops *ops, unsigned char *buf, int len,
+               unsigned long ip, int remove, int reset, int enable)
 {
        struct ftrace_hash **orig_hash;
        struct ftrace_hash *hash;
@@ -3192,6 +3291,11 @@ ftrace_set_regex(struct ftrace_ops *ops, unsigned char *buf, int len,
                ret = -EINVAL;
                goto out_regex_unlock;
        }
+       if (ip) {
+               ret = ftrace_match_addr(hash, ip, remove);
+               if (ret < 0)
+                       goto out_regex_unlock;
+       }
 
        mutex_lock(&ftrace_lock);
        ret = ftrace_hash_move(ops, enable, orig_hash, hash);
@@ -3208,6 +3312,37 @@ ftrace_set_regex(struct ftrace_ops *ops, unsigned char *buf, int len,
        return ret;
 }
 
+static int
+ftrace_set_addr(struct ftrace_ops *ops, unsigned long ip, int remove,
+               int reset, int enable)
+{
+       return ftrace_set_hash(ops, 0, 0, ip, remove, reset, enable);
+}
+
+/**
+ * ftrace_set_filter_ip - set a function to filter on in ftrace by address
+ * @ops - the ops to set the filter with
+ * @ip - the address to add to or remove from the filter.
+ * @remove - non-zero to remove the ip from the filter
+ * @reset - non-zero to reset all filters before applying this filter.
+ *
+ * Filters denote which functions should be enabled when tracing is enabled.
+ * If @ip is NULL, the filter update fails.
+ */
+int ftrace_set_filter_ip(struct ftrace_ops *ops, unsigned long ip,
+                        int remove, int reset)
+{
+       return ftrace_set_addr(ops, ip, remove, reset, 1);
+}
+EXPORT_SYMBOL_GPL(ftrace_set_filter_ip);
+
+static int
+ftrace_set_regex(struct ftrace_ops *ops, unsigned char *buf, int len,
+                int reset, int enable)
+{
+       return ftrace_set_hash(ops, buf, len, 0, 0, reset, enable);
+}
+
 /**
  * ftrace_set_filter - set a function to filter on in ftrace
  * @ops - the ops to set the filter with
@@ -3912,6 +4047,7 @@ void __init ftrace_init(void)
 
 static struct ftrace_ops global_ops = {
        .func                   = ftrace_stub,
+       .flags                  = FTRACE_OPS_FL_RECURSION_SAFE,
 };
 
 static int __init ftrace_nodyn_init(void)
@@ -3942,10 +4078,9 @@ ftrace_ops_test(struct ftrace_ops *ops, unsigned long ip)
 #endif /* CONFIG_DYNAMIC_FTRACE */
 
 static void
-ftrace_ops_control_func(unsigned long ip, unsigned long parent_ip)
+ftrace_ops_control_func(unsigned long ip, unsigned long parent_ip,
+                       struct ftrace_ops *op, struct pt_regs *regs)
 {
-       struct ftrace_ops *op;
-
        if (unlikely(trace_recursion_test(TRACE_CONTROL_BIT)))
                return;
 
@@ -3959,7 +4094,7 @@ ftrace_ops_control_func(unsigned long ip, unsigned long parent_ip)
        while (op != &ftrace_list_end) {
                if (!ftrace_function_local_disabled(op) &&
                    ftrace_ops_test(op, ip))
-                       op->func(ip, parent_ip);
+                       op->func(ip, parent_ip, op, regs);
 
                op = rcu_dereference_raw(op->next);
        };
@@ -3969,13 +4104,18 @@ ftrace_ops_control_func(unsigned long ip, unsigned long parent_ip)
 
 static struct ftrace_ops control_ops = {
        .func = ftrace_ops_control_func,
+       .flags = FTRACE_OPS_FL_RECURSION_SAFE,
 };
 
-static void
-ftrace_ops_list_func(unsigned long ip, unsigned long parent_ip)
+static inline void
+__ftrace_ops_list_func(unsigned long ip, unsigned long parent_ip,
+                      struct ftrace_ops *ignored, struct pt_regs *regs)
 {
        struct ftrace_ops *op;
 
+       if (function_trace_stop)
+               return;
+
        if (unlikely(trace_recursion_test(TRACE_INTERNAL_BIT)))
                return;
 
@@ -3988,13 +4128,39 @@ ftrace_ops_list_func(unsigned long ip, unsigned long parent_ip)
        op = rcu_dereference_raw(ftrace_ops_list);
        while (op != &ftrace_list_end) {
                if (ftrace_ops_test(op, ip))
-                       op->func(ip, parent_ip);
+                       op->func(ip, parent_ip, op, regs);
                op = rcu_dereference_raw(op->next);
        };
        preempt_enable_notrace();
        trace_recursion_clear(TRACE_INTERNAL_BIT);
 }
 
+/*
+ * Some archs only support passing ip and parent_ip. Even though
+ * the list function ignores the op parameter, we do not want any
+ * C side effects, where a function is called without the caller
+ * passing a third parameter.
+ * Archs must support regs and ftrace_ops at the same time:
+ * if they support ftrace_ops, it is assumed they also support regs.
+ * If callbacks want to use regs, they must either check for regs
+ * being NULL, or check ARCH_SUPPORTS_FTRACE_SAVE_REGS.
+ * Note, ARCH_SUPPORTS_FTRACE_SAVE_REGS expects a full pt_regs to be
+ * saved. An architecture can pass partial regs with ftrace_ops and
+ * still set ARCH_SUPPORTS_FTRACE_OPS.
+ */
+#if ARCH_SUPPORTS_FTRACE_OPS
+static void ftrace_ops_list_func(unsigned long ip, unsigned long parent_ip,
+                                struct ftrace_ops *op, struct pt_regs *regs)
+{
+       __ftrace_ops_list_func(ip, parent_ip, NULL, regs);
+}
+#else
+static void ftrace_ops_no_ops(unsigned long ip, unsigned long parent_ip)
+{
+       __ftrace_ops_list_func(ip, parent_ip, NULL, NULL);
+}
+#endif
+
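To illustrate the regs rules spelled out in the comment above, a hedged sketch of a callback that works on any arch by tolerating a NULL regs pointer (my_func is a hypothetical name):

    static void my_func(unsigned long ip, unsigned long parent_ip,
                        struct ftrace_ops *op, struct pt_regs *regs)
    {
            /* regs may be NULL unless the arch saved them for this ops */
            if (regs)
                    pr_debug("traced %pS\n",
                             (void *)instruction_pointer(regs));
    }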
 static void clear_ftrace_swapper(void)
 {
        struct task_struct *p;
index 49491fa7daa2d35546a6e5ba46d73103594c384e..b32ed0e385a59b40b971bd04e0e374d333992749 100644 (file)
@@ -2816,7 +2816,7 @@ EXPORT_SYMBOL_GPL(ring_buffer_record_enable);
  * to the buffer after this will fail and return NULL.
  *
  * This is different than ring_buffer_record_disable() as
- * it works like an on/off switch, where as the disable() verison
+ * it works like an on/off switch, whereas the disable() version
  * must be paired with an enable().
  */
 void ring_buffer_record_off(struct ring_buffer *buffer)
@@ -2839,7 +2839,7 @@ EXPORT_SYMBOL_GPL(ring_buffer_record_off);
  * ring_buffer_record_off().
  *
  * This is different than ring_buffer_record_enable() as
- * it works like an on/off switch, where as the enable() verison
+ * it works like an on/off switch, whereas the enable() version
  * must be paired with a disable().
  */
 void ring_buffer_record_on(struct ring_buffer *buffer)
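A sketch of the pairing rules these comments describe (quiesce_example is a hypothetical caller; the calls themselves are the real ring-buffer API):

    static void quiesce_example(struct ring_buffer *buffer)
    {
            /* disable/enable nest: every disable needs a matching enable */
            ring_buffer_record_disable(buffer);
            ring_buffer_record_disable(buffer);
            ring_buffer_record_enable(buffer);      /* still disabled */
            ring_buffer_record_enable(buffer);      /* recording again */

            /* off/on is a plain switch: calls do not nest */
            ring_buffer_record_off(buffer);
            ring_buffer_record_off(buffer);         /* no-op */
            ring_buffer_record_on(buffer);          /* recording again */
    }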
index adde0994911e537b821a65df64b9dd8920304981..883968ccfead0610043de6592d0796fdea6d4d18 100644 (file)
@@ -426,15 +426,15 @@ __setup("trace_buf_size=", set_buf_size);
 
 static int __init set_tracing_thresh(char *str)
 {
-       unsigned long threshhold;
+       unsigned long threshold;
        int ret;
 
        if (!str)
                return 0;
-       ret = strict_strtoul(str, 0, &threshhold);
+       ret = strict_strtoul(str, 0, &threshold);
        if (ret < 0)
                return 0;
-       tracing_thresh = threshhold * 1000;
+       tracing_thresh = threshold * 1000;
        return 1;
 }
 __setup("tracing_thresh=", set_tracing_thresh);
index 55e1f7f0db126edf4bbcbe07245a9558b1030bec..593debefc4e9218362840f015f5038f861379626 100644 (file)
@@ -472,11 +472,11 @@ extern void trace_find_cmdline(int pid, char comm[]);
 
 #ifdef CONFIG_DYNAMIC_FTRACE
 extern unsigned long ftrace_update_tot_cnt;
+#endif
 #define DYN_FTRACE_TEST_NAME trace_selftest_dynamic_test_func
 extern int DYN_FTRACE_TEST_NAME(void);
 #define DYN_FTRACE_TEST_NAME2 trace_selftest_dynamic_test_func2
 extern int DYN_FTRACE_TEST_NAME2(void);
-#endif
 
 extern int ring_buffer_expanded;
 extern bool tracing_selftest_disabled;
index 8a6d2ee2086c2cd721dc29ae92d23082370e10db..84b1e045faba836583fb371d285f49e79bfa6bd0 100644 (file)
@@ -258,7 +258,8 @@ EXPORT_SYMBOL_GPL(perf_trace_buf_prepare);
 
 #ifdef CONFIG_FUNCTION_TRACER
 static void
-perf_ftrace_function_call(unsigned long ip, unsigned long parent_ip)
+perf_ftrace_function_call(unsigned long ip, unsigned long parent_ip,
+                         struct ftrace_ops *ops, struct pt_regs *pt_regs)
 {
        struct ftrace_entry *entry;
        struct hlist_head *head;
index 29111da1d1006c1f7b57a939b54517d17f315520..6825d833a25711ea49487e673c17a1c6512f4bf4 100644 (file)
@@ -1681,7 +1681,8 @@ static __init void event_trace_self_tests(void)
 static DEFINE_PER_CPU(atomic_t, ftrace_test_event_disable);
 
 static void
-function_test_events_call(unsigned long ip, unsigned long parent_ip)
+function_test_events_call(unsigned long ip, unsigned long parent_ip,
+                         struct ftrace_ops *op, struct pt_regs *pt_regs)
 {
        struct ring_buffer_event *event;
        struct ring_buffer *buffer;
@@ -1720,6 +1721,7 @@ function_test_events_call(unsigned long ip, unsigned long parent_ip)
 static struct ftrace_ops trace_ops __initdata  =
 {
        .func = function_test_events_call,
+       .flags = FTRACE_OPS_FL_RECURSION_SAFE,
 };
 
 static __init void event_trace_self_test_with_function(void)
index 431dba8b754214ee06b4d7163dc3abd91b2718b5..c154797a7ff7afa80f5d79571eae87fb609eec19 100644 (file)
@@ -2002,7 +2002,7 @@ static int ftrace_function_set_regexp(struct ftrace_ops *ops, int filter,
 static int __ftrace_function_set_filter(int filter, char *buf, int len,
                                        struct function_filter_data *data)
 {
-       int i, re_cnt, ret;
+       int i, re_cnt, ret = -EINVAL;
        int *reset;
        char **re;
 
index a426f410c06053a5191f8f634138b07f049b6c38..483162a9f9080258fa08b76247e40103d8e5a6c5 100644 (file)
@@ -49,7 +49,8 @@ static void function_trace_start(struct trace_array *tr)
 }
 
 static void
-function_trace_call_preempt_only(unsigned long ip, unsigned long parent_ip)
+function_trace_call_preempt_only(unsigned long ip, unsigned long parent_ip,
+                                struct ftrace_ops *op, struct pt_regs *pt_regs)
 {
        struct trace_array *tr = func_trace;
        struct trace_array_cpu *data;
@@ -84,7 +85,9 @@ enum {
 static struct tracer_flags func_flags;
 
 static void
-function_trace_call(unsigned long ip, unsigned long parent_ip)
+function_trace_call(unsigned long ip, unsigned long parent_ip,
+                   struct ftrace_ops *op, struct pt_regs *pt_regs)
 {
        struct trace_array *tr = func_trace;
        struct trace_array_cpu *data;
@@ -121,7 +124,8 @@ function_trace_call(unsigned long ip, unsigned long parent_ip)
 }
 
 static void
-function_stack_trace_call(unsigned long ip, unsigned long parent_ip)
+function_stack_trace_call(unsigned long ip, unsigned long parent_ip,
+                         struct ftrace_ops *op, struct pt_regs *pt_regs)
 {
        struct trace_array *tr = func_trace;
        struct trace_array_cpu *data;
@@ -164,13 +168,13 @@ function_stack_trace_call(unsigned long ip, unsigned long parent_ip)
 static struct ftrace_ops trace_ops __read_mostly =
 {
        .func = function_trace_call,
-       .flags = FTRACE_OPS_FL_GLOBAL,
+       .flags = FTRACE_OPS_FL_GLOBAL | FTRACE_OPS_FL_RECURSION_SAFE,
 };
 
 static struct ftrace_ops trace_stack_ops __read_mostly =
 {
        .func = function_stack_trace_call,
-       .flags = FTRACE_OPS_FL_GLOBAL,
+       .flags = FTRACE_OPS_FL_GLOBAL | FTRACE_OPS_FL_RECURSION_SAFE,
 };
 
 static struct tracer_opt func_opts[] = {
index ce27c8ba8d318ffa6337f231e12a680f6deb7720..99b4378393d5fae8ea1c8de91032129f52a45216 100644 (file)
@@ -143,7 +143,7 @@ ftrace_pop_return_trace(struct ftrace_graph_ret *trace, unsigned long *ret,
                return;
        }
 
-#ifdef CONFIG_HAVE_FUNCTION_GRAPH_FP_TEST
+#if defined(CONFIG_HAVE_FUNCTION_GRAPH_FP_TEST) && !defined(CC_USING_FENTRY)
        /*
         * The arch may choose to record the frame pointer used
         * and check it here to make sure that it is what we expect it
@@ -154,6 +154,9 @@ ftrace_pop_return_trace(struct ftrace_graph_ret *trace, unsigned long *ret,
         *
         * Currently, x86_32 with optimize for size (-Os) makes the latest
         * gcc do the above.
+        *
+        * Note, -mfentry does not use frame pointers, and this test
+        *  is not needed if CC_USING_FENTRY is set.
         */
        if (unlikely(current->ret_stack[index].fp != frame_pointer)) {
                ftrace_graph_stop();
index 99d20e9203686420e74af5c2eca6387deb375028..d98ee8283b29a1e204a626f7a311ae2ece2ac971 100644 (file)
@@ -136,7 +136,8 @@ static int func_prolog_dec(struct trace_array *tr,
  * irqsoff uses its own tracer function to keep the overhead down:
  */
 static void
-irqsoff_tracer_call(unsigned long ip, unsigned long parent_ip)
+irqsoff_tracer_call(unsigned long ip, unsigned long parent_ip,
+                   struct ftrace_ops *op, struct pt_regs *pt_regs)
 {
        struct trace_array *tr = irqsoff_trace;
        struct trace_array_cpu *data;
@@ -153,7 +154,7 @@ irqsoff_tracer_call(unsigned long ip, unsigned long parent_ip)
 static struct ftrace_ops trace_ops __read_mostly =
 {
        .func = irqsoff_tracer_call,
-       .flags = FTRACE_OPS_FL_GLOBAL,
+       .flags = FTRACE_OPS_FL_GLOBAL | FTRACE_OPS_FL_RECURSION_SAFE,
 };
 #endif /* CONFIG_FUNCTION_TRACER */
 
index ff791ea48b570b9f8afe00624d622dd4f6af47e7..02170c00c413731c91e2f76f6d82f3a09582e053 100644 (file)
@@ -108,7 +108,8 @@ out_enable:
  * wakeup uses its own tracer function to keep the overhead down:
  */
 static void
-wakeup_tracer_call(unsigned long ip, unsigned long parent_ip)
+wakeup_tracer_call(unsigned long ip, unsigned long parent_ip,
+                  struct ftrace_ops *op, struct pt_regs *pt_regs)
 {
        struct trace_array *tr = wakeup_trace;
        struct trace_array_cpu *data;
@@ -129,7 +130,7 @@ wakeup_tracer_call(unsigned long ip, unsigned long parent_ip)
 static struct ftrace_ops trace_ops __read_mostly =
 {
        .func = wakeup_tracer_call,
-       .flags = FTRACE_OPS_FL_GLOBAL,
+       .flags = FTRACE_OPS_FL_GLOBAL | FTRACE_OPS_FL_RECURSION_SAFE,
 };
 #endif /* CONFIG_FUNCTION_TRACER */
 
index 288541f977fb7f3b41e747a6b7c2de9e4ea0e1e9..2c00a691a54068ec9db750f2be4053701bd5be65 100644 (file)
@@ -103,54 +103,67 @@ static inline void warn_failed_init_tracer(struct tracer *trace, int init_ret)
 
 static int trace_selftest_test_probe1_cnt;
 static void trace_selftest_test_probe1_func(unsigned long ip,
-                                           unsigned long pip)
+                                           unsigned long pip,
+                                           struct ftrace_ops *op,
+                                           struct pt_regs *pt_regs)
 {
        trace_selftest_test_probe1_cnt++;
 }
 
 static int trace_selftest_test_probe2_cnt;
 static void trace_selftest_test_probe2_func(unsigned long ip,
-                                           unsigned long pip)
+                                           unsigned long pip,
+                                           struct ftrace_ops *op,
+                                           struct pt_regs *pt_regs)
 {
        trace_selftest_test_probe2_cnt++;
 }
 
 static int trace_selftest_test_probe3_cnt;
 static void trace_selftest_test_probe3_func(unsigned long ip,
-                                           unsigned long pip)
+                                           unsigned long pip,
+                                           struct ftrace_ops *op,
+                                           struct pt_regs *pt_regs)
 {
        trace_selftest_test_probe3_cnt++;
 }
 
 static int trace_selftest_test_global_cnt;
 static void trace_selftest_test_global_func(unsigned long ip,
-                                           unsigned long pip)
+                                           unsigned long pip,
+                                           struct ftrace_ops *op,
+                                           struct pt_regs *pt_regs)
 {
        trace_selftest_test_global_cnt++;
 }
 
 static int trace_selftest_test_dyn_cnt;
 static void trace_selftest_test_dyn_func(unsigned long ip,
-                                        unsigned long pip)
+                                        unsigned long pip,
+                                        struct ftrace_ops *op,
+                                        struct pt_regs *pt_regs)
 {
        trace_selftest_test_dyn_cnt++;
 }
 
 static struct ftrace_ops test_probe1 = {
        .func                   = trace_selftest_test_probe1_func,
+       .flags                  = FTRACE_OPS_FL_RECURSION_SAFE,
 };
 
 static struct ftrace_ops test_probe2 = {
        .func                   = trace_selftest_test_probe2_func,
+       .flags                  = FTRACE_OPS_FL_RECURSION_SAFE,
 };
 
 static struct ftrace_ops test_probe3 = {
        .func                   = trace_selftest_test_probe3_func,
+       .flags                  = FTRACE_OPS_FL_RECURSION_SAFE,
 };
 
 static struct ftrace_ops test_global = {
-       .func                   = trace_selftest_test_global_func,
-       .flags                  = FTRACE_OPS_FL_GLOBAL,
+       .func           = trace_selftest_test_global_func,
+       .flags          = FTRACE_OPS_FL_GLOBAL | FTRACE_OPS_FL_RECURSION_SAFE,
 };
 
 static void print_counts(void)
@@ -393,10 +406,253 @@ int trace_selftest_startup_dynamic_tracing(struct tracer *trace,
 
        return ret;
 }
+
+static int trace_selftest_recursion_cnt;
+static void trace_selftest_test_recursion_func(unsigned long ip,
+                                              unsigned long pip,
+                                              struct ftrace_ops *op,
+                                              struct pt_regs *pt_regs)
+{
+       /*
+        * This function is registered without the recursion safe flag.
+        * The ftrace infrastructure should provide the recursion
+        * protection. If not, this will crash the kernel!
+        */
+       trace_selftest_recursion_cnt++;
+       DYN_FTRACE_TEST_NAME();
+}
+
+static void trace_selftest_test_recursion_safe_func(unsigned long ip,
+                                                   unsigned long pip,
+                                                   struct ftrace_ops *op,
+                                                   struct pt_regs *pt_regs)
+{
+       /*
+        * We said we would provide our own recursion protection. By
+        * calling this function again, we should recurse back into this
+        * function and count again. But this only happens if the arch
+        * supports all ftrace features and nothing else is using the
+        * function tracing utility.
+        */
+       if (trace_selftest_recursion_cnt++)
+               return;
+       DYN_FTRACE_TEST_NAME();
+}
+
+static struct ftrace_ops test_rec_probe = {
+       .func                   = trace_selftest_test_recursion_func,
+};
+
+static struct ftrace_ops test_recsafe_probe = {
+       .func                   = trace_selftest_test_recursion_safe_func,
+       .flags                  = FTRACE_OPS_FL_RECURSION_SAFE,
+};
+
+static int
+trace_selftest_function_recursion(void)
+{
+       int save_ftrace_enabled = ftrace_enabled;
+       int save_tracer_enabled = tracer_enabled;
+       char *func_name;
+       int len;
+       int ret;
+       int cnt;
+
+       /* The previous test PASSED */
+       pr_cont("PASSED\n");
+       pr_info("Testing ftrace recursion: ");
+
+       /* enable tracing, and record the filter function */
+       ftrace_enabled = 1;
+       tracer_enabled = 1;
+
+       /* Handle PPC64 '.' name */
+       func_name = "*" __stringify(DYN_FTRACE_TEST_NAME);
+       len = strlen(func_name);
+
+       ret = ftrace_set_filter(&test_rec_probe, func_name, len, 1);
+       if (ret) {
+               pr_cont("*Could not set filter* ");
+               goto out;
+       }
+
+       ret = register_ftrace_function(&test_rec_probe);
+       if (ret) {
+               pr_cont("*could not register callback* ");
+               goto out;
+       }
+
+       DYN_FTRACE_TEST_NAME();
+
+       unregister_ftrace_function(&test_rec_probe);
+
+       ret = -1;
+       if (trace_selftest_recursion_cnt != 1) {
+               pr_cont("*callback not called once (%d)* ",
+                       trace_selftest_recursion_cnt);
+               goto out;
+       }
+
+       trace_selftest_recursion_cnt = 1;
+
+       pr_cont("PASSED\n");
+       pr_info("Testing ftrace recursion safe: ");
+
+       ret = ftrace_set_filter(&test_recsafe_probe, func_name, len, 1);
+       if (ret) {
+               pr_cont("*Could not set filter* ");
+               goto out;
+       }
+
+       ret = register_ftrace_function(&test_recsafe_probe);
+       if (ret) {
+               pr_cont("*could not register callback* ");
+               goto out;
+       }
+
+       DYN_FTRACE_TEST_NAME();
+
+       unregister_ftrace_function(&test_recsafe_probe);
+
+       /*
+        * If the arch supports all ftrace features, and no other
+        * ftrace_ops was on the list, we should be fine.
+        */
+       if (!ftrace_nr_registered_ops() && !FTRACE_FORCE_LIST_FUNC)
+               cnt = 2; /* Should have recursed */
+       else
+               cnt = 1;
+
+       ret = -1;
+       if (trace_selftest_recursion_cnt != cnt) {
+               pr_cont("*callback not called expected %d times (%d)* ",
+                       cnt, trace_selftest_recursion_cnt);
+               goto out;
+       }
+
+       ret = 0;
+out:
+       ftrace_enabled = save_ftrace_enabled;
+       tracer_enabled = save_tracer_enabled;
+
+       return ret;
+}
 #else
 # define trace_selftest_startup_dynamic_tracing(trace, tr, func) ({ 0; })
+# define trace_selftest_function_recursion() ({ 0; })
 #endif /* CONFIG_DYNAMIC_FTRACE */
 
+static enum {
+       TRACE_SELFTEST_REGS_START,
+       TRACE_SELFTEST_REGS_FOUND,
+       TRACE_SELFTEST_REGS_NOT_FOUND,
+} trace_selftest_regs_stat;
+
+static void trace_selftest_test_regs_func(unsigned long ip,
+                                         unsigned long pip,
+                                         struct ftrace_ops *op,
+                                         struct pt_regs *pt_regs)
+{
+       if (pt_regs)
+               trace_selftest_regs_stat = TRACE_SELFTEST_REGS_FOUND;
+       else
+               trace_selftest_regs_stat = TRACE_SELFTEST_REGS_NOT_FOUND;
+}
+
+static struct ftrace_ops test_regs_probe = {
+       .func           = trace_selftest_test_regs_func,
+       .flags          = FTRACE_OPS_FL_RECURSION_SAFE | FTRACE_OPS_FL_SAVE_REGS,
+};
+
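The selftest that follows exercises the fallback from full saved regs to best-effort regs. As a general pattern (register_with_regs is a hypothetical helper, sketched under the assumption that registration fails when the arch cannot save full regs):

    static int register_with_regs(struct ftrace_ops *ops)
    {
            int ret;

            ops->flags |= FTRACE_OPS_FL_SAVE_REGS;
            ret = register_ftrace_function(ops);
            if (ret) {
                    /* arch cannot save full regs: accept NULL regs */
                    ops->flags |= FTRACE_OPS_FL_SAVE_REGS_IF_SUPPORTED;
                    ret = register_ftrace_function(ops);
            }
            return ret;
    }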
+static int
+trace_selftest_function_regs(void)
+{
+       int save_ftrace_enabled = ftrace_enabled;
+       int save_tracer_enabled = tracer_enabled;
+       char *func_name;
+       int len;
+       int ret;
+       int supported = 0;
+
+#ifdef ARCH_SUPPORTS_FTRACE_SAVE_REGS
+       supported = 1;
+#endif
+
+       /* The previous test PASSED */
+       pr_cont("PASSED\n");
+       pr_info("Testing ftrace regs%s: ",
+               !supported ? " (no arch support)" : "");
+
+       /* enable tracing, and record the filter function */
+       ftrace_enabled = 1;
+       tracer_enabled = 1;
+
+       /* Handle PPC64 '.' name */
+       func_name = "*" __stringify(DYN_FTRACE_TEST_NAME);
+       len = strlen(func_name);
+
+       ret = ftrace_set_filter(&test_regs_probe, func_name, len, 1);
+       /*
+        * If DYNAMIC_FTRACE is not set, then we just trace all functions.
+        * This test really doesn't care.
+        */
+       if (ret && ret != -ENODEV) {
+               pr_cont("*Could not set filter* ");
+               goto out;
+       }
+
+       ret = register_ftrace_function(&test_regs_probe);
+       /*
+        * Now if the arch does not support passing regs, then this should
+        * have failed.
+        */
+       if (!supported) {
+               if (!ret) {
+                       pr_cont("*registered save-regs without arch support* ");
+                       goto out;
+               }
+               test_regs_probe.flags |= FTRACE_OPS_FL_SAVE_REGS_IF_SUPPORTED;
+               ret = register_ftrace_function(&test_regs_probe);
+       }
+       if (ret) {
+               pr_cont("*could not register callback* ");
+               goto out;
+       }
+
+       DYN_FTRACE_TEST_NAME();
+
+       unregister_ftrace_function(&test_regs_probe);
+
+       ret = -1;
+
+       switch (trace_selftest_regs_stat) {
+       case TRACE_SELFTEST_REGS_START:
+               pr_cont("*callback never called* ");
+               goto out;
+
+       case TRACE_SELFTEST_REGS_FOUND:
+               if (supported)
+                       break;
+               pr_cont("*callback received regs without arch support* ");
+               goto out;
+
+       case TRACE_SELFTEST_REGS_NOT_FOUND:
+               if (!supported)
+                       break;
+               pr_cont("*callback received NULL regs* ");
+               goto out;
+       }
+
+       ret = 0;
+out:
+       ftrace_enabled = save_ftrace_enabled;
+       tracer_enabled = save_tracer_enabled;
+
+       return ret;
+}
+
 /*
  * Simple verification test of ftrace function tracer.
  * Enable ftrace, sleep 1/10 second, and then read the trace
@@ -442,7 +698,14 @@ trace_selftest_startup_function(struct tracer *trace, struct trace_array *tr)
 
        ret = trace_selftest_startup_dynamic_tracing(trace, tr,
                                                     DYN_FTRACE_TEST_NAME);
+       if (ret)
+               goto out;
 
+       ret = trace_selftest_function_recursion();
+       if (ret)
+               goto out;
+
+       ret = trace_selftest_function_regs();
  out:
        ftrace_enabled = save_ftrace_enabled;
        tracer_enabled = save_tracer_enabled;
@@ -778,6 +1041,8 @@ static int trace_wakeup_test_thread(void *data)
        set_current_state(TASK_INTERRUPTIBLE);
        schedule();
 
+       complete(x);
+
        /* we are awake, now wait to disappear */
        while (!kthread_should_stop()) {
                /*
@@ -821,24 +1086,21 @@ trace_selftest_startup_wakeup(struct tracer *trace, struct trace_array *tr)
        /* reset the max latency */
        tracing_max_latency = 0;
 
-       /* sleep to let the RT thread sleep too */
-       msleep(100);
+       while (p->on_rq) {
+               /*
+                * Sleep to make sure the RT thread is asleep too.
+                * On virtual machines we can't rely on timings,
+                * but we want to make sure this test still works.
+                */
+               msleep(100);
+       }
 
-       /*
-        * Yes this is slightly racy. It is possible that for some
-        * strange reason that the RT thread we created, did not
-        * call schedule for 100ms after doing the completion,
-        * and we do a wakeup on a task that already is awake.
-        * But that is extremely unlikely, and the worst thing that
-        * happens in such a case, is that we disable tracing.
-        * Honestly, if this race does happen something is horrible
-        * wrong with the system.
-        */
+       init_completion(&isrt);
 
        wake_up_process(p);
 
-       /* give a little time to let the thread wake up */
-       msleep(100);
+       /* Wait for the task to wake up */
+       wait_for_completion(&isrt);
 
        /* stop the tracing. */
        tracing_stop();
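The change above replaces timing guesses with an explicit handshake; a condensed sketch of the completion pattern (worker and done are hypothetical names):

    static DECLARE_COMPLETION(done);

    static int worker(void *data)
    {
            set_current_state(TASK_INTERRUPTIBLE);
            schedule();                     /* sleep until woken */
            complete(&done);                /* confirm we really ran */
            /* ... */
            return 0;
    }

    /* Waker side, mirroring the selftest:
     *      init_completion(&done);         re-arm before reuse
     *      wake_up_process(task);
     *      wait_for_completion(&done);     no arbitrary msleep()
     */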
index d4545f49242e2baa96a46b01f528f5fe94445742..0c1b165778e56cb81b87a62d9d163b6250d21a3d 100644 (file)
@@ -111,7 +111,8 @@ static inline void check_stack(void)
 }
 
 static void
-stack_trace_call(unsigned long ip, unsigned long parent_ip)
+stack_trace_call(unsigned long ip, unsigned long parent_ip,
+                struct ftrace_ops *op, struct pt_regs *pt_regs)
 {
        int cpu;
 
@@ -136,6 +137,7 @@ stack_trace_call(unsigned long ip, unsigned long parent_ip)
 static struct ftrace_ops trace_ops __read_mostly =
 {
        .func = stack_trace_call,
+       .flags = FTRACE_OPS_FL_RECURSION_SAFE,
 };
 
 static ssize_t
index 4b1dfba70f7cf8ae7397623656a9b695028f702a..9d4c8d5a1f538b25b6698483ed37080ecb7652fc 100644 (file)
@@ -22,6 +22,7 @@
 #include <linux/notifier.h>
 #include <linux/module.h>
 #include <linux/sysctl.h>
+#include <linux/smpboot.h>
 
 #include <asm/irq_regs.h>
 #include <linux/kvm_para.h>
 
 int watchdog_enabled = 1;
 int __read_mostly watchdog_thresh = 10;
+static int __read_mostly watchdog_disabled;
 
 static DEFINE_PER_CPU(unsigned long, watchdog_touch_ts);
 static DEFINE_PER_CPU(struct task_struct *, softlockup_watchdog);
 static DEFINE_PER_CPU(struct hrtimer, watchdog_hrtimer);
 static DEFINE_PER_CPU(bool, softlockup_touch_sync);
 static DEFINE_PER_CPU(bool, soft_watchdog_warn);
+static DEFINE_PER_CPU(unsigned long, hrtimer_interrupts);
+static DEFINE_PER_CPU(unsigned long, soft_lockup_hrtimer_cnt);
 #ifdef CONFIG_HARDLOCKUP_DETECTOR
 static DEFINE_PER_CPU(bool, hard_watchdog_warn);
 static DEFINE_PER_CPU(bool, watchdog_nmi_touch);
-static DEFINE_PER_CPU(unsigned long, hrtimer_interrupts);
 static DEFINE_PER_CPU(unsigned long, hrtimer_interrupts_saved);
 static DEFINE_PER_CPU(struct perf_event *, watchdog_ev);
 #endif
@@ -248,13 +251,15 @@ static void watchdog_overflow_callback(struct perf_event *event,
        __this_cpu_write(hard_watchdog_warn, false);
        return;
 }
+#endif /* CONFIG_HARDLOCKUP_DETECTOR */
+
 static void watchdog_interrupt_count(void)
 {
        __this_cpu_inc(hrtimer_interrupts);
 }
-#else
-static inline void watchdog_interrupt_count(void) { return; }
-#endif /* CONFIG_HARDLOCKUP_DETECTOR */
+
+static int watchdog_nmi_enable(unsigned int cpu);
+static void watchdog_nmi_disable(unsigned int cpu);
 
 /* watchdog kicker functions */
 static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer)
@@ -327,49 +332,68 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer)
        return HRTIMER_RESTART;
 }
 
+static void watchdog_set_prio(unsigned int policy, unsigned int prio)
+{
+       struct sched_param param = { .sched_priority = prio };
 
-/*
- * The watchdog thread - touches the timestamp.
- */
-static int watchdog(void *unused)
+       sched_setscheduler(current, policy, &param);
+}
+
+static void watchdog_enable(unsigned int cpu)
 {
-       struct sched_param param = { .sched_priority = 0 };
        struct hrtimer *hrtimer = &__raw_get_cpu_var(watchdog_hrtimer);
 
-       /* initialize timestamp */
-       __touch_watchdog();
+       if (!watchdog_enabled) {
+               kthread_park(current);
+               return;
+       }
+
+       /* Enable the perf event */
+       watchdog_nmi_enable(cpu);
 
        /* kick off the timer for the hardlockup detector */
+       hrtimer_init(hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
+       hrtimer->function = watchdog_timer_fn;
+
        /* done here because hrtimer_start can only pin to smp_processor_id() */
        hrtimer_start(hrtimer, ns_to_ktime(get_sample_period()),
                      HRTIMER_MODE_REL_PINNED);
 
-       set_current_state(TASK_INTERRUPTIBLE);
-       /*
-        * Run briefly (kicked by the hrtimer callback function) once every
-        * get_sample_period() seconds (4 seconds by default) to reset the
-        * softlockup timestamp. If this gets delayed for more than
-        * 2*watchdog_thresh seconds then the debug-printout triggers in
-        * watchdog_timer_fn().
-        */
-       while (!kthread_should_stop()) {
-               __touch_watchdog();
-               schedule();
+       /* initialize timestamp */
+       watchdog_set_prio(SCHED_FIFO, MAX_RT_PRIO - 1);
+       __touch_watchdog();
+}
 
-               if (kthread_should_stop())
-                       break;
+static void watchdog_disable(unsigned int cpu)
+{
+       struct hrtimer *hrtimer = &__raw_get_cpu_var(watchdog_hrtimer);
 
-               set_current_state(TASK_INTERRUPTIBLE);
-       }
-       /*
-        * Drop the policy/priority elevation during thread exit to avoid a
-        * scheduling latency spike.
-        */
-       __set_current_state(TASK_RUNNING);
-       sched_setscheduler(current, SCHED_NORMAL, &param);
-       return 0;
+       watchdog_set_prio(SCHED_NORMAL, 0);
+       hrtimer_cancel(hrtimer);
+       /* disable the perf event */
+       watchdog_nmi_disable(cpu);
 }
 
+static int watchdog_should_run(unsigned int cpu)
+{
+       return __this_cpu_read(hrtimer_interrupts) !=
+               __this_cpu_read(soft_lockup_hrtimer_cnt);
+}
+
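For orientation, a rough sketch (an assumption, not the literal implementation) of how the smpboot core drives these two hooks per CPU:

    /*
     *  for (;;) {
     *          if (kthread_should_park())
     *                  kthread_parkme();
     *          if (!ht->thread_should_run(cpu))
     *                  schedule();          // sleep until kicked
     *          else
     *                  ht->thread_fn(cpu);  // here: watchdog(cpu)
     *  }
     */

So watchdog() only runs after the hrtimer callback has advanced hrtimer_interrupts past soft_lockup_hrtimer_cnt.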
+/*
+ * The watchdog thread function - touches the timestamp.
+ *
+ * It only runs once every get_sample_period() seconds (4 seconds by
+ * default) to reset the softlockup timestamp. If this gets delayed
+ * for more than 2*watchdog_thresh seconds then the debug-printout
+ * triggers in watchdog_timer_fn().
+ */
+static void watchdog(unsigned int cpu)
+{
+       __this_cpu_write(soft_lockup_hrtimer_cnt,
+                        __this_cpu_read(hrtimer_interrupts));
+       __touch_watchdog();
+}
 
 #ifdef CONFIG_HARDLOCKUP_DETECTOR
 /*
@@ -379,7 +403,7 @@ static int watchdog(void *unused)
  */
 static unsigned long cpu0_err;
 
-static int watchdog_nmi_enable(int cpu)
+static int watchdog_nmi_enable(unsigned int cpu)
 {
        struct perf_event_attr *wd_attr;
        struct perf_event *event = per_cpu(watchdog_ev, cpu);
@@ -433,7 +457,7 @@ out:
        return 0;
 }
 
-static void watchdog_nmi_disable(int cpu)
+static void watchdog_nmi_disable(unsigned int cpu)
 {
        struct perf_event *event = per_cpu(watchdog_ev, cpu);
 
@@ -447,107 +471,35 @@ static void watchdog_nmi_disable(int cpu)
        return;
 }
 #else
-static int watchdog_nmi_enable(int cpu) { return 0; }
-static void watchdog_nmi_disable(int cpu) { return; }
+static int watchdog_nmi_enable(unsigned int cpu) { return 0; }
+static void watchdog_nmi_disable(unsigned int cpu) { return; }
 #endif /* CONFIG_HARDLOCKUP_DETECTOR */
 
 /* prepare/enable/disable routines */
-static void watchdog_prepare_cpu(int cpu)
-{
-       struct hrtimer *hrtimer = &per_cpu(watchdog_hrtimer, cpu);
-
-       WARN_ON(per_cpu(softlockup_watchdog, cpu));
-       hrtimer_init(hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
-       hrtimer->function = watchdog_timer_fn;
-}
-
-static int watchdog_enable(int cpu)
-{
-       struct task_struct *p = per_cpu(softlockup_watchdog, cpu);
-       int err = 0;
-
-       /* enable the perf event */
-       err = watchdog_nmi_enable(cpu);
-
-       /* Regardless of err above, fall through and start softlockup */
-
-       /* create the watchdog thread */
-       if (!p) {
-               struct sched_param param = { .sched_priority = MAX_RT_PRIO-1 };
-               p = kthread_create_on_node(watchdog, NULL, cpu_to_node(cpu), "watchdog/%d", cpu);
-               if (IS_ERR(p)) {
-                       pr_err("softlockup watchdog for %i failed\n", cpu);
-                       if (!err) {
-                               /* if hardlockup hasn't already set this */
-                               err = PTR_ERR(p);
-                               /* and disable the perf event */
-                               watchdog_nmi_disable(cpu);
-                       }
-                       goto out;
-               }
-               sched_setscheduler(p, SCHED_FIFO, &param);
-               kthread_bind(p, cpu);
-               per_cpu(watchdog_touch_ts, cpu) = 0;
-               per_cpu(softlockup_watchdog, cpu) = p;
-               wake_up_process(p);
-       }
-
-out:
-       return err;
-}
-
-static void watchdog_disable(int cpu)
-{
-       struct task_struct *p = per_cpu(softlockup_watchdog, cpu);
-       struct hrtimer *hrtimer = &per_cpu(watchdog_hrtimer, cpu);
-
-       /*
-        * cancel the timer first to stop incrementing the stats
-        * and waking up the kthread
-        */
-       hrtimer_cancel(hrtimer);
-
-       /* disable the perf event */
-       watchdog_nmi_disable(cpu);
-
-       /* stop the watchdog thread */
-       if (p) {
-               per_cpu(softlockup_watchdog, cpu) = NULL;
-               kthread_stop(p);
-       }
-}
-
 /* sysctl functions */
 #ifdef CONFIG_SYSCTL
 static void watchdog_enable_all_cpus(void)
 {
-       int cpu;
-
-       watchdog_enabled = 0;
-
-       for_each_online_cpu(cpu)
-               if (!watchdog_enable(cpu))
-                       /* if any cpu succeeds, watchdog is considered
-                          enabled for the system */
-                       watchdog_enabled = 1;
-
-       if (!watchdog_enabled)
-               pr_err("failed to be enabled on some cpus\n");
+       unsigned int cpu;
 
+       if (watchdog_disabled) {
+               watchdog_disabled = 0;
+               for_each_online_cpu(cpu)
+                       kthread_unpark(per_cpu(softlockup_watchdog, cpu));
+       }
 }
 
 static void watchdog_disable_all_cpus(void)
 {
-       int cpu;
-
-       for_each_online_cpu(cpu)
-               watchdog_disable(cpu);
+       unsigned int cpu;
 
-       /* if all watchdogs are disabled, then they are disabled for the system */
-       watchdog_enabled = 0;
+       if (!watchdog_disabled) {
+               watchdog_disabled = 1;
+               for_each_online_cpu(cpu)
+                       kthread_park(per_cpu(softlockup_watchdog, cpu));
+       }
 }
 
-
 /*
  * proc handler for /proc/sys/kernel/nmi_watchdog,watchdog_thresh
  */
@@ -557,73 +509,36 @@ int proc_dowatchdog(struct ctl_table *table, int write,
 {
        int ret;
 
+       if (watchdog_disabled < 0)
+               return -ENODEV;
+
        ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
        if (ret || !write)
-               goto out;
+               return ret;
 
        if (watchdog_enabled && watchdog_thresh)
                watchdog_enable_all_cpus();
        else
                watchdog_disable_all_cpus();
 
-out:
        return ret;
 }
 #endif /* CONFIG_SYSCTL */
 
-
-/*
- * Create/destroy watchdog threads as CPUs come and go:
- */
-static int __cpuinit
-cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu)
-{
-       int hotcpu = (unsigned long)hcpu;
-
-       switch (action) {
-       case CPU_UP_PREPARE:
-       case CPU_UP_PREPARE_FROZEN:
-               watchdog_prepare_cpu(hotcpu);
-               break;
-       case CPU_ONLINE:
-       case CPU_ONLINE_FROZEN:
-               if (watchdog_enabled)
-                       watchdog_enable(hotcpu);
-               break;
-#ifdef CONFIG_HOTPLUG_CPU
-       case CPU_UP_CANCELED:
-       case CPU_UP_CANCELED_FROZEN:
-               watchdog_disable(hotcpu);
-               break;
-       case CPU_DEAD:
-       case CPU_DEAD_FROZEN:
-               watchdog_disable(hotcpu);
-               break;
-#endif /* CONFIG_HOTPLUG_CPU */
-       }
-
-       /*
-        * hardlockup and softlockup are not important enough
-        * to block cpu bring up.  Just always succeed and
-        * rely on printk output to flag problems.
-        */
-       return NOTIFY_OK;
-}
-
-static struct notifier_block __cpuinitdata cpu_nfb = {
-       .notifier_call = cpu_callback
+static struct smp_hotplug_thread watchdog_threads = {
+       .store                  = &softlockup_watchdog,
+       .thread_should_run      = watchdog_should_run,
+       .thread_fn              = watchdog,
+       .thread_comm            = "watchdog/%u",
+       .setup                  = watchdog_enable,
+       .park                   = watchdog_disable,
+       .unpark                 = watchdog_enable,
 };
 
 void __init lockup_detector_init(void)
 {
-       void *cpu = (void *)(long)smp_processor_id();
-       int err;
-
-       err = cpu_callback(&cpu_nfb, CPU_UP_PREPARE, cpu);
-       WARN_ON(notifier_to_errno(err));
-
-       cpu_callback(&cpu_nfb, CPU_ONLINE, cpu);
-       register_cpu_notifier(&cpu_nfb);
-
-       return;
+       if (smpboot_register_percpu_thread(&watchdog_threads)) {
+               pr_err("Failed to create watchdog threads, disabled\n");
+               watchdog_disabled = -ENODEV;
+       }
 }
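For reference, a minimal consumer of the smp_hotplug_thread API this series introduces (demo_* names are hypothetical; the setup/park/unpark hooks used by the watchdog above are simply left unset here):

    static DEFINE_PER_CPU(struct task_struct *, demo_task);
    static DEFINE_PER_CPU(unsigned long, demo_work);

    static int demo_should_run(unsigned int cpu)
    {
            return __this_cpu_read(demo_work) != 0;
    }

    static void demo_fn(unsigned int cpu)
    {
            __this_cpu_write(demo_work, 0);
            /* per-cpu work goes here */
    }

    static struct smp_hotplug_thread demo_threads = {
            .store             = &demo_task,
            .thread_should_run = demo_should_run,
            .thread_fn         = demo_fn,
            .thread_comm       = "demo/%u",
    };

    static int __init demo_init(void)
    {
            return smpboot_register_percpu_thread(&demo_threads);
    }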
index 77d53999ffb90f51bea3f0395be08e495e3b75b8..1c932a2c1d83f488ea35a964c90d8bfd5f99dde7 100644 (file)
@@ -3,7 +3,7 @@
 # These targets are used from top-level makefile
 
 PHONY += oldconfig xconfig gconfig menuconfig config silentoldconfig update-po-config \
-       localmodconfig localyesconfig
+       localmodconfig localyesconfig kvmconfig
 
 ifdef KBUILD_KCONFIG
 Kconfig := $(KBUILD_KCONFIG)
@@ -33,6 +33,11 @@ silentoldconfig: $(obj)/conf
        $(Q)mkdir -p include/generated
        $< --$@ $(Kconfig)
 
+kvmconfig:
+       $(Q)$(CONFIG_SHELL) $(srctree)/scripts/config -e KVMTOOL_TEST_ENABLE
+       $(Q)yes "" | $(MAKE) oldconfig > /dev/null
+       @echo 'Kernel configuration modified to run as KVM guest.'
+
 localyesconfig localmodconfig: $(obj)/streamline_config.pl $(obj)/conf
        $(Q)mkdir -p include/generated
        $(Q)perl $< --$@ $(srctree) $(Kconfig) > .tmp.config
index 54e35c1e5948c521f9fdebf57c27072b6d844f34..9d1421e63ff82939b249382b17ab886ce4c4ccc9 100644 (file)
@@ -261,11 +261,13 @@ static unsigned get_mcountsym(Elf_Sym const *const sym0,
                &sym0[Elf_r_sym(relp)];
        char const *symname = &str0[w(symp->st_name)];
        char const *mcount = gpfx == '_' ? "_mcount" : "mcount";
+       char const *fentry = "__fentry__";
 
        if (symname[0] == '.')
                ++symname;  /* ppc64 hack */
        if (strcmp(mcount, symname) == 0 ||
-           (altmcount && strcmp(altmcount, symname) == 0))
+           (altmcount && strcmp(altmcount, symname) == 0) ||
+           (strcmp(fentry, symname) == 0))
                mcountsym = Elf_r_sym(relp);
 
        return mcountsym;
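Background on why __fentry__ must be matched here (the instruction sequences are an x86-64 sketch, given for illustration):

    /*
     *  -pg:                            -pg -mfentry:
     *      push %rbp                       call __fentry__   <- first insn,
     *      mov  %rsp,%rbp                  push %rbp            before the
     *      call mcount                     mov  %rsp,%rbp       prologue
     */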
diff --git a/tools/kvm/.gitignore b/tools/kvm/.gitignore
new file mode 100644 (file)
index 0000000..60dd6db
--- /dev/null
@@ -0,0 +1,12 @@
+/lkvm
+/vm
+*.o
+*.d
+.cscope
+tags
+include/common-cmds.h
+tests/boot/boot_test.iso
+tests/boot/rootfs/
+guest/init
+guest/init_stage2
+KVMTOOLS-VERSION-FILE
diff --git a/tools/kvm/CREDITS-Git b/tools/kvm/CREDITS-Git
new file mode 100644 (file)
index 0000000..c2ddcb3
--- /dev/null
@@ -0,0 +1,30 @@
+Most of the infrastructure that 'perf' uses here has been reused
+from the Git project, as of version:
+
+    66996ec: Sync with 1.6.2.4
+
+Here is an (incomplete!) list of main contributors to those files
+in util/* and elsewhere:
+
+ Alex Riesen
+ Christian Couder
+ Dmitry Potapov
+ Jeff King
+ Johannes Schindelin
+ Johannes Sixt
+ Junio C Hamano
+ Linus Torvalds
+ Matthias Kestenholz
+ Michal Ostrowski
+ Miklos Vajna
+ Petr Baudis
+ Pierre Habouzit
+ René Scharfe
+ Samuel Tardieu
+ Shawn O. Pearce
+ Steffen Prohaska
+ Steve Haslam
+
+Thanks guys!
+
+The full history of the files can be found in the upstream Git commits.
diff --git a/tools/kvm/Documentation/kernel-debugging.txt b/tools/kvm/Documentation/kernel-debugging.txt
new file mode 100644 (file)
index 0000000..98b9438
--- /dev/null
@@ -0,0 +1,15 @@
+This document explains how to debug a guest's kernel using KGDB.
+
+1. Run the guest:
+        'lkvm run -k [vmlinuz] -p "kgdboc=ttyS1 kgdbwait" --tty 1'
+
+Note which PTY was assigned to ttyS1 (you'll see:
+'  Info: Assigned terminal 1 to pty /dev/pts/X').
+
+2. Run GDB on the host:
+        'gdb [vmlinuz]'
+
+3. Connect to the guest (from within GDB):
+        'target remote /dev/pts/X'
+
+4. Start debugging! (enter 'continue' to continue boot).
diff --git a/tools/kvm/Documentation/kvm-balloon.txt b/tools/kvm/Documentation/kvm-balloon.txt
new file mode 100644 (file)
index 0000000..efc0a87
--- /dev/null
@@ -0,0 +1,24 @@
+lkvm-balloon(1)
+================
+
+NAME
+----
+lkvm-balloon - Inflate or deflate the virtio balloon
+
+SYNOPSIS
+--------
+[verse]
+'lkvm balloon [command] [size] [instance]'
+
+DESCRIPTION
+-----------
+The command inflates or deflates the virtio balloon located in the
+specified instance.
+For a list of running instances see 'lkvm list'.
+
+Command can be either 'inflate' or 'deflate'. Inflate increases the
+size of the balloon, thus decreasing the amount of virtual RAM available
+for the guest. Deflation returns previously inflated memory back to the
+guest.
+
+The size is specified in MB.
diff --git a/tools/kvm/Documentation/kvm-debug.txt b/tools/kvm/Documentation/kvm-debug.txt
new file mode 100644 (file)
index 0000000..a8eb2c0
--- /dev/null
@@ -0,0 +1,16 @@
+lkvm-debug(1)
+================
+
+NAME
+----
+lkvm-debug - Print debug information from a running instance
+
+SYNOPSIS
+--------
+[verse]
+'lkvm debug [instance]'
+
+DESCRIPTION
+-----------
+The command prints debug information from a running instance.
+For a list of running instances see 'lkvm list'.
diff --git a/tools/kvm/Documentation/kvm-list.txt b/tools/kvm/Documentation/kvm-list.txt
new file mode 100644 (file)
index 0000000..a245607
--- /dev/null
@@ -0,0 +1,16 @@
+lkvm-list(1)
+================
+
+NAME
+----
+lkvm-list - Print a list of running instances on the host.
+
+SYNOPSIS
+--------
+[verse]
+'lkvm list'
+
+DESCRIPTION
+-----------
+This command prints a list of running instances on the host that
+belong to the user running 'lkvm list'.
diff --git a/tools/kvm/Documentation/kvm-pause.txt b/tools/kvm/Documentation/kvm-pause.txt
new file mode 100644 (file)
index 0000000..1ea2a23
--- /dev/null
@@ -0,0 +1,16 @@
+lkvm-pause(1)
+================
+
+NAME
+----
+lkvm-pause - Pause the virtual machine
+
+SYNOPSIS
+--------
+[verse]
+'lkvm pause [instance]'
+
+DESCRIPTION
+-----------
+The command pauses a virtual machine.
+For a list of running instances see 'lkvm list'.
diff --git a/tools/kvm/Documentation/kvm-resume.txt b/tools/kvm/Documentation/kvm-resume.txt
new file mode 100644 (file)
index 0000000..a36c4df
--- /dev/null
@@ -0,0 +1,16 @@
+lkvm-resume(1)
+================
+
+NAME
+----
+lkvm-resume - Resume the virtual machine
+
+SYNOPSIS
+--------
+[verse]
+'lkvm resume [instance]'
+
+DESCRIPTION
+-----------
+The command resumes a virtual machine.
+For a list of running instances see 'lkvm list'.
diff --git a/tools/kvm/Documentation/kvm-run.txt b/tools/kvm/Documentation/kvm-run.txt
new file mode 100644 (file)
index 0000000..8ddf470
--- /dev/null
@@ -0,0 +1,62 @@
+lkvm-run(1)
+================
+
+NAME
+----
+lkvm-run - Start the virtual machine
+
+SYNOPSIS
+--------
+[verse]
+'lkvm run' [-k <kernel image> | --kernel <kernel image>]
+
+DESCRIPTION
+-----------
+The command starts a virtual machine.
+
+OPTIONS
+-------
+-m::
+--mem=::
+       Virtual machine memory size in MiB.
+
+-p::
+--params::
+       Additional kernel command line arguments.
+
+-r::
+--initrd=::
+       Initial RAM disk image.
+
+-k::
+--kernel=::
+       The virtual machine kernel.
+
+--dev=::
+       KVM device file.
+
+-i::
+--image=::
+       A disk image file.
+
+-s::
+--single-step::
+       Enable single stepping.
+
+-g::
+--ioport-debug::
+       Enable ioport debugging.
+
+-c::
+--enable-virtio-console::
+       Enable the virtual IO console.
+
+--cpus::
+       The number of virtual CPUs to run.
+
+--debug::
+       Enable debug messages.
+
+SEE ALSO
+--------
+linkkvm:
diff --git a/tools/kvm/Documentation/kvm-sandbox.txt b/tools/kvm/Documentation/kvm-sandbox.txt
new file mode 100644 (file)
index 0000000..2d7f558
--- /dev/null
@@ -0,0 +1,16 @@
+lkvm-sandbox(1)
+================
+
+NAME
+----
+lkvm-sandbox - Run a command in a sandboxed guest
+
+SYNOPSIS
+--------
+[verse]
+'lkvm sandbox ['lkvm run' arguments] -- [sandboxed command]'
+
+DESCRIPTION
+-----------
+The sandboxed command will run in a guest as part of its init
+command.
diff --git a/tools/kvm/Documentation/kvm-setup.txt b/tools/kvm/Documentation/kvm-setup.txt
new file mode 100644 (file)
index 0000000..4b6e331
--- /dev/null
@@ -0,0 +1,15 @@
+lkvm-setup(1)
+================
+
+NAME
+----
+lkvm-setup - Setup a new virtual machine
+
+SYNOPSIS
+--------
+[verse]
+'lkvm setup <name>'
+
+DESCRIPTION
+-----------
+The command sets up a new virtual machine.
diff --git a/tools/kvm/Documentation/kvm-stat.txt b/tools/kvm/Documentation/kvm-stat.txt
new file mode 100644 (file)
index 0000000..101ce7a
--- /dev/null
@@ -0,0 +1,19 @@
+lkvm-stat(1)
+================
+
+NAME
+----
+lkvm-stat - Print statistics about a running instance
+
+SYNOPSIS
+--------
+[verse]
+'lkvm [command] [-n instance] [-p instance pid] [--all]'
+
+DESCRIPTION
+-----------
+The command prints statistics about a running instance.
+For a list of running instances see 'lkvm list'.
+
+Commands:
+ --memory, -m  Display memory statistics
diff --git a/tools/kvm/Documentation/kvm-stop.txt b/tools/kvm/Documentation/kvm-stop.txt
new file mode 100644 (file)
index 0000000..6e4bc83
--- /dev/null
@@ -0,0 +1,16 @@
+lkvm-stop(1)
+================
+
+NAME
+----
+lkvm-stop - Stop a running instance
+
+SYNOPSIS
+--------
+[verse]
+'lkvm stop [instance]'
+
+DESCRIPTION
+-----------
+The command stops a running instance.
+For a list of running instances see 'lkvm list'.
diff --git a/tools/kvm/Documentation/kvm-version.txt b/tools/kvm/Documentation/kvm-version.txt
new file mode 100644 (file)
index 0000000..41003d2
--- /dev/null
@@ -0,0 +1,21 @@
+lkvm-version(1)
+================
+
+NAME
+----
+lkvm-version - Print the version of the kernel tree that kvm tools
+was built from.
+
+SYNOPSIS
+--------
+[verse]
+'lkvm version'
+
+DESCRIPTION
+-----------
+The command prints the version of the kernel that was used to build
+kvm tools.
+
+Note that the version is not the version of the kernel which is currently
+running on the host, but is the version of the kernel tree from which kvm
+tools was built.
diff --git a/tools/kvm/Documentation/virtio-console.txt b/tools/kvm/Documentation/virtio-console.txt
new file mode 100644 (file)
index 0000000..4a58d56
--- /dev/null
@@ -0,0 +1,41 @@
+General
+--------
+
+virtio-console, as the name implies, is a console over the virtio transport.
+Here is a simple head-to-head comparison of virtio-console vs the regular
+8250 console:
+
+8250 serial console:
+
+ - Requires CONFIG_SERIAL_8250=y and CONFIG_SERIAL_8250_CONSOLE=y kernel configs,
+which are enabled almost everywhere.
+ - Doesn't require guest-side changes.
+ - Compatible with older guests.
+
+virtio-console:
+
+ - Requires CONFIG_VIRTIO_CONSOLE=y (along with all other virtio dependencies),
+which is enabled only in recent kernels (and not in all of them).
+ - Much faster.
+ - Consumes less processing resources.
+ - Requires guest-side changes.
+
+Enabling virtio-console
+------------------------
+
+First, make sure the guest kernel is built with CONFIG_VIRTIO_CONSOLE=y. Once
+this is done, the following has to be done inside the guest image:
+
+ - Add the following line to /etc/inittab:
+       'hvc0:2345:respawn:/sbin/agetty -L 9600 hvc0'
+ - Add 'hvc0' to /etc/securetty (so you could actually log on)
+ - Start the guest with '--console virtio'
+
+Common errors
+--------------
+
+Q: I don't see anything on the screen!
+A: Make sure CONFIG_VIRTIO_CONSOLE=y is enabled in the *guest* kernel, and
+make sure you've updated /etc/inittab.
+
+Q: It won't accept my username/password, but I enter them correctly!
+A: You didn't add 'hvc0' to /etc/securetty.
diff --git a/tools/kvm/Makefile b/tools/kvm/Makefile
new file mode 100644 (file)
index 0000000..0e2fa66
--- /dev/null
@@ -0,0 +1,440 @@
+#
+# Define WERROR=0 to disable -Werror.
+#
+
+ifeq ($(strip $(V)),)
+       E = @echo
+       Q = @
+else
+       E = @\#
+       Q =
+endif
+ifneq ($(I), )
+       KINCL_PATH=$(I)
+else
+       KINCL_PATH=../..
+endif
+export E Q KINCL_PATH
+
+include config/utilities.mak
+include config/feature-tests.mak
+
+CC     := $(CROSS_COMPILE)$(CC)
+LD     := $(CROSS_COMPILE)$(LD)
+
+FIND   := find
+CSCOPE := cscope
+TAGS   := ctags
+INSTALL := install
+
+prefix = $(HOME)
+bindir_relative = bin
+bindir = $(prefix)/$(bindir_relative)
+
+DESTDIR_SQ = $(subst ','\'',$(DESTDIR))
+bindir_SQ = $(subst ','\'',$(bindir))
+
+PROGRAM        := lkvm
+PROGRAM_ALIAS := vm
+
+GUEST_INIT := guest/init
+
+OBJS   += builtin-balloon.o
+OBJS   += builtin-debug.o
+OBJS   += builtin-help.o
+OBJS   += builtin-list.o
+OBJS   += builtin-stat.o
+OBJS   += builtin-pause.o
+OBJS   += builtin-resume.o
+OBJS   += builtin-run.o
+OBJS   += builtin-setup.o
+OBJS   += builtin-stop.o
+OBJS   += builtin-version.o
+OBJS   += disk/core.o
+OBJS   += framebuffer.o
+OBJS   += guest_compat.o
+OBJS   += hw/rtc.o
+OBJS   += hw/serial.o
+OBJS   += ioport.o
+OBJS   += kvm-cpu.o
+OBJS   += kvm.o
+OBJS   += main.o
+OBJS   += mmio.o
+OBJS   += pci.o
+OBJS   += term.o
+OBJS   += virtio/blk.o
+OBJS   += virtio/scsi.o
+OBJS   += virtio/console.o
+OBJS   += virtio/core.o
+OBJS   += virtio/net.o
+OBJS   += virtio/rng.o
+OBJS    += virtio/balloon.o
+OBJS   += virtio/pci.o
+OBJS   += disk/blk.o
+OBJS   += disk/qcow.o
+OBJS   += disk/raw.o
+OBJS   += ioeventfd.o
+OBJS   += net/uip/core.o
+OBJS   += net/uip/arp.o
+OBJS   += net/uip/icmp.o
+OBJS   += net/uip/ipv4.o
+OBJS   += net/uip/tcp.o
+OBJS   += net/uip/udp.o
+OBJS   += net/uip/buf.o
+OBJS   += net/uip/csum.o
+OBJS   += net/uip/dhcp.o
+OBJS   += kvm-cmd.o
+OBJS   += util/rbtree.o
+OBJS   += util/threadpool.o
+OBJS   += util/parse-options.o
+OBJS   += util/rbtree-interval.o
+OBJS   += util/strbuf.o
+OBJS   += util/read-write.o
+OBJS   += util/util.o
+OBJS   += virtio/9p.o
+OBJS   += virtio/9p-pdu.o
+OBJS   += hw/vesa.o
+OBJS   += hw/pci-shmem.o
+OBJS   += kvm-ipc.o
+OBJS   += builtin-sandbox.o
+OBJS   += virtio/mmio.o
+
+# Translate uname -m into ARCH string
+ARCH ?= $(shell uname -m | sed -e s/i.86/i386/ -e s/ppc.*/powerpc/)
+
+ifeq ($(ARCH),i386)
+       ARCH         := x86
+       DEFINES      += -DCONFIG_X86_32
+endif
+ifeq ($(ARCH),x86_64)
+       ARCH         := x86
+       DEFINES      += -DCONFIG_X86_64
+endif
+
+LIBFDT_SRC = fdt.o fdt_ro.o fdt_wip.o fdt_sw.o fdt_rw.o fdt_strerror.o
+LIBFDT_OBJS = $(patsubst %,../../scripts/dtc/libfdt/%,$(LIBFDT_SRC))
+
+### Arch-specific stuff
+
+#x86
+ifeq ($(ARCH),x86)
+       DEFINES += -DCONFIG_X86
+       OBJS    += x86/boot.o
+       OBJS    += x86/cpuid.o
+       OBJS    += x86/interrupt.o
+       OBJS    += x86/ioport.o
+       OBJS    += x86/irq.o
+       OBJS    += x86/kvm.o
+       OBJS    += x86/kvm-cpu.o
+       OBJS    += x86/mptable.o
+       OBJS    += hw/i8042.o
+# Exclude BIOS object files from header dependencies.
+       OTHEROBJS       += x86/bios.o
+       OTHEROBJS       += x86/bios/bios-rom.o
+       ARCH_INCLUDE := x86/include
+endif
+# POWER/ppc: only ppc64 is currently supported.
+ifeq ($(ARCH), powerpc)
+       DEFINES += -DCONFIG_PPC
+       OBJS    += powerpc/boot.o
+       OBJS    += powerpc/ioport.o
+       OBJS    += powerpc/irq.o
+       OBJS    += powerpc/kvm.o
+       OBJS    += powerpc/cpu_info.o
+       OBJS    += powerpc/kvm-cpu.o
+       OBJS    += powerpc/spapr_hcall.o
+       OBJS    += powerpc/spapr_rtas.o
+       OBJS    += powerpc/spapr_hvcons.o
+       OBJS    += powerpc/spapr_pci.o
+       OBJS    += powerpc/xics.o
+# We use libfdt, but it's sometimes not packaged for 64-bit.  It's small,
+# so just build it in:
+       CFLAGS  += -I../../scripts/dtc/libfdt
+       OTHEROBJS       += $(LIBFDT_OBJS)
+       ARCH_INCLUDE := powerpc/include
+       CFLAGS  += -m64
+endif
+
+###
+
+ifeq (,$(ARCH_INCLUDE))
+       UNSUPP_ERR = @echo "This architecture is not supported in kvmtool." && exit 1
+else
+       UNSUPP_ERR =
+endif
+
+###
+
+# Detect optional features.
+# On a given system, some libs may link statically, some may not; so, check
+# both and only build those that link!
+
+FLAGS_BFD := $(CFLAGS) -lbfd
+ifeq ($(call try-cc,$(SOURCE_BFD),$(FLAGS_BFD)),y)
+       CFLAGS_DYNOPT   += -DCONFIG_HAS_BFD
+       OBJS_DYNOPT     += symbol.o
+       LIBS_DYNOPT     += -lbfd
+endif
+ifeq ($(call try-cc,$(SOURCE_BFD),$(FLAGS_BFD) -static),y)
+       CFLAGS_STATOPT  += -DCONFIG_HAS_BFD
+       OBJS_STATOPT    += symbol.o
+       LIBS_STATOPT    += -lbfd
+endif
+
+FLAGS_VNCSERVER := $(CFLAGS) -lvncserver
+ifeq ($(call try-cc,$(SOURCE_VNCSERVER),$(FLAGS_VNCSERVER)),y)
+       OBJS_DYNOPT     += ui/vnc.o
+       CFLAGS_DYNOPT   += -DCONFIG_HAS_VNCSERVER
+       LIBS_DYNOPT     += -lvncserver
+endif
+ifeq ($(call try-cc,$(SOURCE_VNCSERVER),$(FLAGS_VNCSERVER) -static),y)
+       OBJS_STATOPT    += ui/vnc.o
+       CFLAGS_STATOPT  += -DCONFIG_HAS_VNCSERVER
+       LIBS_STATOPT    += -lvncserver
+endif
+
+FLAGS_SDL := $(CFLAGS) -lSDL
+ifeq ($(call try-cc,$(SOURCE_SDL),$(FLAGS_SDL)),y)
+       OBJS_DYNOPT     += ui/sdl.o
+       CFLAGS_DYNOPT   += -DCONFIG_HAS_SDL
+       LIBS_DYNOPT     += -lSDL
+endif
+ifeq ($(call try-cc,$(SOURCE_SDL),$(FLAGS_SDL) -static), y)
+       OBJS_STATOPT    += ui/sdl.o
+       CFLAGS_STATOPT  += -DCONFIG_HAS_SDL
+       LIBS_STATOPT    += -lSDL
+endif
+
+FLAGS_ZLIB := $(CFLAGS) -lz
+ifeq ($(call try-cc,$(SOURCE_ZLIB),$(FLAGS_ZLIB)),y)
+       CFLAGS_DYNOPT   += -DCONFIG_HAS_ZLIB
+       LIBS_DYNOPT     += -lz
+endif
+ifeq ($(call try-cc,$(SOURCE_ZLIB),$(FLAGS_ZLIB) -static),y)
+       CFLAGS_STATOPT  += -DCONFIG_HAS_ZLIB
+       LIBS_STATOPT    += -lz
+endif
+
+FLAGS_AIO := $(CFLAGS) -laio
+ifeq ($(call try-cc,$(SOURCE_AIO),$(FLAGS_AIO)),y)
+       CFLAGS_DYNOPT   += -DCONFIG_HAS_AIO
+       LIBS_DYNOPT     += -laio
+endif
+ifeq ($(call try-cc,$(SOURCE_AIO),$(FLAGS_AIO) -static),y)
+       CFLAGS_STATOPT  += -DCONFIG_HAS_AIO
+       LIBS_STATOPT    += -laio
+endif
+
+ifneq ($(call try-build,$(SOURCE_STATIC),-static,),y)
+$(error No static libc found. Please install glibc-static package.)
+endif
+###
+
+LIBS   += -lrt
+LIBS   += -lpthread
+LIBS   += -lutil
+
+
+DEPS   := $(patsubst %.o,%.d,$(OBJS))
+
+DEFINES        += -D_FILE_OFFSET_BITS=64
+DEFINES        += -D_GNU_SOURCE
+DEFINES        += -DKVMTOOLS_VERSION='"$(KVMTOOLS_VERSION)"'
+DEFINES        += -DBUILD_ARCH='"$(ARCH)"'
+
+KVM_INCLUDE := include
+CFLAGS += $(CPPFLAGS) $(DEFINES) -I$(KVM_INCLUDE) -I$(ARCH_INCLUDE) -I$(KINCL_PATH)/include -I$(KINCL_PATH)/arch/$(ARCH)/include/ -O2 -fno-strict-aliasing -g -flto
+
+WARNINGS += -Wall
+WARNINGS += -Wcast-align
+WARNINGS += -Wformat=2
+WARNINGS += -Winit-self
+WARNINGS += -Wmissing-declarations
+WARNINGS += -Wmissing-prototypes
+WARNINGS += -Wnested-externs
+WARNINGS += -Wno-system-headers
+WARNINGS += -Wold-style-definition
+WARNINGS += -Wredundant-decls
+WARNINGS += -Wsign-compare
+WARNINGS += -Wstrict-prototypes
+WARNINGS += -Wundef
+WARNINGS += -Wvolatile-register-var
+WARNINGS += -Wwrite-strings
+
+CFLAGS += $(WARNINGS)
+
+# Some targets may use 'external' sources that don't build totally cleanly.
+CFLAGS_EASYGOING := $(CFLAGS)
+
+ifneq ($(WERROR),0)
+       CFLAGS += -Werror
+endif
+
+all: arch_support_check $(PROGRAM) $(PROGRAM_ALIAS) $(GUEST_INIT)
+
+arch_support_check:
+       $(UNSUPP_ERR)
+
+KVMTOOLS-VERSION-FILE:
+       @$(SHELL_PATH) util/KVMTOOLS-VERSION-GEN $(OUTPUT)
+-include $(OUTPUT)KVMTOOLS-VERSION-FILE
+
+# When building -static all objects are built with appropriate flags, which
+# may differ between static & dynamic .o.  The objects are separated into
+# .o and .static.o.  See the %.o: %.c rules below.
+#
+# $(OTHEROBJS) are things that do not get substituted like this.
+#
+STATIC_OBJS = $(patsubst %.o,%.static.o,$(OBJS) $(OBJS_STATOPT))
+GUEST_OBJS = guest/guest_init.o
+
+$(PROGRAM)-static:  $(DEPS) $(STATIC_OBJS) $(OTHEROBJS) $(GUEST_INIT)
+       $(E) "  LINK    " $@
+       $(Q) $(CC) -static $(CFLAGS) $(STATIC_OBJS) $(OTHEROBJS) $(GUEST_OBJS) $(LIBS) $(LIBS_STATOPT) -o $@
+
+$(PROGRAM): $(DEPS) $(OBJS) $(OBJS_DYNOPT) $(OTHEROBJS) $(GUEST_INIT)
+       $(E) "  LINK    " $@
+       $(Q) $(CC) $(CFLAGS) $(OBJS) $(OBJS_DYNOPT) $(OTHEROBJS) $(GUEST_OBJS) $(LIBS) $(LIBS_DYNOPT) -o $@
+
+$(PROGRAM_ALIAS): $(PROGRAM)
+       $(E) "  LN      " $@
+       $(Q) ln -f $(PROGRAM) $@
+
+$(GUEST_INIT): guest/init.c
+       $(E) "  LINK    " $@
+       $(Q) $(CC) -static guest/init.c -o $@
+       $(Q) $(LD) -r -b binary -o guest/guest_init.o $(GUEST_INIT)
+
+$(DEPS):
+
+util/rbtree.d: ../../lib/rbtree.c
+       $(Q) $(CC) -M -MT util/rbtree.o $(CFLAGS) $< -o $@
+
+%.d: %.c
+       $(Q) $(CC) -M -MT $(patsubst %.d,%.o,$@) $(CFLAGS) $< -o $@
+
+# The header file common-cmds.h is needed for compilation of builtin-help.c.
+builtin-help.d: $(KVM_INCLUDE)/common-cmds.h
+
+$(OBJS):
+
+# This rule relaxes the -Werror on libfdt, since for now it still has
+# a bunch of warnings. :(
+../../scripts/dtc/libfdt/%.o: ../../scripts/dtc/libfdt/%.c
+       $(E) "  CC      " $@
+       $(Q) $(CC) -c $(CFLAGS_EASYGOING) $< -o $@
+
+util/rbtree.static.o util/rbtree.o: ../../lib/rbtree.c
+       $(E) "  CC      " $@
+       $(Q) $(CC) -c $(CFLAGS) $< -o $@
+
+%.static.o: %.c
+       $(E) "  CC      " $@
+       $(Q) $(CC) -c $(CFLAGS) $(CFLAGS_STATOPT)  $< -o $@
+
+%.o: %.c
+       $(E) "  CC      " $@
+       $(Q) $(CC) -c $(CFLAGS) $(CFLAGS_DYNOPT) $< -o $@
+
+
+$(KVM_INCLUDE)/common-cmds.h: util/generate-cmdlist.sh command-list.txt
+
+$(KVM_INCLUDE)/common-cmds.h: $(wildcard Documentation/kvm-*.txt)
+       $(E) "  GEN     " $@
+       $(Q) util/generate-cmdlist.sh > $@+ && mv $@+ $@
+
+#
+# BIOS assembly weirdness
+#
+BIOS_CFLAGS += -m32
+BIOS_CFLAGS += -march=i386
+BIOS_CFLAGS += -mregparm=3
+
+BIOS_CFLAGS += -fno-stack-protector
+BIOS_CFLAGS += -I../../arch/$(ARCH)
+
+x86/bios.o: x86/bios/bios.bin x86/bios/bios-rom.h
+
+x86/bios/bios.bin.elf: x86/bios/entry.S x86/bios/e820.c x86/bios/int10.c x86/bios/int15.c x86/bios/rom.ld.S
+       $(E) "  CC       x86/bios/memcpy.o"
+       $(Q) $(CC) -include code16gcc.h $(CFLAGS) $(BIOS_CFLAGS) -c -s x86/bios/memcpy.c -o x86/bios/memcpy.o
+       $(E) "  CC       x86/bios/e820.o"
+       $(Q) $(CC) -include code16gcc.h $(CFLAGS) $(BIOS_CFLAGS) -c -s x86/bios/e820.c -o x86/bios/e820.o
+       $(E) "  CC       x86/bios/int10.o"
+       $(Q) $(CC) -include code16gcc.h $(CFLAGS) $(BIOS_CFLAGS) -c -s x86/bios/int10.c -o x86/bios/int10.o
+       $(E) "  CC       x86/bios/int15.o"
+       $(Q) $(CC) -include code16gcc.h $(CFLAGS) $(BIOS_CFLAGS) -c -s x86/bios/int15.c -o x86/bios/int15.o
+       $(E) "  CC       x86/bios/entry.o"
+       $(Q) $(CC) $(CFLAGS) $(BIOS_CFLAGS) -c -s x86/bios/entry.S -o x86/bios/entry.o
+       $(E) "  LD      " $@
+       $(Q) $(LD) -T x86/bios/rom.ld.S -o x86/bios/bios.bin.elf x86/bios/memcpy.o x86/bios/entry.o x86/bios/e820.o x86/bios/int10.o x86/bios/int15.o
+
+x86/bios/bios.bin: x86/bios/bios.bin.elf
+       $(E) "  OBJCOPY " $@
+       $(Q) objcopy -O binary -j .text x86/bios/bios.bin.elf x86/bios/bios.bin
+
+x86/bios/bios-rom.o: x86/bios/bios-rom.S x86/bios/bios.bin x86/bios/bios-rom.h
+       $(E) "  CC      " $@
+       $(Q) $(CC) -c $(CFLAGS) x86/bios/bios-rom.S -o x86/bios/bios-rom.o
+
+x86/bios/bios-rom.h: x86/bios/bios.bin.elf
+       $(E) "  NM      " $@
+       $(Q) cd x86/bios && sh gen-offsets.sh > bios-rom.h && cd ..
+
+check: all
+       $(MAKE) -C tests
+       ./$(PROGRAM) run tests/pit/tick.bin
+       ./$(PROGRAM) run -d tests/boot/boot_test.iso -p "init=init"
+.PHONY: check
+
+install: all
+       $(E) "  INSTALL"
+	$(Q) $(INSTALL) -d -m 755 '$(DESTDIR_SQ)$(bindir_SQ)'
+	$(Q) $(INSTALL) $(PROGRAM) '$(DESTDIR_SQ)$(bindir_SQ)'
+.PHONY: install
+
+clean:
+       $(E) "  CLEAN"
+       $(Q) rm -f x86/bios/*.bin
+       $(Q) rm -f x86/bios/*.elf
+       $(Q) rm -f x86/bios/*.o
+       $(Q) rm -f x86/bios/bios-rom.h
+       $(Q) rm -f tests/boot/boot_test.iso
+       $(Q) rm -rf tests/boot/rootfs/
+       $(Q) rm -f $(DEPS) $(OBJS) $(OTHEROBJS) $(OBJS_DYNOPT) $(STATIC_OBJS) $(PROGRAM) $(PROGRAM_ALIAS) $(PROGRAM)-static $(GUEST_INIT) $(GUEST_OBJS)
+       $(Q) rm -f cscope.*
+       $(Q) rm -f tags
+       $(Q) rm -f TAGS
+       $(Q) rm -f $(KVM_INCLUDE)/common-cmds.h
+       $(Q) rm -f KVMTOOLS-VERSION-FILE
+.PHONY: clean
+
+KVM_DEV        ?= /dev/kvm
+
+$(KVM_DEV):
+       $(E) "  MKNOD " $@
+       $(Q) mknod $@ char 10 232
+
+devices: $(KVM_DEV)
+.PHONY: devices
+
+TAGS:
+       $(E) "  GEN" $@
+       $(Q) $(RM) -f TAGS
+       $(Q) $(FIND) . -name '*.[hcS]' -print | xargs etags -a
+.PHONY: TAGS
+
+tags:
+       $(E) "  GEN" $@
+       $(Q) $(RM) -f tags
+       $(Q) $(FIND) . -name '*.[hcS]' -print | xargs ctags -a
+.PHONY: tags
+
+cscope:
+       $(E) "  GEN" $@
+       $(Q) $(FIND) . -name '*.[hcS]' -print > cscope.files
+       $(Q) $(CSCOPE) -bkqu
+.PHONY: cscope
+
+# Deps
+-include $(DEPS)
diff --git a/tools/kvm/README b/tools/kvm/README
new file mode 100644 (file)
index 0000000..358fa23
--- /dev/null
@@ -0,0 +1,112 @@
+Native Linux KVM tool
+=====================
+The goal of this tool is to provide a clean, from-scratch, lightweight
+KVM host tool implementation that can boot Linux guest images (just a
+hobby, won't be big and professional like QEMU) with no BIOS
+dependencies and with only the minimal amount of legacy device
+emulation.
+
+It's great as a learning tool if you want to get your feet wet in
+virtualization land: it's only 5 KLOC of clean C code that can already
+boot a guest Linux image.
+
+Right now it can boot a Linux image and provide output via a serial
+console over the host terminal, i.e. you can boot a guest Linux image
+in a terminal or over ssh and log into the guest without much guest- or
+host-side setup work.
+
+1. To try out the tool, clone the git repository:
+
+  git clone git://github.com/penberg/linux-kvm.git
+
+or alternatively, if you already have a kernel source tree:
+
+  git remote add kvm-tool git://github.com/penberg/linux-kvm.git
+  git remote update
+  git checkout -b kvm-tool kvm-tool/master
+
+2. Compile the tool:
+
+  cd tools/kvm && make
+
+3. Download a raw userspace image:
+
+  wget http://wiki.qemu.org/download/linux-0.2.img.bz2 && bunzip2 linux-0.2.img.bz2
+
+4. The guest kernel has to be built with the following configuration:
+
+ - For the default console output:
+       CONFIG_SERIAL_8250=y
+       CONFIG_SERIAL_8250_CONSOLE=y
+
+ - For running 32bit images on 64bit hosts:
+       CONFIG_IA32_EMULATION=y
+
+ - Proper FS options according to image FS (e.g. CONFIG_EXT2_FS, CONFIG_EXT4_FS).
+
+ - For all virtio devices listed below:
+       CONFIG_VIRTIO=y
+       CONFIG_VIRTIO_RING=y
+       CONFIG_VIRTIO_PCI=y
+
+ - For virtio-blk devices (--disk, -d):
+       CONFIG_VIRTIO_BLK=y
+
+ - For virtio-net devices ([--network, -n] virtio):
+       CONFIG_VIRTIO_NET=y
+
+ - For virtio-9p devices (--virtio-9p):
+       CONFIG_NET_9P=y
+       CONFIG_NET_9P_VIRTIO=y
+       CONFIG_9P_FS=y
+
+ - For virtio-balloon device (--balloon):
+       CONFIG_VIRTIO_BALLOON=y
+
+ - For virtio-console device (--console virtio):
+       CONFIG_VIRTIO_CONSOLE=y
+
+ - For virtio-rng device (--rng):
+       CONFIG_HW_RANDOM_VIRTIO=y
+
+ - For vesa device (--sdl or --vnc):
+       CONFIG_FB_VESA=y
+
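+As a convenience, the options above can be enabled from the guest kernel
+source tree before rebuilding, e.g. with the scripts/config helper
+(a sketch, assuming an existing .config):
+
+  ./scripts/config -e SERIAL_8250 -e SERIAL_8250_CONSOLE \
+                   -e VIRTIO -e VIRTIO_RING -e VIRTIO_PCI -e VIRTIO_BLK
+  make oldconfig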
+
+5. And finally, launch the hypervisor:
+
+  ./lkvm run --disk linux-0.2.img \
+           --kernel ../../arch/x86/boot/bzImage
+
+or
+
+  sudo ./lkvm run --disk linux-0.2.img \
+                --kernel ../../arch/x86/boot/bzImage \
+                --network virtio
+
+The tool has been written by Pekka Enberg, Cyrill Gorcunov, Asias He,
+Sasha Levin and Prasad Joshi. Special thanks to Avi Kivity for his help
+on KVM internals and Ingo Molnar for all-around support and encouragement!
+
+See the following thread for the original discussion of this project's
+motivation:
+
+http://thread.gmane.org/gmane.linux.kernel/962051/focus=962620
+
+Build dependencies
+==================
+For deb-based systems:
+32-bit:
+sudo apt-get install build-essential
+64-bit:
+sudo apt-get install build-essential libc6-dev-i386
+
+For rpm-based systems:
+32-bit:
+yum install glibc-devel
+64-bit:
+yum install glibc-devel glibc-static
+
+On 64-bit Arch Linux, make sure the multilib repository is enabled in your
+/etc/pacman.conf and run:
+pacman -Sy lib32-glibc
diff --git a/tools/kvm/builtin-balloon.c b/tools/kvm/builtin-balloon.c
new file mode 100644 (file)
index 0000000..5bd2291
--- /dev/null
@@ -0,0 +1,80 @@
+#include <stdio.h>
+#include <string.h>
+#include <signal.h>
+
+#include <kvm/util.h>
+#include <kvm/kvm-cmd.h>
+#include <kvm/builtin-balloon.h>
+#include <kvm/parse-options.h>
+#include <kvm/kvm.h>
+#include <kvm/kvm-ipc.h>
+
+static const char *instance_name;
+static u64 inflate;
+static u64 deflate;
+
+static const char * const balloon_usage[] = {
+       "lkvm balloon [-n name] [-p pid] [-i amount] [-d amount]",
+       NULL
+};
+
+static const struct option balloon_options[] = {
+       OPT_GROUP("Instance options:"),
+       OPT_STRING('n', "name", &instance_name, "name", "Instance name"),
+       OPT_GROUP("Balloon options:"),
+       OPT_U64('i', "inflate", &inflate, "Amount to inflate"),
+       OPT_U64('d', "deflate", &deflate, "Amount to deflate"),
+       OPT_END(),
+};
+
+void kvm_balloon_help(void)
+{
+       usage_with_options(balloon_usage, balloon_options);
+}
+
+static void parse_balloon_options(int argc, const char **argv)
+{
+       while (argc != 0) {
+               argc = parse_options(argc, argv, balloon_options, balloon_usage,
+                               PARSE_OPT_STOP_AT_NON_OPTION);
+               if (argc != 0)
+                       kvm_balloon_help();
+       }
+}
+
+int kvm_cmd_balloon(int argc, const char **argv, const char *prefix)
+{
+       int instance;
+       int r;
+       int amount;
+
+       parse_balloon_options(argc, argv);
+
+       if (inflate == 0 && deflate == 0)
+               kvm_balloon_help();
+
+       if (instance_name == NULL)
+               kvm_balloon_help();
+
+       instance = kvm__get_sock_by_instance(instance_name);
+
+       if (instance <= 0)
+               die("Failed locating instance");
+
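+	/*
+	 * The sign of the amount encodes the direction: a positive value
+	 * inflates the balloon (reclaiming memory from the guest), a
+	 * negative one deflates it.
+	 */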
+       if (inflate)
+               amount = inflate;
+       else if (deflate)
+               amount = -deflate;
+       else
+               kvm_balloon_help();
+
+       r = kvm_ipc__send_msg(instance, KVM_IPC_BALLOON,
+                       sizeof(amount), (u8 *)&amount);
+
+       close(instance);
+
+       if (r < 0)
+               return -1;
+
+       return 0;
+}
diff --git a/tools/kvm/builtin-debug.c b/tools/kvm/builtin-debug.c
new file mode 100644 (file)
index 0000000..4ae51d2
--- /dev/null
@@ -0,0 +1,110 @@
+#include <kvm/util.h>
+#include <kvm/kvm-cmd.h>
+#include <kvm/builtin-debug.h>
+#include <kvm/kvm.h>
+#include <kvm/parse-options.h>
+#include <kvm/kvm-ipc.h>
+#include <kvm/read-write.h>
+
+#include <stdio.h>
+#include <string.h>
+#include <signal.h>
+
+#define BUFFER_SIZE 100
+
+static bool all;
+static int nmi = -1;
+static bool dump;
+static const char *instance_name;
+static const char *sysrq;
+
+static const char * const debug_usage[] = {
+       "lkvm debug [--all] [-n name] [-d] [-m vcpu]",
+       NULL
+};
+
+static const struct option debug_options[] = {
+       OPT_GROUP("General options:"),
+       OPT_BOOLEAN('d', "dump", &dump, "Generate a debug dump from guest"),
+       OPT_INTEGER('m', "nmi", &nmi, "Generate NMI on VCPU"),
+       OPT_STRING('s', "sysrq", &sysrq, "sysrq", "Inject a sysrq"),
+       OPT_GROUP("Instance options:"),
+       OPT_BOOLEAN('a', "all", &all, "Debug all instances"),
+       OPT_STRING('n', "name", &instance_name, "name", "Instance name"),
+       OPT_END()
+};
+
+static void parse_debug_options(int argc, const char **argv)
+{
+       while (argc != 0) {
+               argc = parse_options(argc, argv, debug_options, debug_usage,
+                               PARSE_OPT_STOP_AT_NON_OPTION);
+               if (argc != 0)
+                       kvm_debug_help();
+       }
+}
+
+void kvm_debug_help(void)
+{
+       usage_with_options(debug_usage, debug_options);
+}
+
+static int do_debug(const char *name, int sock)
+{
+       char buff[BUFFER_SIZE];
+       struct debug_cmd_params cmd = {.dbg_type = 0};
+       int r;
+
+       if (dump)
+               cmd.dbg_type |= KVM_DEBUG_CMD_TYPE_DUMP;
+
+       if (nmi != -1) {
+               cmd.dbg_type |= KVM_DEBUG_CMD_TYPE_NMI;
+               cmd.cpu = nmi;
+       }
+
+       if (sysrq) {
+               cmd.dbg_type |= KVM_DEBUG_CMD_TYPE_SYSRQ;
+               cmd.sysrq = sysrq[0];
+       }
+
+       r = kvm_ipc__send_msg(sock, KVM_IPC_DEBUG, sizeof(cmd), (u8 *)&cmd);
+       if (r < 0)
+               return r;
+
+       if (!dump)
+               return 0;
+
+       do {
+               r = xread(sock, buff, BUFFER_SIZE);
+               if (r < 0)
+                       return 0;
+               printf("%.*s", r, buff);
+       } while (r > 0);
+
+       return 0;
+}
+
+int kvm_cmd_debug(int argc, const char **argv, const char *prefix)
+{
+	int instance;
+	int r;
+
+	parse_debug_options(argc, argv);
+
+       if (all)
+               return kvm__enumerate_instances(do_debug);
+
+       if (instance_name == NULL)
+               kvm_debug_help();
+
+       instance = kvm__get_sock_by_instance(instance_name);
+
+       if (instance <= 0)
+               die("Failed locating instance");
+
+       r = do_debug(instance_name, instance);
+
+       close(instance);
+
+       return r;
+}
diff --git a/tools/kvm/builtin-help.c b/tools/kvm/builtin-help.c
new file mode 100644 (file)
index 0000000..5970fb7
--- /dev/null
@@ -0,0 +1,63 @@
+#include <stdio.h>
+#include <string.h>
+
+/* user defined headers */
+#include <common-cmds.h>
+
+#include <kvm/util.h>
+#include <kvm/kvm-cmd.h>
+#include <kvm/builtin-help.h>
+#include <kvm/kvm.h>
+
+
+const char kvm_usage_string[] =
+       "lkvm COMMAND [ARGS]";
+
+const char kvm_more_info_string[] =
+       "See 'lkvm help COMMAND' for more information on a specific command.";
+
+
+static void list_common_cmds_help(void)
+{
+       unsigned int i, longest = 0;
+
+       for (i = 0; i < ARRAY_SIZE(common_cmds); i++) {
+               if (longest < strlen(common_cmds[i].name))
+                       longest = strlen(common_cmds[i].name);
+       }
+
+       puts(" The most commonly used lkvm commands are:");
+       for (i = 0; i < ARRAY_SIZE(common_cmds); i++) {
+               printf("   %-*s   ", longest, common_cmds[i].name);
+               puts(common_cmds[i].help);
+       }
+}
+
+static void kvm_help(void)
+{
+       printf("\n To start a simple non-privileged shell run '%s run'\n\n"
+               "usage: %s\n\n", KVM_BINARY_NAME, kvm_usage_string);
+       list_common_cmds_help();
+       printf("\n %s\n\n", kvm_more_info_string);
+}
+
+
+static void help_cmd(const char *cmd)
+{
+       struct cmd_struct *p;
+       p = kvm_get_command(kvm_commands, cmd);
+       if (!p)
+               kvm_help();
+       else if (p->help)
+               p->help();
+}
+
+int kvm_cmd_help(int argc, const char **argv, const char *prefix)
+{
+       if (!argv || !*argv) {
+               kvm_help();
+               return 0;
+       }
+       help_cmd(argv[0]);
+       return 0;
+}
diff --git a/tools/kvm/builtin-list.c b/tools/kvm/builtin-list.c
new file mode 100644 (file)
index 0000000..9299f17
--- /dev/null
@@ -0,0 +1,149 @@
+#include <kvm/util.h>
+#include <kvm/kvm-cmd.h>
+#include <kvm/builtin-list.h>
+#include <kvm/kvm.h>
+#include <kvm/parse-options.h>
+#include <kvm/kvm-ipc.h>
+
+#include <dirent.h>
+#include <stdio.h>
+#include <string.h>
+#include <signal.h>
+#include <fcntl.h>
+
+static bool run;
+static bool rootfs;
+
+static const char * const list_usage[] = {
+       "lkvm list",
+       NULL
+};
+
+static const struct option list_options[] = {
+       OPT_GROUP("General options:"),
+       OPT_BOOLEAN('i', "run", &run, "List running instances"),
+       OPT_BOOLEAN('r', "rootfs", &rootfs, "List rootfs instances"),
+       OPT_END()
+};
+
+#define KVM_INSTANCE_RUNNING   "running"
+#define KVM_INSTANCE_PAUSED    "paused"
+#define KVM_INSTANCE_SHUTOFF   "shut off"
+
+void kvm_list_help(void)
+{
+       usage_with_options(list_usage, list_options);
+}
+
+static pid_t get_pid(int sock)
+{
+       pid_t pid;
+       int r;
+
+       r = kvm_ipc__send(sock, KVM_IPC_PID);
+       if (r < 0)
+               return r;
+
+       r = read(sock, &pid, sizeof(pid));
+       if (r < 0)
+               return r;
+
+       return pid;
+}
+
+int get_vmstate(int sock)
+{
+       int vmstate;
+       int r;
+
+       r = kvm_ipc__send(sock, KVM_IPC_VMSTATE);
+       if (r < 0)
+               return r;
+
+       r = read(sock, &vmstate, sizeof(vmstate));
+       if (r < 0)
+               return r;
+
+       return vmstate;
+}
+
+static int print_guest(const char *name, int sock)
+{
+       pid_t pid;
+       int vmstate;
+
+       pid = get_pid(sock);
+       vmstate = get_vmstate(sock);
+
+       if ((int)pid < 0 || vmstate < 0)
+               return -1;
+
+       if (vmstate == KVM_VMSTATE_PAUSED)
+               printf("%5d %-20s %s\n", pid, name, KVM_INSTANCE_PAUSED);
+       else
+               printf("%5d %-20s %s\n", pid, name, KVM_INSTANCE_RUNNING);
+
+       return 0;
+}
+
+static int kvm_list_running_instances(void)
+{
+       return kvm__enumerate_instances(print_guest);
+}
+
+static int kvm_list_rootfs(void)
+{
+       DIR *dir;
+       struct dirent *dirent;
+
+       dir = opendir(kvm__get_dir());
+       if (dir == NULL)
+               return -1;
+
+       while ((dirent = readdir(dir))) {
+               if (dirent->d_type == DT_DIR &&
+                       strcmp(dirent->d_name, ".") &&
+                       strcmp(dirent->d_name, ".."))
+                       printf("%5s %-20s %s\n", "", dirent->d_name, KVM_INSTANCE_SHUTOFF);
+       }
+
+       return 0;
+}
+
+static void parse_setup_options(int argc, const char **argv)
+{
+       while (argc != 0) {
+               argc = parse_options(argc, argv, list_options, list_usage,
+                               PARSE_OPT_STOP_AT_NON_OPTION);
+               if (argc != 0)
+                       kvm_list_help();
+       }
+}
+
+int kvm_cmd_list(int argc, const char **argv, const char *prefix)
+{
+       int r;
+
+       parse_setup_options(argc, argv);
+
+       if (!run && !rootfs)
+               run = rootfs = true;
+
+       printf("%6s %-20s %s\n", "PID", "NAME", "STATE");
+       printf("------------------------------------\n");
+
+       if (run) {
+               r = kvm_list_running_instances();
+               if (r < 0)
+                       perror("Error listing instances");
+       }
+
+       if (rootfs) {
+               r = kvm_list_rootfs();
+               if (r < 0)
+                       perror("Error listing rootfs");
+       }
+
+       return 0;
+}
diff --git a/tools/kvm/builtin-pause.c b/tools/kvm/builtin-pause.c
new file mode 100644 (file)
index 0000000..c08595a
--- /dev/null
@@ -0,0 +1,88 @@
+#include <kvm/util.h>
+#include <kvm/kvm-cmd.h>
+#include <kvm/builtin-pause.h>
+#include <kvm/builtin-list.h>
+#include <kvm/kvm.h>
+#include <kvm/parse-options.h>
+#include <kvm/kvm-ipc.h>
+
+#include <stdio.h>
+#include <string.h>
+#include <signal.h>
+
+static bool all;
+static const char *instance_name;
+
+static const char * const pause_usage[] = {
+       "lkvm pause [--all] [-n name]",
+       NULL
+};
+
+static const struct option pause_options[] = {
+       OPT_GROUP("General options:"),
+       OPT_BOOLEAN('a', "all", &all, "Pause all instances"),
+       OPT_STRING('n', "name", &instance_name, "name", "Instance name"),
+       OPT_END()
+};
+
+static void parse_pause_options(int argc, const char **argv)
+{
+       while (argc != 0) {
+               argc = parse_options(argc, argv, pause_options, pause_usage,
+                               PARSE_OPT_STOP_AT_NON_OPTION);
+               if (argc != 0)
+                       kvm_pause_help();
+       }
+}
+
+void kvm_pause_help(void)
+{
+       usage_with_options(pause_usage, pause_options);
+}
+
+static int do_pause(const char *name, int sock)
+{
+       int r;
+       int vmstate;
+
+       vmstate = get_vmstate(sock);
+       if (vmstate < 0)
+               return vmstate;
+       if (vmstate == KVM_VMSTATE_PAUSED) {
+               printf("Guest %s is already paused.\n", name);
+               return 0;
+       }
+
+       r = kvm_ipc__send(sock, KVM_IPC_PAUSE);
+       if (r)
+               return r;
+
+       printf("Guest %s paused\n", name);
+
+       return 0;
+}
+
+int kvm_cmd_pause(int argc, const char **argv, const char *prefix)
+{
+       int instance;
+       int r;
+
+       parse_pause_options(argc, argv);
+
+       if (all)
+               return kvm__enumerate_instances(do_pause);
+
+       if (instance_name == NULL)
+               kvm_pause_help();
+
+       instance = kvm__get_sock_by_instance(instance_name);
+
+       if (instance <= 0)
+               die("Failed locating instance");
+
+       r = do_pause(instance_name, instance);
+
+       close(instance);
+
+       return r;
+}
diff --git a/tools/kvm/builtin-resume.c b/tools/kvm/builtin-resume.c
new file mode 100644 (file)
index 0000000..0e954b4
--- /dev/null
@@ -0,0 +1,88 @@
+#include <kvm/util.h>
+#include <kvm/kvm-cmd.h>
+#include <kvm/builtin-resume.h>
+#include <kvm/builtin-list.h>
+#include <kvm/kvm.h>
+#include <kvm/parse-options.h>
+#include <kvm/kvm-ipc.h>
+
+#include <stdio.h>
+#include <string.h>
+#include <signal.h>
+
+static bool all;
+static const char *instance_name;
+
+static const char * const resume_usage[] = {
+       "lkvm resume [--all] [-n name]",
+       NULL
+};
+
+static const struct option resume_options[] = {
+       OPT_GROUP("General options:"),
+       OPT_BOOLEAN('a', "all", &all, "Resume all instances"),
+       OPT_STRING('n', "name", &instance_name, "name", "Instance name"),
+       OPT_END()
+};
+
+static void parse_resume_options(int argc, const char **argv)
+{
+       while (argc != 0) {
+               argc = parse_options(argc, argv, resume_options, resume_usage,
+                               PARSE_OPT_STOP_AT_NON_OPTION);
+               if (argc != 0)
+                       kvm_resume_help();
+       }
+}
+
+void kvm_resume_help(void)
+{
+       usage_with_options(resume_usage, resume_options);
+}
+
+static int do_resume(const char *name, int sock)
+{
+       int r;
+       int vmstate;
+
+       vmstate = get_vmstate(sock);
+       if (vmstate < 0)
+               return vmstate;
+       if (vmstate == KVM_VMSTATE_RUNNING) {
+               printf("Guest %s is still running.\n", name);
+               return 0;
+       }
+
+       r = kvm_ipc__send(sock, KVM_IPC_RESUME);
+       if (r)
+               return r;
+
+       printf("Guest %s resumed\n", name);
+
+       return 0;
+}
+
+int kvm_cmd_resume(int argc, const char **argv, const char *prefix)
+{
+       int instance;
+       int r;
+
+       parse_resume_options(argc, argv);
+
+       if (all)
+               return kvm__enumerate_instances(do_resume);
+
+       if (instance_name == NULL)
+               kvm_resume_help();
+
+       instance = kvm__get_sock_by_instance(instance_name);
+
+       if (instance <= 0)
+               die("Failed locating instance");
+
+       r = do_resume(instance_name, instance);
+
+       close(instance);
+
+       return r;
+}
diff --git a/tools/kvm/builtin-run.c b/tools/kvm/builtin-run.c
new file mode 100644 (file)
index 0000000..0412e58
--- /dev/null
@@ -0,0 +1,1426 @@
+#include "kvm/builtin-run.h"
+
+#include "kvm/builtin-setup.h"
+#include "kvm/virtio-balloon.h"
+#include "kvm/virtio-console.h"
+#include "kvm/parse-options.h"
+#include "kvm/8250-serial.h"
+#include "kvm/framebuffer.h"
+#include "kvm/disk-image.h"
+#include "kvm/threadpool.h"
+#include "kvm/virtio-scsi.h"
+#include "kvm/virtio-blk.h"
+#include "kvm/virtio-net.h"
+#include "kvm/virtio-rng.h"
+#include "kvm/ioeventfd.h"
+#include "kvm/virtio-9p.h"
+#include "kvm/barrier.h"
+#include "kvm/kvm-cpu.h"
+#include "kvm/ioport.h"
+#include "kvm/symbol.h"
+#include "kvm/i8042.h"
+#include "kvm/mutex.h"
+#include "kvm/term.h"
+#include "kvm/util.h"
+#include "kvm/strbuf.h"
+#include "kvm/vesa.h"
+#include "kvm/irq.h"
+#include "kvm/kvm.h"
+#include "kvm/pci.h"
+#include "kvm/rtc.h"
+#include "kvm/sdl.h"
+#include "kvm/vnc.h"
+#include "kvm/guest_compat.h"
+#include "kvm/pci-shmem.h"
+#include "kvm/kvm-ipc.h"
+#include "kvm/builtin-debug.h"
+
+#include <linux/types.h>
+#include <linux/err.h>
+
+#include <sys/utsname.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <termios.h>
+#include <signal.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+#include <ctype.h>
+#include <stdio.h>
+
+#define DEFAULT_KVM_DEV                "/dev/kvm"
+#define DEFAULT_CONSOLE                "serial"
+#define DEFAULT_NETWORK                "user"
+#define DEFAULT_HOST_ADDR      "192.168.33.1"
+#define DEFAULT_GUEST_ADDR     "192.168.33.15"
+#define DEFAULT_GUEST_MAC      "02:15:15:15:15:15"
+#define DEFAULT_HOST_MAC       "02:01:01:01:01:01"
+#define DEFAULT_SCRIPT         "none"
+const char *DEFAULT_SANDBOX_FILENAME = "guest/sandbox.sh";
+
+#define MB_SHIFT               (20)
+#define KB_SHIFT               (10)
+#define GB_SHIFT               (30)
+#define MIN_RAM_SIZE_MB                (64ULL)
+#define MIN_RAM_SIZE_BYTE      (MIN_RAM_SIZE_MB << MB_SHIFT)
+
+struct kvm *kvm;
+struct kvm_cpu **kvm_cpus;
+__thread struct kvm_cpu *current_kvm_cpu;
+
+static struct disk_image_params disk_image[MAX_DISK_IMAGES];
+static u64 ram_size;
+static u8  image_count;
+static u8 num_net_devices;
+static bool virtio_rng;
+static const char *kernel_cmdline;
+static const char *kernel_filename;
+static const char *vmlinux_filename;
+static const char *initrd_filename;
+static const char *firmware_filename;
+static const char *console;
+static const char *dev;
+static const char *network;
+static const char *host_ip;
+static const char *guest_ip;
+static const char *guest_mac;
+static const char *host_mac;
+static const char *script;
+static const char *guest_name;
+static const char *sandbox;
+static const char *hugetlbfs_path;
+static const char *custom_rootfs_name = "default";
+static struct virtio_net_params *net_params;
+static bool single_step;
+static bool vnc;
+static bool sdl;
+static bool balloon;
+static bool using_rootfs;
+static bool custom_rootfs;
+static bool no_net;
+static bool no_dhcp;
+extern bool ioport_debug;
+extern bool mmio_debug;
+static int  kvm_run_wrapper;
+extern int  active_console;
+extern int  debug_iodelay;
+
+bool do_debug_print = false;
+
+static int nrcpus;
+static int vidmode = -1;
+
+extern char _binary_guest_init_start;
+extern char _binary_guest_init_size;
+
+static const char * const run_usage[] = {
+       "lkvm run [<options>] [<kernel image>]",
+       NULL
+};
+
+enum {
+       KVM_RUN_DEFAULT,
+       KVM_RUN_SANDBOX,
+};
+
+void kvm_run_set_wrapper_sandbox(void)
+{
+       kvm_run_wrapper = KVM_RUN_SANDBOX;
+}
+
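+/*
+ * A --disk argument is either a rootfs directory (shared with the guest
+ * over virtio-9p) or a disk image path with optional flags, e.g. the
+ * hypothetical "disk.img,ro,direct" or "scsi:<wwpn>:<tpgt>".
+ */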
+static int img_name_parser(const struct option *opt, const char *arg, int unset)
+{
+       char path[PATH_MAX];
+       const char *cur;
+       struct stat st;
+       char *sep;
+
+       if (stat(arg, &st) == 0 &&
+           S_ISDIR(st.st_mode)) {
+               char tmp[PATH_MAX];
+
+               if (using_rootfs)
+			die("Please use at most one rootfs directory");
+
+               if (realpath(arg, tmp) == 0 ||
+                   virtio_9p__register(kvm, tmp, "/dev/root") < 0)
+                       die("Unable to initialize virtio 9p");
+               using_rootfs = 1;
+               return 0;
+       }
+
+       snprintf(path, PATH_MAX, "%s%s", kvm__get_dir(), arg);
+
+       if (stat(path, &st) == 0 &&
+           S_ISDIR(st.st_mode)) {
+               char tmp[PATH_MAX];
+
+               if (using_rootfs)
+			die("Please use at most one rootfs directory");
+
+               if (realpath(path, tmp) == 0 ||
+                   virtio_9p__register(kvm, tmp, "/dev/root") < 0)
+                       die("Unable to initialize virtio 9p");
+               if (virtio_9p__register(kvm, "/", "hostfs") < 0)
+                       die("Unable to initialize virtio 9p");
+               kvm_setup_resolv(arg);
+               using_rootfs = custom_rootfs = 1;
+               custom_rootfs_name = arg;
+               return 0;
+       }
+
+       if (image_count >= MAX_DISK_IMAGES)
+               die("Currently only 4 images are supported");
+
+       disk_image[image_count].filename = arg;
+       cur = arg;
+
+       if (strncmp(arg, "scsi:", 5) == 0) {
+               sep = strstr(arg, ":");
+               if (sep)
+                       disk_image[image_count].wwpn = sep + 1;
+               sep = strstr(sep + 1, ":");
+		if (sep) {
+			*sep = 0;
+			disk_image[image_count].tpgt = sep + 1;
+			/* Optional ",ro"/",direct" flags follow the tpgt. */
+			cur = sep + 1;
+		}
+       }
+
+       do {
+               sep = strstr(cur, ",");
+               if (sep) {
+                       if (strncmp(sep + 1, "ro", 2) == 0)
+                               disk_image[image_count].readonly = true;
+                       else if (strncmp(sep + 1, "direct", 6) == 0)
+                               disk_image[image_count].direct = true;
+                       *sep = 0;
+                       cur = sep + 1;
+               }
+       } while (sep);
+
+       image_count++;
+
+       return 0;
+}
+
+static int virtio_9p_rootdir_parser(const struct option *opt, const char *arg, int unset)
+{
+       char *tag_name;
+       char tmp[PATH_MAX];
+
+       /*
+        * 9p dir can be of the form dirname,tag_name or
+	 * just dirname. In the latter case we use the
+	 * default tag name.
+        */
+       tag_name = strstr(arg, ",");
+       if (tag_name) {
+               *tag_name = '\0';
+               tag_name++;
+       }
+       if (realpath(arg, tmp)) {
+               if (virtio_9p__register(kvm, tmp, tag_name) < 0)
+                       die("Unable to initialize virtio 9p");
+       } else
+               die("Failed resolving 9p path");
+       return 0;
+}
+
+static int tty_parser(const struct option *opt, const char *arg, int unset)
+{
+       int tty = atoi(arg);
+
+       term_set_tty(tty);
+
+       return 0;
+}
+
+static inline void str_to_mac(const char *str, char *mac)
+{
+       sscanf(str, "%hhx:%hhx:%hhx:%hhx:%hhx:%hhx",
+               mac, mac+1, mac+2, mac+3, mac+4, mac+5);
+}
+static int set_net_param(struct virtio_net_params *p, const char *param,
+                               const char *val)
+{
+       if (strcmp(param, "guest_mac") == 0) {
+               str_to_mac(val, p->guest_mac);
+       } else if (strcmp(param, "mode") == 0) {
+               if (!strncmp(val, "user", 4)) {
+                       int i;
+
+                       for (i = 0; i < num_net_devices; i++)
+                               if (net_params[i].mode == NET_MODE_USER)
+                                       die("Only one usermode network device allowed at a time");
+                       p->mode = NET_MODE_USER;
+               } else if (!strncmp(val, "tap", 3)) {
+                       p->mode = NET_MODE_TAP;
+               } else if (!strncmp(val, "none", 4)) {
+                       no_net = 1;
+                       return -1;
+               } else
+			die("Unknown network mode %s, please use user, tap or none", network);
+       } else if (strcmp(param, "script") == 0) {
+               p->script = strdup(val);
+       } else if (strcmp(param, "guest_ip") == 0) {
+               p->guest_ip = strdup(val);
+       } else if (strcmp(param, "host_ip") == 0) {
+               p->host_ip = strdup(val);
+       } else if (strcmp(param, "trans") == 0) {
+               p->trans = strdup(val);
+       } else if (strcmp(param, "vhost") == 0) {
+               p->vhost = atoi(val);
+       } else if (strcmp(param, "fd") == 0) {
+               p->fd = atoi(val);
+       } else
+               die("Unknown network parameter %s", param);
+
+       return 0;
+}
+
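+/*
+ * Parse a comma-separated list of key=value pairs, e.g. the hypothetical
+ * "mode=tap,guest_ip=192.168.33.15,vhost=1"; each pair is handed to
+ * set_net_param() above.
+ */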
+static int netdev_parser(const struct option *opt, const char *arg, int unset)
+{
+       struct virtio_net_params p;
+       char *buf = NULL, *cmd = NULL, *cur = NULL;
+       bool on_cmd = true;
+
+       if (arg) {
+               buf = strdup(arg);
+               if (buf == NULL)
+                       die("Failed allocating new net buffer");
+               cur = strtok(buf, ",=");
+       }
+
+       p = (struct virtio_net_params) {
+               .guest_ip       = DEFAULT_GUEST_ADDR,
+               .host_ip        = DEFAULT_HOST_ADDR,
+               .script         = DEFAULT_SCRIPT,
+               .mode           = NET_MODE_TAP,
+       };
+
+       str_to_mac(DEFAULT_GUEST_MAC, p.guest_mac);
+       p.guest_mac[5] += num_net_devices;
+
+       while (cur) {
+               if (on_cmd) {
+                       cmd = cur;
+               } else {
+                       if (set_net_param(&p, cmd, cur) < 0)
+                               goto done;
+               }
+               on_cmd = !on_cmd;
+
+               cur = strtok(NULL, ",=");
+	}
+
+       num_net_devices++;
+
+       net_params = realloc(net_params, num_net_devices * sizeof(*net_params));
+       if (net_params == NULL)
+               die("Failed adding new network device");
+
+       net_params[num_net_devices - 1] = p;
+
+done:
+       free(buf);
+       return 0;
+}
+
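+/*
+ * Option format: [pci:]<addr>:<size>[:handle=<handle>][:create]. For
+ * example, the hypothetical "pci:0xfd000000:16M:handle=/kvm_shmem:create"
+ * maps a 16 MiB region at guest physical address 0xfd000000, backed by the
+ * POSIX shm object /kvm_shmem, creating it if necessary.
+ */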
+static int shmem_parser(const struct option *opt, const char *arg, int unset)
+{
+       const u64 default_size = SHMEM_DEFAULT_SIZE;
+       const u64 default_phys_addr = SHMEM_DEFAULT_ADDR;
+       const char *default_handle = SHMEM_DEFAULT_HANDLE;
+       struct shmem_info *si = malloc(sizeof(struct shmem_info));
+       u64 phys_addr;
+       u64 size;
+       char *handle = NULL;
+       int create = 0;
+       const char *p = arg;
+       char *next;
+       int base = 10;
+       int verbose = 0;
+
+       const int skip_pci = strlen("pci:");
+       if (verbose)
+               pr_info("shmem_parser(%p,%s,%d)", opt, arg, unset);
+       /* parse out optional addr family */
+       if (strcasestr(p, "pci:")) {
+               p += skip_pci;
+       } else if (strcasestr(p, "mem:")) {
+               die("I can't add to E820 map yet.\n");
+       }
+       /* parse out physical addr */
+       base = 10;
+       if (strcasestr(p, "0x"))
+               base = 16;
+       phys_addr = strtoll(p, &next, base);
+       if (next == p && phys_addr == 0) {
+               pr_info("shmem: no physical addr specified, using default.");
+               phys_addr = default_phys_addr;
+       }
+       if (*next != ':' && *next != '\0')
+               die("shmem: unexpected chars after phys addr.\n");
+       if (*next == '\0')
+               p = next;
+       else
+               p = next + 1;
+       /* parse out size */
+       base = 10;
+       if (strcasestr(p, "0x"))
+               base = 16;
+       size = strtoll(p, &next, base);
+       if (next == p && size == 0) {
+               pr_info("shmem: no size specified, using default.");
+               size = default_size;
+       }
+	/* look for a [KMGkmg][Bb] size suffix; these use base 2. */
+	int skip_B = 0;
+	if (strspn(next, "KMGkmg")) {	/* might have a suffix */
+               if (*(next + 1) == 'B' || *(next + 1) == 'b')
+                       skip_B = 1;
+               switch (*next) {
+               case 'K':
+               case 'k':
+                       size = size << KB_SHIFT;
+                       break;
+               case 'M':
+               case 'm':
+                       size = size << MB_SHIFT;
+                       break;
+               case 'G':
+               case 'g':
+                       size = size << GB_SHIFT;
+                       break;
+               default:
+			die("shmem: bug in detecting size suffix.");
+                       break;
+               }
+               next += 1 + skip_B;
+       }
+       if (*next != ':' && *next != '\0') {
+               die("shmem: unexpected chars after phys size. <%c><%c>\n",
+                   *next, *p);
+       }
+       if (*next == '\0')
+               p = next;
+       else
+               p = next + 1;
+       /* parse out optional shmem handle */
+       const int skip_handle = strlen("handle=");
+       next = strcasestr(p, "handle=");
+       if (*p && next) {
+               if (p != next)
+                       die("unexpected chars before handle\n");
+               p += skip_handle;
+               next = strchrnul(p, ':');
+               if (next - p) {
+                       handle = malloc(next - p + 1);
+                       strncpy(handle, p, next - p);
+                       handle[next - p] = '\0';        /* just in case. */
+               }
+               if (*next == '\0')
+                       p = next;
+               else
+                       p = next + 1;
+       }
+       /* parse optional create flag to see if we should create shm seg. */
+       if (*p && strcasestr(p, "create")) {
+               create = 1;
+               p += strlen("create");
+       }
+       if (*p != '\0')
+               die("shmem: unexpected trailing chars\n");
+       if (handle == NULL) {
+               handle = malloc(strlen(default_handle) + 1);
+               strcpy(handle, default_handle);
+       }
+       if (verbose) {
+               pr_info("shmem: phys_addr = %llx", phys_addr);
+               pr_info("shmem: size      = %llx", size);
+               pr_info("shmem: handle    = %s", handle);
+               pr_info("shmem: create    = %d", create);
+       }
+
+       si->phys_addr = phys_addr;
+       si->size = size;
+       si->handle = handle;
+       si->create = create;
+       pci_shmem__register_mem(si);    /* ownership of si, etc. passed on. */
+       return 0;
+}
+
+static const struct option options[] = {
+       OPT_GROUP("Basic options:"),
+       OPT_STRING('\0', "name", &guest_name, "guest name",
+                       "A name for the guest"),
+       OPT_INTEGER('c', "cpus", &nrcpus, "Number of CPUs"),
+       OPT_U64('m', "mem", &ram_size, "Virtual machine memory size in MiB."),
+       OPT_CALLBACK('\0', "shmem", NULL,
+                    "[pci:]<addr>:<size>[:handle=<handle>][:create]",
+                    "Share host shmem with guest via pci device",
+                    shmem_parser),
+       OPT_CALLBACK('d', "disk", NULL, "image or rootfs_dir", "Disk image or rootfs directory", img_name_parser),
+       OPT_BOOLEAN('\0', "balloon", &balloon, "Enable virtio balloon"),
+       OPT_BOOLEAN('\0', "vnc", &vnc, "Enable VNC framebuffer"),
+       OPT_BOOLEAN('\0', "sdl", &sdl, "Enable SDL framebuffer"),
+       OPT_BOOLEAN('\0', "rng", &virtio_rng, "Enable virtio Random Number Generator"),
+       OPT_CALLBACK('\0', "9p", NULL, "dir_to_share,tag_name",
+                    "Enable virtio 9p to share files between host and guest", virtio_9p_rootdir_parser),
+       OPT_STRING('\0', "console", &console, "serial, virtio or hv",
+                       "Console to use"),
+       OPT_STRING('\0', "dev", &dev, "device_file", "KVM device file"),
+       OPT_CALLBACK('\0', "tty", NULL, "tty id",
+                    "Remap guest TTY into a pty on the host",
+                    tty_parser),
+       OPT_STRING('\0', "sandbox", &sandbox, "script",
+                       "Run this script when booting into custom rootfs"),
+       OPT_STRING('\0', "hugetlbfs", &hugetlbfs_path, "path", "Hugetlbfs path"),
+
+       OPT_GROUP("Kernel options:"),
+       OPT_STRING('k', "kernel", &kernel_filename, "kernel",
+                       "Kernel to boot in virtual machine"),
+       OPT_STRING('i', "initrd", &initrd_filename, "initrd",
+                       "Initial RAM disk image"),
+       OPT_STRING('p', "params", &kernel_cmdline, "params",
+                       "Kernel command line arguments"),
+       OPT_STRING('f', "firmware", &firmware_filename, "firmware",
+                       "Firmware image to boot in virtual machine"),
+
+       OPT_GROUP("Networking options:"),
+       OPT_CALLBACK_DEFAULT('n', "network", NULL, "network params",
+                    "Create a new guest NIC",
+                    netdev_parser, NULL),
+       OPT_BOOLEAN('\0', "no-dhcp", &no_dhcp, "Disable kernel DHCP in rootfs mode"),
+
+       OPT_GROUP("BIOS options:"),
+       OPT_INTEGER('\0', "vidmode", &vidmode,
+                   "Video mode"),
+
+       OPT_GROUP("Debug options:"),
+       OPT_BOOLEAN('\0', "debug", &do_debug_print,
+                       "Enable debug messages"),
+       OPT_BOOLEAN('\0', "debug-single-step", &single_step,
+                       "Enable single stepping"),
+       OPT_BOOLEAN('\0', "debug-ioport", &ioport_debug,
+                       "Enable ioport debugging"),
+       OPT_BOOLEAN('\0', "debug-mmio", &mmio_debug,
+                       "Enable MMIO debugging"),
+       OPT_INTEGER('\0', "debug-iodelay", &debug_iodelay,
+			"Delay IO by N milliseconds"),
+       OPT_END()
+};
+
+/*
+ * Serialize debug printout so that the output of multiple vcpus does not
+ * get mixed up:
+ */
+static int printout_done;
+
+static void handle_sigusr1(int sig)
+{
+       struct kvm_cpu *cpu = current_kvm_cpu;
+       int fd = kvm_cpu__get_debug_fd();
+
+       if (!cpu || cpu->needs_nmi)
+               return;
+
+       dprintf(fd, "\n #\n # vCPU #%ld's dump:\n #\n", cpu->cpu_id);
+       kvm_cpu__show_registers(cpu);
+       kvm_cpu__show_code(cpu);
+       kvm_cpu__show_page_tables(cpu);
+       fflush(stdout);
+       printout_done = 1;
+       mb();
+}
+
+/* Pause/resume the guest using SIGUSR2 */
+static int is_paused;
+
+static void handle_pause(int fd, u32 type, u32 len, u8 *msg)
+{
+       if (WARN_ON(len))
+               return;
+
+       if (type == KVM_IPC_RESUME && is_paused) {
+               kvm->vm_state = KVM_VMSTATE_RUNNING;
+               kvm__continue();
+       } else if (type == KVM_IPC_PAUSE && !is_paused) {
+               kvm->vm_state = KVM_VMSTATE_PAUSED;
+               ioctl(kvm->vm_fd, KVM_KVMCLOCK_CTRL);
+               kvm__pause();
+       } else {
+               return;
+       }
+
+       is_paused = !is_paused;
+}
+
+static void handle_vmstate(int fd, u32 type, u32 len, u8 *msg)
+{
+       int r = 0;
+
+       if (type == KVM_IPC_VMSTATE)
+               r = write(fd, &kvm->vm_state, sizeof(kvm->vm_state));
+
+       if (r < 0)
+               pr_warning("Failed sending VMSTATE");
+}
+
+static void handle_debug(int fd, u32 type, u32 len, u8 *msg)
+{
+       int i;
+       struct debug_cmd_params *params;
+       u32 dbg_type;
+       u32 vcpu;
+
+       if (WARN_ON(type != KVM_IPC_DEBUG || len != sizeof(*params)))
+               return;
+
+       params = (void *)msg;
+       dbg_type = params->dbg_type;
+       vcpu = params->cpu;
+
+       if (dbg_type & KVM_DEBUG_CMD_TYPE_SYSRQ)
+               serial8250__inject_sysrq(kvm, params->sysrq);
+
+       if (dbg_type & KVM_DEBUG_CMD_TYPE_NMI) {
+               if ((int)vcpu >= kvm->nrcpus)
+                       return;
+
+               kvm_cpus[vcpu]->needs_nmi = 1;
+               pthread_kill(kvm_cpus[vcpu]->thread, SIGUSR1);
+       }
+
+       if (!(dbg_type & KVM_DEBUG_CMD_TYPE_DUMP))
+               return;
+
+       for (i = 0; i < nrcpus; i++) {
+               struct kvm_cpu *cpu = kvm_cpus[i];
+
+               if (!cpu)
+                       continue;
+
+               printout_done = 0;
+
+               kvm_cpu__set_debug_fd(fd);
+               pthread_kill(cpu->thread, SIGUSR1);
+               /*
+                * Wait for the vCPU to dump state before signalling
+                * the next thread. Since this is debug code it does
+                * not matter that we are burning CPU time a bit:
+                */
+               while (!printout_done)
+                       mb();
+       }
+
+       close(fd);
+
+       serial8250__inject_sysrq(kvm, 'p');
+}
+
+static void handle_sigalrm(int sig)
+{
+       kvm__arch_periodic_poll(kvm);
+}
+
+static void handle_stop(int fd, u32 type, u32 len, u8 *msg)
+{
+       if (WARN_ON(type != KVM_IPC_STOP || len))
+               return;
+
+       kvm_cpu__reboot();
+}
+
+static void *kvm_cpu_thread(void *arg)
+{
+       current_kvm_cpu         = arg;
+
+       if (kvm_cpu__start(current_kvm_cpu))
+               goto panic_kvm;
+
+       return (void *) (intptr_t) 0;
+
+panic_kvm:
+       fprintf(stderr, "KVM exit reason: %u (\"%s\")\n",
+               current_kvm_cpu->kvm_run->exit_reason,
+               kvm_exit_reasons[current_kvm_cpu->kvm_run->exit_reason]);
+       if (current_kvm_cpu->kvm_run->exit_reason == KVM_EXIT_UNKNOWN)
+		fprintf(stderr, "KVM exit code: 0x%llx\n",
+                       current_kvm_cpu->kvm_run->hw.hardware_exit_reason);
+
+       kvm_cpu__set_debug_fd(STDOUT_FILENO);
+       kvm_cpu__show_registers(current_kvm_cpu);
+       kvm_cpu__show_code(current_kvm_cpu);
+       kvm_cpu__show_page_tables(current_kvm_cpu);
+
+       return (void *) (intptr_t) 1;
+}
+
+static char kernel[PATH_MAX];
+
+static const char *host_kernels[] = {
+       "/boot/vmlinuz",
+       "/boot/bzImage",
+       NULL
+};
+
+static const char *default_kernels[] = {
+       "./bzImage",
+       "arch/" BUILD_ARCH "/boot/bzImage",
+       "../../arch/" BUILD_ARCH "/boot/bzImage",
+       NULL
+};
+
+static const char *default_vmlinux[] = {
+       "vmlinux",
+       "../../../vmlinux",
+       "../../vmlinux",
+       NULL
+};
+
+static void kernel_usage_with_options(void)
+{
+       const char **k;
+       struct utsname uts;
+
+       fprintf(stderr, "Fatal: could not find default kernel image in:\n");
+       k = &default_kernels[0];
+       while (*k) {
+               fprintf(stderr, "\t%s\n", *k);
+               k++;
+       }
+
+       if (uname(&uts) < 0)
+               return;
+
+       k = &host_kernels[0];
+       while (*k) {
+               if (snprintf(kernel, PATH_MAX, "%s-%s", *k, uts.release) < 0)
+                       return;
+               fprintf(stderr, "\t%s\n", kernel);
+               k++;
+       }
+       fprintf(stderr, "\nPlease see '%s run --help' for more options.\n\n",
+               KVM_BINARY_NAME);
+}
+
+static u64 host_ram_size(void)
+{
+       long page_size;
+       long nr_pages;
+
+       nr_pages        = sysconf(_SC_PHYS_PAGES);
+       if (nr_pages < 0) {
+               pr_warning("sysconf(_SC_PHYS_PAGES) failed");
+               return 0;
+       }
+
+       page_size       = sysconf(_SC_PAGE_SIZE);
+       if (page_size < 0) {
+               pr_warning("sysconf(_SC_PAGE_SIZE) failed");
+               return 0;
+       }
+
+       return (nr_pages * page_size) >> MB_SHIFT;
+}
+
+/*
+ * If the user didn't specify how much memory to allocate for the guest,
+ * avoid filling the whole host RAM.
+ */
+#define RAM_SIZE_RATIO         0.8
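+/*
+ * Example: with 4 vCPUs on an 8 GiB host, the default works out to
+ * min(64 * (4 + 3), 0.8 * 8192) = 448 MB.
+ */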
+
+static u64 get_ram_size(int nr_cpus)
+{
+       u64 available;
+       u64 ram_size;
+
+       ram_size        = 64 * (nr_cpus + 3);
+
+       available       = host_ram_size() * RAM_SIZE_RATIO;
+       if (!available)
+               available = MIN_RAM_SIZE_MB;
+
+       if (ram_size > available)
+               ram_size        = available;
+
+       return ram_size;
+}
+
+static const char *find_kernel(void)
+{
+       const char **k;
+       struct stat st;
+       struct utsname uts;
+
+       k = &default_kernels[0];
+       while (*k) {
+               if (stat(*k, &st) < 0 || !S_ISREG(st.st_mode)) {
+                       k++;
+                       continue;
+               }
+               strncpy(kernel, *k, PATH_MAX);
+               return kernel;
+       }
+
+       if (uname(&uts) < 0)
+               return NULL;
+
+       k = &host_kernels[0];
+       while (*k) {
+               if (snprintf(kernel, PATH_MAX, "%s-%s", *k, uts.release) < 0)
+                       return NULL;
+
+               if (stat(kernel, &st) < 0 || !S_ISREG(st.st_mode)) {
+                       k++;
+                       continue;
+               }
+               return kernel;
+
+       }
+       return NULL;
+}
+
+static const char *find_vmlinux(void)
+{
+       const char **vmlinux;
+
+       vmlinux = &default_vmlinux[0];
+       while (*vmlinux) {
+               struct stat st;
+
+               if (stat(*vmlinux, &st) < 0 || !S_ISREG(st.st_mode)) {
+                       vmlinux++;
+                       continue;
+               }
+               return *vmlinux;
+       }
+       return NULL;
+}
+
+void kvm_run_help(void)
+{
+       usage_with_options(run_usage, options);
+}
+
+static int kvm_setup_guest_init(void)
+{
+       const char *rootfs = custom_rootfs_name;
+       char tmp[PATH_MAX];
+       size_t size;
+       int fd, ret;
+       char *data;
+
+       /* Setup /virt/init */
+       size = (size_t)&_binary_guest_init_size;
+       data = (char *)&_binary_guest_init_start;
+       snprintf(tmp, PATH_MAX, "%s%s/virt/init", kvm__get_dir(), rootfs);
+       remove(tmp);
+       fd = open(tmp, O_CREAT | O_WRONLY, 0755);
+       if (fd < 0)
+		die("Failed to set up %s", tmp);
+       ret = xwrite(fd, data, size);
+       if (ret < 0)
+		die("Failed to set up %s", tmp);
+       close(fd);
+
+       return 0;
+}
+
+static int kvm_run_set_sandbox(void)
+{
+       const char *guestfs_name = custom_rootfs_name;
+       char path[PATH_MAX], script[PATH_MAX], *tmp;
+
+       snprintf(path, PATH_MAX, "%s%s/virt/sandbox.sh", kvm__get_dir(), guestfs_name);
+
+       remove(path);
+
+       if (sandbox == NULL)
+               return 0;
+
+       tmp = realpath(sandbox, NULL);
+       if (tmp == NULL)
+               return -ENOMEM;
+
+       snprintf(script, PATH_MAX, "/host/%s", tmp);
+       free(tmp);
+
+       return symlink(script, path);
+}
+
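+/*
+ * Shell-quote a single argument for the generated sandbox script. For
+ * example, the hypothetical argument  can't  is written out as
+ * 'can'"'"'t'.
+ */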
+static void kvm_write_sandbox_cmd_exactly(int fd, const char *arg)
+{
+       const char *single_quote;
+
+       if (!*arg) { /* zero length string */
+               if (write(fd, "''", 2) <= 0)
+                       die("Failed writing sandbox script");
+               return;
+       }
+
+       while (*arg) {
+               single_quote = strchrnul(arg, '\'');
+
+               /* write non-single-quote string as #('string') */
+               if (arg != single_quote) {
+                       if (write(fd, "'", 1) <= 0 ||
+                           write(fd, arg, single_quote - arg) <= 0 ||
+                           write(fd, "'", 1) <= 0)
+                               die("Failed writing sandbox script");
+               }
+
+               /* write single quote as #("'") */
+               if (*single_quote) {
+                       if (write(fd, "\"'\"", 3) <= 0)
+                               die("Failed writing sandbox script");
+               } else
+                       break;
+
+               arg = single_quote + 1;
+       }
+}
+
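+/*
+ * Resolve a sandbox command to a guest-visible path: a regular host file
+ * such as the hypothetical /bin/ls becomes /host/bin/ls, since the host
+ * root is exported to the guest as the "hostfs" 9p share; anything else
+ * is passed through unchanged.
+ */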
+static void resolve_program(const char *src, char *dst, size_t len)
+{
+       struct stat st;
+       int err;
+
+       err = stat(src, &st);
+
+       if (!err && S_ISREG(st.st_mode)) {
+               char resolved_path[PATH_MAX];
+
+               if (!realpath(src, resolved_path))
+                       die("Unable to resolve program %s: %s\n", src, strerror(errno));
+
+               snprintf(dst, len, "/host%s", resolved_path);
+       } else
+               strncpy(dst, src, len);
+}
+
+static void kvm_run_write_sandbox_cmd(const char **argv, int argc)
+{
+       const char script_hdr[] = "#! /bin/bash\n\n";
+       char program[PATH_MAX];
+       int fd;
+
+       remove(sandbox);
+
+       fd = open(sandbox, O_RDWR | O_CREAT, 0777);
+       if (fd < 0)
+               die("Failed creating sandbox script");
+
+       if (write(fd, script_hdr, sizeof(script_hdr) - 1) <= 0)
+               die("Failed writing sandbox script");
+
+       resolve_program(argv[0], program, PATH_MAX);
+       kvm_write_sandbox_cmd_exactly(fd, program);
+
+       argv++;
+       argc--;
+
+       while (argc) {
+               if (write(fd, " ", 1) <= 0)
+                       die("Failed writing sandbox script");
+
+               kvm_write_sandbox_cmd_exactly(fd, argv[0]);
+               argv++;
+               argc--;
+       }
+       if (write(fd, "\n", 1) <= 0)
+               die("Failed writing sandbox script");
+
+       close(fd);
+}
+
+static int kvm_cmd_run_init(int argc, const char **argv)
+{
+       static char real_cmdline[2048], default_name[20];
+       struct framebuffer *fb = NULL;
+       unsigned int nr_online_cpus;
+       int max_cpus, recommended_cpus;
+       int i, r;
+
+       signal(SIGALRM, handle_sigalrm);
+       kvm_ipc__register_handler(KVM_IPC_DEBUG, handle_debug);
+       signal(SIGUSR1, handle_sigusr1);
+       kvm_ipc__register_handler(KVM_IPC_PAUSE, handle_pause);
+       kvm_ipc__register_handler(KVM_IPC_RESUME, handle_pause);
+       kvm_ipc__register_handler(KVM_IPC_STOP, handle_stop);
+       kvm_ipc__register_handler(KVM_IPC_VMSTATE, handle_vmstate);
+
+       nr_online_cpus = sysconf(_SC_NPROCESSORS_ONLN);
+
+       while (argc != 0) {
+               argc = parse_options(argc, argv, options, run_usage,
+                               PARSE_OPT_STOP_AT_NON_OPTION |
+                               PARSE_OPT_KEEP_DASHDASH);
+               if (argc != 0) {
+			/* Custom options; should have been handled elsewhere */
+                       if (strcmp(argv[0], "--") == 0) {
+                               if (kvm_run_wrapper == KVM_RUN_SANDBOX) {
+                                       sandbox = DEFAULT_SANDBOX_FILENAME;
+                                       kvm_run_write_sandbox_cmd(argv+1, argc-1);
+                                       break;
+                               }
+                       }
+
+                       if ((kvm_run_wrapper == KVM_RUN_DEFAULT && kernel_filename) ||
+                               (kvm_run_wrapper == KVM_RUN_SANDBOX && sandbox)) {
+                               fprintf(stderr, "Cannot handle parameter: "
+                                               "%s\n", argv[0]);
+                               usage_with_options(run_usage, options);
+                               return -EINVAL;
+                       }
+                       if (kvm_run_wrapper == KVM_RUN_SANDBOX) {
+                               /*
+                                * first unhandled parameter is treated as
+                                * sandbox command
+                                */
+                               sandbox = DEFAULT_SANDBOX_FILENAME;
+                               kvm_run_write_sandbox_cmd(argv, argc);
+                       } else {
+                               /*
+                                * first unhandled parameter is treated as a kernel
+                                * image
+                                */
+                               kernel_filename = argv[0];
+                       }
+                       argv++;
+                       argc--;
+               }
+
+       }
+
+       if (!kernel_filename)
+               kernel_filename = find_kernel();
+
+       if (!kernel_filename) {
+               kernel_usage_with_options();
+               return -EINVAL;
+       }
+
+       vmlinux_filename = find_vmlinux();
+
+       if (nrcpus == 0)
+               nrcpus = nr_online_cpus;
+
+       if (!ram_size)
+               ram_size        = get_ram_size(nrcpus);
+
+       if (ram_size < MIN_RAM_SIZE_MB)
+               die("Not enough memory specified: %lluMB (min %lluMB)", ram_size, MIN_RAM_SIZE_MB);
+
+       if (ram_size > host_ram_size())
+               pr_warning("Guest memory size %lluMB exceeds host physical RAM size %lluMB", ram_size, host_ram_size());
+
+       ram_size <<= MB_SHIFT;
+
+       if (!dev)
+               dev = DEFAULT_KVM_DEV;
+
+       if (!console)
+               console = DEFAULT_CONSOLE;
+
+       if (!strncmp(console, "virtio", 6))
+               active_console  = CONSOLE_VIRTIO;
+       else if (!strncmp(console, "serial", 6))
+               active_console  = CONSOLE_8250;
+       else if (!strncmp(console, "hv", 2))
+               active_console = CONSOLE_HV;
+       else
+               pr_warning("No console!");
+
+       if (!host_ip)
+               host_ip = DEFAULT_HOST_ADDR;
+
+       if (!guest_ip)
+               guest_ip = DEFAULT_GUEST_ADDR;
+
+       if (!guest_mac)
+               guest_mac = DEFAULT_GUEST_MAC;
+
+       if (!host_mac)
+               host_mac = DEFAULT_HOST_MAC;
+
+       if (!script)
+               script = DEFAULT_SCRIPT;
+
+       term_init();
+
+       if (!guest_name) {
+               if (custom_rootfs) {
+                       guest_name = custom_rootfs_name;
+               } else {
+                       sprintf(default_name, "guest-%u", getpid());
+                       guest_name = default_name;
+               }
+       }
+
+       kvm = kvm__init(dev, hugetlbfs_path, ram_size, guest_name);
+       if (IS_ERR(kvm)) {
+               r = PTR_ERR(kvm);
+               goto fail;
+       }
+
+       kvm->single_step = single_step;
+
+       r = ioeventfd__init(kvm);
+       if (r < 0) {
+               pr_err("ioeventfd__init() failed with error %d\n", r);
+               goto fail;
+       }
+
+       max_cpus = kvm__max_cpus(kvm);
+       recommended_cpus = kvm__recommended_cpus(kvm);
+
+       if (nrcpus > max_cpus) {
+               printf("  # Limit the number of CPUs to %d\n", max_cpus);
+               nrcpus = max_cpus;
+       } else if (nrcpus > recommended_cpus) {
+               printf("  # Warning: The maximum recommended amount of VCPUs"
+                       " is %d\n", recommended_cpus);
+       }
+
+       kvm->nrcpus = nrcpus;
+
+       /* Alloc one pointer too many, so array ends up 0-terminated */
+       kvm_cpus = calloc(nrcpus + 1, sizeof(void *));
+       if (!kvm_cpus)
+               die("Couldn't allocate array for %d CPUs", nrcpus);
+
+       r = irq__init(kvm);
+       if (r < 0) {
+               pr_err("irq__init() failed with error %d\n", r);
+               goto fail;
+       }
+
+       r = pci__init(kvm);
+       if (r < 0) {
+               pr_err("pci__init() failed with error %d\n", r);
+               goto fail;
+       }
+
+       r = ioport__init(kvm);
+       if (r < 0) {
+               pr_err("ioport__init() failed with error %d\n", r);
+               goto fail;
+       }
+
+       /*
+        * vidmode should be either specified
+	 * or set by default
+        */
+       if (vnc || sdl) {
+               if (vidmode == -1)
+                       vidmode = 0x312;
+       } else {
+               vidmode = 0;
+       }
+
+       memset(real_cmdline, 0, sizeof(real_cmdline));
+       kvm__arch_set_cmdline(real_cmdline, vnc || sdl);
+
+       if (strlen(real_cmdline) > 0)
+               strcat(real_cmdline, " ");
+
+       if (kernel_cmdline)
+               strlcat(real_cmdline, kernel_cmdline, sizeof(real_cmdline));
+
+       if (!using_rootfs && !disk_image[0].filename && !initrd_filename) {
+               char tmp[PATH_MAX];
+
+               kvm_setup_create_new(custom_rootfs_name);
+               kvm_setup_resolv(custom_rootfs_name);
+
+               snprintf(tmp, PATH_MAX, "%s%s", kvm__get_dir(), "default");
+               if (virtio_9p__register(kvm, tmp, "/dev/root") < 0)
+                       die("Unable to initialize virtio 9p");
+               if (virtio_9p__register(kvm, "/", "hostfs") < 0)
+                       die("Unable to initialize virtio 9p");
+               using_rootfs = custom_rootfs = 1;
+       }
+
+       if (using_rootfs) {
+               strcat(real_cmdline, " root=/dev/root rw rootflags=rw,trans=virtio,version=9p2000.L rootfstype=9p");
+               if (custom_rootfs) {
+                       kvm_run_set_sandbox();
+
+                       strcat(real_cmdline, " init=/virt/init");
+
+                       if (!no_dhcp)
+                               strcat(real_cmdline, "  ip=dhcp");
+                       if (kvm_setup_guest_init())
+                               die("Failed to setup init for guest.");
+               }
+       } else if (!strstr(real_cmdline, "root=")) {
+               strlcat(real_cmdline, " root=/dev/vda rw ", sizeof(real_cmdline));
+       }
+
+       if (image_count) {
+               kvm->nr_disks = image_count;
+               kvm->disks = disk_image__open_all((struct disk_image_params *)&disk_image, image_count);
+               if (IS_ERR(kvm->disks)) {
+                       r = PTR_ERR(kvm->disks);
+                       pr_err("disk_image__open_all() failed with error %d\n", r);
+                       goto fail;
+               }
+       }
+
+       printf("  # %s run -k %s -m %Lu -c %d --name %s\n", KVM_BINARY_NAME,
+               kernel_filename, ram_size / 1024 / 1024, nrcpus, guest_name);
+
+       if (!firmware_filename) {
+               if (!kvm__load_kernel(kvm, kernel_filename,
+                               initrd_filename, real_cmdline, vidmode))
+                       die("unable to load kernel %s", kernel_filename);
+
+               kvm->vmlinux = vmlinux_filename;
+               r = symbol_init(kvm);
+               if (r < 0)
+                       pr_debug("symbol_init() failed with error %d\n", r);
+       }
+
+       ioport__setup_arch();
+
+       r = rtc__init(kvm);
+       if (r < 0) {
+               pr_err("rtc__init() failed with error %d\n", r);
+               goto fail;
+       }
+
+       r = serial8250__init(kvm);
+       if (r < 0) {
+               pr_err("serial__init() failed with error %d\n", r);
+               goto fail;
+       }
+
+       r = virtio_blk__init(kvm);
+       if (r < 0) {
+               pr_err("virtio_blk__init() failed with error %d\n", r);
+               goto fail;
+       }
+
+       r = virtio_scsi_init(kvm);
+       if (r < 0) {
+               pr_err("virtio_scsi_init() failed with error %d\n", r);
+               goto fail;
+       }
+
+       if (active_console == CONSOLE_VIRTIO)
+               virtio_console__init(kvm);
+
+       if (virtio_rng)
+               virtio_rng__init(kvm);
+
+       if (balloon)
+               virtio_bln__init(kvm);
+
+       if (!network)
+               network = DEFAULT_NETWORK;
+
+       virtio_9p__init(kvm);
+
+       for (i = 0; i < num_net_devices; i++) {
+               net_params[i].kvm = kvm;
+               virtio_net__init(&net_params[i]);
+       }
+
+       if (num_net_devices == 0 && no_net == 0) {
+               struct virtio_net_params net_params;
+
+               net_params = (struct virtio_net_params) {
+                       .guest_ip       = guest_ip,
+                       .host_ip        = host_ip,
+                       .kvm            = kvm,
+                       .script         = script,
+                       .mode           = NET_MODE_USER,
+               };
+               str_to_mac(guest_mac, net_params.guest_mac);
+               str_to_mac(host_mac, net_params.host_mac);
+
+               virtio_net__init(&net_params);
+       }
+
+       kvm__init_ram(kvm);
+
+#ifdef CONFIG_X86
+       kbd__init(kvm);
+#endif
+
+       pci_shmem__init(kvm);
+
+       if (vnc || sdl) {
+               fb = vesa__init(kvm);
+               if (IS_ERR(fb)) {
+                       pr_err("vesa__init() failed with error %ld\n", PTR_ERR(fb));
+                       goto fail;
+               }
+       }
+
+       if (vnc && fb) {
+               r = vnc__init(fb);
+               if (r < 0) {
+                       pr_err("vnc__init() failed with error %d\n", r);
+                       goto fail;
+               }
+       }
+
+       if (sdl && fb) {
+               r = sdl__init(fb);
+               if (r < 0) {
+                       pr_err("sdl__init() failed with error %d\n", r);
+                       goto fail;
+               }
+       }
+
+       r = fb__start();
+       if (r < 0) {
+               pr_err("fb__init() failed with error %d\n", r);
+               goto fail;
+       }
+
+       /*
+        * Device init all done; firmware init must come after this
+        * (it may set up device trees etc.)
+        */
+
+       kvm__start_timer(kvm);
+
+       if (firmware_filename) {
+               if (!kvm__load_firmware(kvm, firmware_filename))
+                       die("unable to load firmware image %s: %s", firmware_filename, strerror(errno));
+       } else {
+               r = kvm__arch_setup_firmware(kvm);
+               if (r < 0) {
+                       pr_err("kvm__arch_setup_firmware() failed with error %d\n", r);
+                       goto fail;
+               }
+       }
+
+       for (i = 0; i < nrcpus; i++) {
+               kvm_cpus[i] = kvm_cpu__init(kvm, i);
+               if (!kvm_cpus[i])
+                       die("unable to initialize KVM VCPU");
+       }
+
+       thread_pool__init(nr_online_cpus);
+fail:
+       return r;
+}
+
+static int kvm_cmd_run_work(void)
+{
+       int i, r = -1;
+       void *ret = NULL;
+
+       for (i = 0; i < nrcpus; i++) {
+               if (pthread_create(&kvm_cpus[i]->thread, NULL, kvm_cpu_thread, kvm_cpus[i]) != 0)
+                       die("unable to create KVM VCPU thread");
+       }
+
+       /* Only VCPU #0 is going to exit by itself when shutting down */
+       if (pthread_join(kvm_cpus[0]->thread, &ret) != 0)
+               r = 0;
+
+       kvm_cpu__delete(kvm_cpus[0]);
+       kvm_cpus[0] = NULL;
+
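+       /*
+        * The remaining VCPUs do not exit by themselves; nudge each one
+        * with SIGKVMEXIT and reap its thread.
+        */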
+       for (i = 1; i < nrcpus; i++) {
+               if (kvm_cpus[i]->is_running) {
+                       pthread_kill(kvm_cpus[i]->thread, SIGKVMEXIT);
+                       if (pthread_join(kvm_cpus[i]->thread, &ret) != 0)
+                               die("pthread_join");
+                       kvm_cpu__delete(kvm_cpus[i]);
+               }
+               if (ret == NULL)
+                       r = 0;
+       }
+
+       return r;
+}
+
+static void kvm_cmd_run_exit(int guest_ret)
+{
+       int r = 0;
+
+       compat__print_all_messages();
+
+       r = symbol_exit(kvm);
+       if (r < 0)
+               pr_warning("symbol_exit() failed with error %d\n", r);
+
+       r = irq__exit(kvm);
+       if (r < 0)
+               pr_warning("irq__exit() failed with error %d\n", r);
+
+       fb__stop();
+
+       r = virtio_scsi_exit(kvm);
+       if (r < 0)
+               pr_warning("virtio_scsi_exit() failed with error %d\n", r);
+
+       r = virtio_blk__exit(kvm);
+       if (r < 0)
+               pr_warning("virtio_blk__exit() failed with error %d\n", r);
+
+       r = virtio_rng__exit(kvm);
+       if (r < 0)
+               pr_warning("virtio_rng__exit() failed with error %d\n", r);
+
+       r = disk_image__close_all(kvm->disks, image_count);
+       if (r < 0)
+               pr_warning("disk_image__close_all() failed with error %d\n", r);
+
+       r = serial8250__exit(kvm);
+       if (r < 0)
+               pr_warning("serial8250__exit() failed with error %d\n", r);
+
+       r = rtc__exit(kvm);
+       if (r < 0)
+               pr_warning("rtc__exit() failed with error %d\n", r);
+
+       r = kvm__arch_free_firmware(kvm);
+       if (r < 0)
+               pr_warning("kvm__arch_free_firmware() failed with error %d\n", r);
+
+       r = ioport__exit(kvm);
+       if (r < 0)
+               pr_warning("ioport__exit() failed with error %d\n", r);
+
+       r = ioeventfd__exit(kvm);
+       if (r < 0)
+               pr_warning("ioeventfd__exit() failed with error %d\n", r);
+
+       r = pci__exit(kvm);
+       if (r < 0)
+               pr_warning("pci__exit() failed with error %d\n", r);
+
+       r = kvm__exit(kvm);
+       if (r < 0)
+               pr_warning("pci__exit() failed with error %d\n", r);
+
+       free(kvm_cpus);
+
+       if (guest_ret == 0)
+               printf("\n  # KVM session ended normally.\n");
+}
+
+int kvm_cmd_run(int argc, const char **argv, const char *prefix)
+{
+       int r, ret = -EFAULT;
+
+       r = kvm_cmd_run_init(argc, argv);
+       if (r < 0)
+               return r;
+
+       ret = kvm_cmd_run_work();
+       kvm_cmd_run_exit(ret);
+
+       return ret;
+}
diff --git a/tools/kvm/builtin-sandbox.c b/tools/kvm/builtin-sandbox.c
new file mode 100644 (file)
index 0000000..433f536
--- /dev/null
@@ -0,0 +1,9 @@
+#include "kvm/builtin-sandbox.h"
+#include "kvm/builtin-run.h"
+
+int kvm_cmd_sandbox(int argc, const char **argv, const char *prefix)
+{
+       kvm_run_set_wrapper_sandbox();
+
+       return kvm_cmd_run(argc, argv, prefix);
+}
diff --git a/tools/kvm/builtin-setup.c b/tools/kvm/builtin-setup.c
new file mode 100644 (file)
index 0000000..1b865b7
--- /dev/null
@@ -0,0 +1,262 @@
+#include <kvm/util.h>
+#include <kvm/kvm-cmd.h>
+#include <kvm/builtin-setup.h>
+#include <kvm/kvm.h>
+#include <kvm/parse-options.h>
+#include <kvm/read-write.h>
+
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <limits.h>
+#include <signal.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+#include <stdio.h>
+#include <sys/mman.h>
+#include <fcntl.h>
+
+extern char _binary_guest_init_start;
+extern char _binary_guest_init_size;
+
+static const char *instance_name;
+
+static const char * const setup_usage[] = {
+       "lkvm setup [name]",
+       NULL
+};
+
+static const struct option setup_options[] = {
+       OPT_END()
+};
+
+static void parse_setup_options(int argc, const char **argv)
+{
+       while (argc != 0) {
+               argc = parse_options(argc, argv, setup_options, setup_usage,
+                               PARSE_OPT_STOP_AT_NON_OPTION);
+               if (argc != 0 && instance_name)
+                       kvm_setup_help();
+               else
+                       instance_name = argv[0];
+               argv++;
+               argc--;
+       }
+}
+
+void kvm_setup_help(void)
+{
+       printf("\n%s setup creates a new rootfs under %s.\n"
+               "This can be used later by the '-d' parameter of '%s run'.\n",
+               KVM_BINARY_NAME, kvm__get_dir(), KVM_BINARY_NAME);
+       usage_with_options(setup_usage, setup_options);
+}
+
+static int copy_file(const char *from, const char *to)
+{
+       int in_fd, out_fd;
+       void *src, *dst;
+       struct stat st;
+       int err = -1;
+
+       in_fd = open(from, O_RDONLY);
+       if (in_fd < 0)
+               return err;
+
+       if (fstat(in_fd, &st) < 0)
+               goto error_close_in;
+
+       out_fd = open(to, O_RDWR | O_CREAT | O_TRUNC, st.st_mode & (S_IRWXU|S_IRWXG|S_IRWXO));
+       if (out_fd < 0)
+               goto error_close_in;
+
+       src = mmap(NULL, st.st_size, PROT_READ, MAP_SHARED, in_fd, 0);
+       if (src == MAP_FAILED)
+               goto error_close_out;
+
+       if (ftruncate(out_fd, st.st_size) < 0)
+               goto error_munmap_src;
+
+       dst = mmap(NULL, st.st_size, PROT_READ|PROT_WRITE, MAP_SHARED, out_fd, 0);
+       if (dst == MAP_FAILED)
+               goto error_munmap_src;
+
+       memcpy(dst, src, st.st_size);
+
+       if (fsync(out_fd) < 0)
+               goto error_munmap_dst;
+
+       err = 0;
+
+error_munmap_dst:
+       munmap(dst, st.st_size);
+error_munmap_src:
+       munmap(src, st.st_size);
+error_close_out:
+       close(out_fd);
+error_close_in:
+       close(in_fd);
+
+       return err;
+}
+
+static const char *guestfs_dirs[] = {
+       "/dev",
+       "/etc",
+       "/home",
+       "/host",
+       "/proc",
+       "/root",
+       "/sys",
+       "/tmp",
+       "/var",
+       "/var/lib",
+       "/virt",
+       "/virt/home",
+};
+
+static const char *guestfs_symlinks[] = {
+       "/bin",
+       "/lib",
+       "/lib64",
+       "/sbin",
+       "/usr",
+       "/etc/ld.so.conf",
+};
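+
+/*
+ * Each path above is created as a symlink into /host, where the guest
+ * is expected to mount the host's root filesystem exported over
+ * virtio-9p (see make_guestfs_symlink() below).
+ */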
+
+static int copy_init(const char *guestfs_name)
+{
+       char path[PATH_MAX];
+       size_t size;
+       int fd, ret;
+       char *data;
+
+       size = (size_t)&_binary_guest_init_size;
+       data = (char *)&_binary_guest_init_start;
+       snprintf(path, PATH_MAX, "%s%s/virt/init", kvm__get_dir(), guestfs_name);
+       remove(path);
+       fd = open(path, O_CREAT | O_WRONLY, 0755);
+       if (fd < 0)
+               die("Fail to setup %s", path);
+       ret = xwrite(fd, data, size);
+       if (ret < 0)
+               die("Fail to setup %s", path);
+       close(fd);
+
+       return 0;
+}
+
+static int copy_passwd(const char *guestfs_name)
+{
+       char path[PATH_MAX];
+       FILE *file;
+       int ret;
+
+       snprintf(path, PATH_MAX, "%s%s/etc/passwd", kvm__get_dir(), guestfs_name);
+
+       file = fopen(path, "w");
+       if (!file)
+               return -1;
+
+       ret = fprintf(file, "root:x:0:0:root:/root:/bin/sh\n");
+       if (ret < 0) {
+               fclose(file);
+               return ret;
+       }
+
+       fclose(file);
+
+       return 0;
+}
+
+static int make_guestfs_symlink(const char *guestfs_name, const char *path)
+{
+       char target[PATH_MAX];
+       char name[PATH_MAX];
+
+       snprintf(name, PATH_MAX, "%s%s%s", kvm__get_dir(), guestfs_name, path);
+
+       snprintf(target, PATH_MAX, "/host%s", path);
+
+       return symlink(target, name);
+}
+
+static int make_dir(const char *dir)
+{
+       char name[PATH_MAX];
+
+       snprintf(name, PATH_MAX, "%s%s", kvm__get_dir(), dir);
+
+       return mkdir(name, 0777);
+}
+
+static void make_guestfs_dir(const char *guestfs_name, const char *dir)
+{
+       char name[PATH_MAX];
+
+       snprintf(name, PATH_MAX, "%s%s", guestfs_name, dir);
+
+       make_dir(name);
+}
+
+void kvm_setup_resolv(const char *guestfs_name)
+{
+       char path[PATH_MAX];
+
+       snprintf(path, PATH_MAX, "%s%s/etc/resolv.conf", kvm__get_dir(), guestfs_name);
+
+       copy_file("/etc/resolv.conf", path);
+}
+
+static int do_setup(const char *guestfs_name)
+{
+       unsigned int i;
+       int ret;
+
+       ret = make_dir(guestfs_name);
+       if (ret < 0)
+               return ret;
+
+       for (i = 0; i < ARRAY_SIZE(guestfs_dirs); i++)
+               make_guestfs_dir(guestfs_name, guestfs_dirs[i]);
+
+       for (i = 0; i < ARRAY_SIZE(guestfs_symlinks); i++) {
+               make_guestfs_symlink(guestfs_name, guestfs_symlinks[i]);
+       }
+
+       ret = copy_init(guestfs_name);
+       if (ret < 0)
+               return ret;
+
+       return copy_passwd(guestfs_name);
+}
+
+int kvm_setup_create_new(const char *guestfs_name)
+{
+       return do_setup(guestfs_name);
+}
+
+int kvm_cmd_setup(int argc, const char **argv, const char *prefix)
+{
+       int r;
+
+       parse_setup_options(argc, argv);
+
+       if (instance_name == NULL)
+               kvm_setup_help();
+
+       r = do_setup(instance_name);
+       if (r == 0)
+               printf("A new rootfs '%s' has been created in '%s%s'.\n\n"
+                       "You can now start it by running the following command:\n\n"
+                       "  %s run -d %s\n",
+                       instance_name, kvm__get_dir(), instance_name,
+                       KVM_BINARY_NAME, instance_name);
+       else
+               printf("Unable to create rootfs in %s%s: %s\n",
+                       kvm__get_dir(), instance_name, strerror(errno));
+
+       return r;
+}
diff --git a/tools/kvm/builtin-stat.c b/tools/kvm/builtin-stat.c
new file mode 100644 (file)
index 0000000..ffd72e8
--- /dev/null
@@ -0,0 +1,127 @@
+#include <kvm/util.h>
+#include <kvm/kvm-cmd.h>
+#include <kvm/builtin-stat.h>
+#include <kvm/kvm.h>
+#include <kvm/parse-options.h>
+#include <kvm/kvm-ipc.h>
+
+#include <sys/select.h>
+#include <stdio.h>
+#include <string.h>
+#include <signal.h>
+
+#include <linux/virtio_balloon.h>
+
+static bool mem;
+static bool all;
+static const char *instance_name;
+
+static const char * const stat_usage[] = {
+       "lkvm stat [command] [--all] [-n name]",
+       NULL
+};
+
+static const struct option stat_options[] = {
+       OPT_GROUP("Commands options:"),
+       OPT_BOOLEAN('m', "memory", &mem, "Display memory statistics"),
+       OPT_GROUP("Instance options:"),
+       OPT_BOOLEAN('a', "all", &all, "All instances"),
+       OPT_STRING('n', "name", &instance_name, "name", "Instance name"),
+       OPT_END()
+};
+
+static void parse_stat_options(int argc, const char **argv)
+{
+       while (argc != 0) {
+               argc = parse_options(argc, argv, stat_options, stat_usage,
+                               PARSE_OPT_STOP_AT_NON_OPTION);
+               if (argc != 0)
+                       kvm_stat_help();
+       }
+}
+
+void kvm_stat_help(void)
+{
+       usage_with_options(stat_usage, stat_options);
+}
+
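+/*
+ * Memory statistics travel over the instance's IPC socket: we send
+ * KVM_IPC_STAT, give the guest's balloon driver up to one second to
+ * reply, then read back an array of virtio_balloon_stat entries.
+ */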
+static int do_memstat(const char *name, int sock)
+{
+       struct virtio_balloon_stat stats[VIRTIO_BALLOON_S_NR];
+       fd_set fdset;
+       struct timeval t = { .tv_sec = 1 };
+       int r;
+       u8 i;
+
+       FD_ZERO(&fdset);
+       FD_SET(sock, &fdset);
+       r = kvm_ipc__send(sock, KVM_IPC_STAT);
+       if (r < 0)
+               return r;
+
+       r = select(sock + 1, &fdset, NULL, NULL, &t);
+       if (r < 0) {
+               pr_err("Could not retrieve mem stats from %s", name);
+               return r;
+       }
+       r = read(sock, &stats, sizeof(stats));
+       if (r < 0)
+               return r;
+
+       printf("\n\n\t*** Guest memory statistics ***\n\n");
+       for (i = 0; i < VIRTIO_BALLOON_S_NR; i++) {
+               switch (stats[i].tag) {
+               case VIRTIO_BALLOON_S_SWAP_IN:
+                       printf("The amount of memory that has been swapped in (in bytes):");
+                       break;
+               case VIRTIO_BALLOON_S_SWAP_OUT:
+                       printf("The amount of memory that has been swapped out to disk (in bytes):");
+                       break;
+               case VIRTIO_BALLOON_S_MAJFLT:
+                       printf("The number of major page faults that have occurred:");
+                       break;
+               case VIRTIO_BALLOON_S_MINFLT:
+                       printf("The number of minor page faults that have occurred:");
+                       break;
+               case VIRTIO_BALLOON_S_MEMFREE:
+                       printf("The amount of memory not being used for any purpose (in bytes):");
+                       break;
+               case VIRTIO_BALLOON_S_MEMTOT:
+                       printf("The total amount of memory available (in bytes):");
+                       break;
+               }
+               printf("%llu\n", stats[i].val);
+       }
+       printf("\n");
+
+       return 0;
+}
+
+int kvm_cmd_stat(int argc, const char **argv, const char *prefix)
+{
+       int instance;
+       int r = 0;
+
+       parse_stat_options(argc, argv);
+
+       if (!mem)
+               usage_with_options(stat_usage, stat_options);
+
+       if (mem && all)
+               return kvm__enumerate_instances(do_memstat);
+
+       if (instance_name == NULL)
+               kvm_stat_help();
+
+       instance = kvm__get_sock_by_instance(instance_name);
+
+       if (instance <= 0)
+               die("Failed locating instance");
+
+       if (mem)
+               r = do_memstat(instance_name, instance);
+
+       close(instance);
+
+       return r;
+}
diff --git a/tools/kvm/builtin-stop.c b/tools/kvm/builtin-stop.c
new file mode 100644 (file)
index 0000000..6067630
--- /dev/null
@@ -0,0 +1,70 @@
+#include <kvm/util.h>
+#include <kvm/kvm-cmd.h>
+#include <kvm/builtin-stop.h>
+#include <kvm/kvm.h>
+#include <kvm/parse-options.h>
+#include <kvm/kvm-ipc.h>
+
+#include <stdio.h>
+#include <string.h>
+#include <signal.h>
+
+static bool all;
+static const char *instance_name;
+
+static const char * const stop_usage[] = {
+       "lkvm stop [--all] [-n name]",
+       NULL
+};
+
+static const struct option stop_options[] = {
+       OPT_GROUP("General options:"),
+       OPT_BOOLEAN('a', "all", &all, "Stop all instances"),
+       OPT_STRING('n', "name", &instance_name, "name", "Instance name"),
+       OPT_END()
+};
+
+static void parse_stop_options(int argc, const char **argv)
+{
+       while (argc != 0) {
+               argc = parse_options(argc, argv, stop_options, stop_usage,
+                               PARSE_OPT_STOP_AT_NON_OPTION);
+               if (argc != 0)
+                       kvm_stop_help();
+       }
+}
+
+void kvm_stop_help(void)
+{
+       usage_with_options(stop_usage, stop_options);
+}
+
+static int do_stop(const char *name, int sock)
+{
+       return kvm_ipc__send(sock, KVM_IPC_STOP);
+}
+
+int kvm_cmd_stop(int argc, const char **argv, const char *prefix)
+{
+       int instance;
+       int r;
+
+       parse_stop_options(argc, argv);
+
+       if (all)
+               return kvm__enumerate_instances(do_stop);
+
+       if (instance_name == NULL)
+               kvm_stop_help();
+
+       instance = kvm__get_sock_by_instance(instance_name);
+
+       if (instance <= 0)
+               die("Failed locating instance");
+
+       r = do_stop(instance_name, instance);
+
+       close(instance);
+
+       return r;
+}
diff --git a/tools/kvm/builtin-version.c b/tools/kvm/builtin-version.c
new file mode 100644 (file)
index 0000000..b8bb859
--- /dev/null
@@ -0,0 +1,15 @@
+#include <kvm/util.h>
+#include <kvm/kvm-cmd.h>
+#include <kvm/builtin-version.h>
+#include <kvm/kvm.h>
+
+#include <stdio.h>
+#include <string.h>
+#include <signal.h>
+
+int kvm_cmd_version(int argc, const char **argv, const char *prefix)
+{
+       printf("kvm tool %s\n", KVMTOOLS_VERSION);
+
+       return 0;
+}
diff --git a/tools/kvm/code16gcc.h b/tools/kvm/code16gcc.h
new file mode 100644 (file)
index 0000000..d93e480
--- /dev/null
@@ -0,0 +1,15 @@
+/*
+ * code16gcc.h
+ *
+ * This file is -include'd when compiling 16-bit C code.
+ * Note: this asm() needs to be emitted before gcc emits any code.
+ * Depending on gcc version, this requires -fno-unit-at-a-time or
+ * -fno-toplevel-reorder.
+ *
+ * Hopefully gcc will eventually have a real -m16 option so we can
+ * drop this hack long term.
+ */
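+
+/*
+ * A hypothetical invocation (exact flags depend on the gcc version):
+ *
+ *      gcc -m32 -fno-toplevel-reorder -include code16gcc.h -c bios.c
+ */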
+
+#ifndef __ASSEMBLY__
+asm(".code16gcc");
+#endif
diff --git a/tools/kvm/command-list.txt b/tools/kvm/command-list.txt
new file mode 100644 (file)
index 0000000..d93597d
--- /dev/null
@@ -0,0 +1,15 @@
+#
+# List of known lkvm commands.
+# command name                 category [deprecated] [common]
+#
+lkvm-run                       mainporcelain common
+lkvm-setup                     mainporcelain common
+lkvm-pause                     common
+lkvm-resume                    common
+lkvm-version                   common
+lkvm-list                      common
+lkvm-debug                     common
+lkvm-balloon                   common
+lkvm-stop                      common
+lkvm-stat                      common
+lkvm-sandbox                   common
diff --git a/tools/kvm/config/feature-tests.mak b/tools/kvm/config/feature-tests.mak
new file mode 100644 (file)
index 0000000..4a81f56
--- /dev/null
@@ -0,0 +1,177 @@
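+#
+# Each SOURCE_* block below is a minimal C program used purely as a
+# build probe: the Makefile is expected to feed it to the try-cc and
+# try-build helpers in config/utilities.mak to test for optional
+# headers and libraries.
+#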
+define SOURCE_HELLO
+#include <stdio.h>
+int main(void)
+{
+       return puts(\"hi\");
+}
+endef
+
+ifndef NO_DWARF
+define SOURCE_DWARF
+#include <dwarf.h>
+#include <elfutils/libdw.h>
+#include <elfutils/version.h>
+#ifndef _ELFUTILS_PREREQ
+#error
+#endif
+
+int main(void)
+{
+       Dwarf *dbg = dwarf_begin(0, DWARF_C_READ);
+       return (long)dbg;
+}
+endef
+endif
+
+define SOURCE_LIBELF
+#include <libelf.h>
+
+int main(void)
+{
+       Elf *elf = elf_begin(0, ELF_C_READ, 0);
+       return (long)elf;
+}
+endef
+
+define SOURCE_GLIBC
+#include <gnu/libc-version.h>
+
+int main(void)
+{
+       const char *version = gnu_get_libc_version();
+       return (long)version;
+}
+endef
+
+define SOURCE_ELF_MMAP
+#include <libelf.h>
+int main(void)
+{
+       Elf *elf = elf_begin(0, ELF_C_READ_MMAP, 0);
+       return (long)elf;
+}
+endef
+
+ifndef NO_NEWT
+define SOURCE_NEWT
+#include <newt.h>
+
+int main(void)
+{
+       newtInit();
+       newtCls();
+       return newtFinished();
+}
+endef
+endif
+
+ifndef NO_LIBPERL
+define SOURCE_PERL_EMBED
+#include <EXTERN.h>
+#include <perl.h>
+
+int main(void)
+{
+       perl_alloc();
+       return 0;
+}
+endef
+endif
+
+ifndef NO_LIBPYTHON
+define SOURCE_PYTHON_VERSION
+#include <Python.h>
+#if PY_VERSION_HEX >= 0x03000000
+       #error
+#endif
+int main(void){}
+endef
+define SOURCE_PYTHON_EMBED
+#include <Python.h>
+int main(void)
+{
+       Py_Initialize();
+       return 0;
+}
+endef
+endif
+
+define SOURCE_BFD
+#include <bfd.h>
+
+int main(void)
+{
+       bfd_demangle(0, 0, 0);
+       return 0;
+}
+endef
+
+define SOURCE_CPLUS_DEMANGLE
+extern char *cplus_demangle(const char *, int);
+
+int main(void)
+{
+       cplus_demangle(0, 0);
+       return 0;
+}
+endef
+
+define SOURCE_STRLCPY
+#include <stdlib.h>
+extern size_t strlcpy(char *dest, const char *src, size_t size);
+
+int main(void)
+{
+       strlcpy(NULL, NULL, 0);
+       return 0;
+}
+endef
+
+define SOURCE_VNCSERVER
+#include <rfb/rfb.h>
+
+int main(void)
+{
+       rfbIsActive((void *)0);
+       return 0;
+}
+endef
+
+define SOURCE_SDL
+#include <SDL/SDL.h>
+
+int main(void)
+{
+       SDL_Init(SDL_INIT_VIDEO);
+       return 0;
+}
+endef
+
+define SOURCE_ZLIB
+#include <zlib.h>
+
+int main(void)
+{
+       inflateInit2(NULL, 0);
+       return 0;
+}
+endef
+
+define SOURCE_AIO
+#include <libaio.h>
+
+int main(void)
+{
+       io_setup(0, NULL);
+       return 0;
+}
+endef
+
+define SOURCE_STATIC
+#include <stdlib.h>
+
+int main(void)
+{
+       return 0;
+}
+endef
diff --git a/tools/kvm/config/utilities.mak b/tools/kvm/config/utilities.mak
new file mode 100644 (file)
index 0000000..a70963b
--- /dev/null
@@ -0,0 +1,196 @@
+# This allows us to work with the newline character:
+define newline
+
+
+endef
+newline := $(newline)
+
+# nl-escape
+#
+# Usage: escape = $(call nl-escape[,escape])
+#
+# This is used as the common way to specify
+# what should replace a newline when escaping
+# newlines; the default is a bizarre string.
+#
+nl-escape = $(or $(1),m822df3020w6a44id34bt574ctac44eb9f4n)
+
+# escape-nl
+#
+# Usage: escaped-text = $(call escape-nl,text[,escape])
+#
+# GNU make's $(shell ...) function converts to a
+# single space each newline character in the output
+# produced during the expansion; this may not be
+# desirable.
+#
+# The only solution is to change each newline into
+# something that won't be converted, so that the
+# information can be recovered later with
+# $(call unescape-nl...)
+#
+escape-nl = $(subst $(newline),$(call nl-escape,$(2)),$(1))
+
+# unescape-nl
+#
+# Usage: text = $(call unescape-nl,escaped-text[,escape])
+#
+# See escape-nl.
+#
+unescape-nl = $(subst $(call nl-escape,$(2)),$(newline),$(1))
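+#
+# For instance (hypothetical variable names):
+#
+#   flat     := $(call escape-nl,$(multi_line_text))
+#   restored := $(call unescape-nl,$(flat))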
+
+# shell-escape-nl
+#
+# Usage: $(shell some-command | $(call shell-escape-nl[,escape]))
+#
+# Use this to escape newlines from within a shell call;
+# the default escape is a bizarre string.
+#
+# NOTE: The escape is used directly as a string constant
+#       in an `awk' program that is delimited by shell
+#       single-quotes, so be wary of the characters
+#       that are chosen.
+#
+define shell-escape-nl
+awk 'NR==1 {t=$$0} NR>1 {t=t "$(nl-escape)" $$0} END {printf t}'
+endef
+
+# shell-unescape-nl
+#
+# Usage: $(shell some-command | $(call shell-unescape-nl[,escape]))
+#
+# Use this to unescape newlines from within a shell call;
+# the default escape is a bizarre string.
+#
+# NOTE: The escape is used directly as an extended regular
+#       expression constant in an `awk' program that is
+#       delimited by shell single-quotes, so be wary
+#       of the characters that are chosen.
+#
+# (The bash shell has a bug where `{gsub(...),...}' is
+#  misinterpreted as a brace expansion; this can be
+#  overcome by putting a space between `{' and `gsub').
+#
+define shell-unescape-nl
+awk 'NR==1 {t=$$0} NR>1 {t=t "\n" $$0} END { gsub(/$(nl-escape)/,"\n",t); printf t }'
+endef
+
+# escape-for-shell-sq
+#
+# Usage: embeddable-text = $(call escape-for-shell-sq,text)
+#
+# This function produces text that is suitable for
+# embedding in a shell string that is delimited by
+# single-quotes.
+#
+escape-for-shell-sq =  $(subst ','\'',$(1))
+
+# shell-sq
+#
+# Usage: single-quoted-and-escaped-text = $(call shell-sq,text)
+#
+shell-sq = '$(escape-for-shell-sq)'
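+#
+# For instance, $(call shell-sq,don't) expands to 'don'\''t', which the
+# shell reads back as the original text.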
+
+# shell-wordify
+#
+# Usage: wordified-text = $(call shell-wordify,text)
+#
+# For instance:
+#
+#  |define text
+#  |hello
+#  |world
+#  |endef
+#  |
+#  |target:
+#  |   echo $(call shell-wordify,$(text))
+#
+# At least GNU make gets confused by expanding a newline
+# within the context of a command line of a makefile rule
+# (this is in contrast to a `$(shell ...)' function call,
+# which can handle it just fine).
+#
+# This function avoids the problem by producing a string
+# that works as a shell word, regardless of whether or
+# not it contains a newline.
+#
+# If the text to be wordified contains a newline, then
+# an intricate shell command substitution is constructed
+# to render the text as a single line; when the shell
+# processes the resulting escaped text, it transforms
+# it into the original unescaped text.
+#
+# If the text does not contain a newline, then this function
+# produces the same results as the `$(shell-sq)' function.
+#
+shell-wordify = $(if $(findstring $(newline),$(1)),$(_sw-esc-nl),$(shell-sq))
+define _sw-esc-nl
+"$$(echo $(call escape-nl,$(shell-sq),$(2)) | $(call shell-unescape-nl,$(2)))"
+endef
+
+# is-absolute
+#
+# Usage: bool-value = $(call is-absolute,path)
+#
+is-absolute = $(shell echo $(shell-sq) | grep ^/ -q && echo y)
+
+# lookup
+#
+# Usage: absolute-executable-path-or-empty = $(call lookup,path)
+#
+# (It's necessary to use `sh -c' because GNU make messes up by
+#  trying too hard and getting things wrong).
+#
+lookup = $(call unescape-nl,$(shell sh -c $(_l-sh)))
+_l-sh = $(call shell-sq,command -v $(shell-sq) | $(call shell-escape-nl,))
+
+# is-executable
+#
+# Usage: bool-value = $(call is-executable,path)
+#
+# (It's necessary to use `sh -c' because GNU make messes up by
+#  trying too hard and getting things wrong).
+#
+is-executable = $(call _is-executable-helper,$(shell-sq))
+_is-executable-helper = $(shell sh -c $(_is-executable-sh))
+_is-executable-sh = $(call shell-sq,test -f $(1) -a -x $(1) && echo y)
+
+# get-executable
+#
+# Usage: absolute-executable-path-or-empty = $(call get-executable,path)
+#
+# The goal is to get an absolute path for an executable;
+# the `command -v' is defined by POSIX, but it's not
+# necessarily very portable, so it's only used if
+# relative path resolution is requested, as determined
+# by the presence of a leading `/'.
+#
+get-executable = $(if $(1),$(if $(is-absolute),$(_ge-abspath),$(lookup)))
+_ge-abspath = $(if $(is-executable),$(1))
+
+# get-supplied-or-default-executable
+#
+# Usage: absolute-executable-path-or-empty = $(call get-executable-or-default,variable,default)
+#
+define get-executable-or-default
+$(if $($(1)),$(call _ge_attempt,$($(1)),$(1)),$(call _ge_attempt,$(2)))
+endef
+_ge_attempt = $(or $(get-executable),$(_gea_warn),$(call _gea_err,$(2)))
+_gea_warn = $(warning The path '$(1)' is not executable.)
+_gea_err  = $(if $(1),$(error Please set '$(1)' appropriately))
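+#
+# For instance (hypothetical): CC := $(call get-executable-or-default,CC,cc)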
+
+# try-cc
+# Usage: option = $(call try-cc, source-to-build, cc-options)
+try-cc = $(shell sh -c                                           \
+       'TMP="$(OUTPUT)$(TMPOUT).$$$$";                           \
+        echo "$(1)" |                                            \
+        $(CC) -x c - $(2) -o "$$TMP" > /dev/null 2>&1 && echo y; \
+        rm -f "$$TMP"')
+
+# try-build
+# Usage: option = $(call try-build, source-to-build, cc-options, link-options)
+try-build = $(shell sh -c                                                      \
+       'TMP="$(OUTPUT)$(TMPOUT).$$$$";                                         \
+       echo "$(1)" |                                                           \
+       $(CC) -x c - $(2) $(3) -o "$$TMP" > /dev/null 2>&1 && echo y;           \
+       rm -f "$$TMP"')
diff --git a/tools/kvm/disk/blk.c b/tools/kvm/disk/blk.c
new file mode 100644 (file)
index 0000000..37581d3
--- /dev/null
@@ -0,0 +1,76 @@
+#include "kvm/disk-image.h"
+
+#include <linux/err.h>
+#include <mntent.h>
+
+/*
+ * raw image and blk dev are similar, so reuse raw image ops.
+ */
+static struct disk_image_operations blk_dev_ops = {
+       .read   = raw_image__read,
+       .write  = raw_image__write,
+};
+
+static bool is_mounted(struct stat *st)
+{
+       struct stat st_buf;
+       struct mntent *mnt;
+       FILE *f;
+
+       f = setmntent("/proc/mounts", "r");
+       if (!f)
+               return false;
+
+       while ((mnt = getmntent(f)) != NULL) {
+               if (stat(mnt->mnt_fsname, &st_buf) == 0 &&
+                   S_ISBLK(st_buf.st_mode) && st->st_rdev == st_buf.st_rdev) {
+                       fclose(f);
+                       return true;
+               }
+       }
+
+       fclose(f);
+       return false;
+}
+
+struct disk_image *blkdev__probe(const char *filename, int flags, struct stat *st)
+{
+       struct disk_image *disk;
+       int fd, r;
+       u64 size;
+
+       if (!S_ISBLK(st->st_mode))
+               return ERR_PTR(-EINVAL);
+
+       if (is_mounted(st)) {
+               pr_err("Block device %s is already mounted! Unmount before use.",
+                      filename);
+               return ERR_PTR(-EINVAL);
+       }
+
+       /*
+        * Be careful! We are opening a host block device! Whether it ends
+        * up writable depends on the flags the caller passed in; read-only
+        * is the safe choice, since we do not want to break the user's
+        * data on disk.
+        */
+       fd = open(filename, flags);
+       if (fd < 0)
+               return ERR_PTR(fd);
+
+       if (ioctl(fd, BLKGETSIZE64, &size) < 0) {
+               r = -errno;
+               close(fd);
+               return ERR_PTR(r);
+       }
+
+       /*
+        * FIXME: This will not work on 32-bit host because we can not
+        * mmap large disk. There is not enough virtual address space
+        * in 32-bit host. However, this works on 64-bit host.
+        */
+       disk = disk_image__new(fd, size, &blk_dev_ops, DISK_IMAGE_REGULAR);
+#ifdef CONFIG_HAS_AIO
+       if (!IS_ERR_OR_NULL(disk))
+               disk->async = 1;
+#endif
+       return disk;
+}
diff --git a/tools/kvm/disk/core.c b/tools/kvm/disk/core.c
new file mode 100644 (file)
index 0000000..f7e2c7f
--- /dev/null
@@ -0,0 +1,289 @@
+#include "kvm/disk-image.h"
+#include "kvm/qcow.h"
+#include "kvm/virtio-blk.h"
+
+#include <linux/err.h>
+#include <sys/eventfd.h>
+#include <sys/poll.h>
+
+#define AIO_MAX 256
+
+int debug_iodelay;
+
+#ifdef CONFIG_HAS_AIO
+static void *disk_image__thread(void *param)
+{
+       struct disk_image *disk = param;
+       struct io_event event[AIO_MAX];
+       struct timespec notime = {0};
+       int nr, i;
+       u64 dummy;
+
+       while (read(disk->evt, &dummy, sizeof(dummy)) > 0) {
+               nr = io_getevents(disk->ctx, 1, ARRAY_SIZE(event), event, &notime);
+               for (i = 0; i < nr; i++)
+                       disk->disk_req_cb(event[i].data, event[i].res);
+       }
+
+       return NULL;
+}
+#endif
+
+struct disk_image *disk_image__new(int fd, u64 size,
+                                  struct disk_image_operations *ops,
+                                  int use_mmap)
+{
+       struct disk_image *disk;
+       int r;
+
+       disk = malloc(sizeof *disk);
+       if (!disk)
+               return ERR_PTR(-ENOMEM);
+
+       *disk = (struct disk_image) {
+               .fd     = fd,
+               .size   = size,
+               .ops    = ops,
+       };
+
+       if (use_mmap == DISK_IMAGE_MMAP) {
+               /*
+                * MAP_PRIVATE gives a copy-on-write mapping: guest writes
+                * land in anonymous pages and are discarded, never reaching
+                * the underlying image.
+                */
+               disk->priv = mmap(NULL, size, PROT_RW, MAP_PRIVATE | MAP_NORESERVE, fd, 0);
+               if (disk->priv == MAP_FAILED) {
+                       r = -errno;
+                       free(disk);
+                       return ERR_PTR(r);
+               }
+       }
+
+#ifdef CONFIG_HAS_AIO
+       if (disk) {
+               pthread_t thread;
+
+               disk->evt = eventfd(0, 0);
+               io_setup(AIO_MAX, &disk->ctx);
+               r = pthread_create(&thread, NULL, disk_image__thread, disk);
+               if (r) {
+                       r = -errno;
+                       free(disk);
+                       return ERR_PTR(r);
+               }
+       }
+#endif
+       return disk;
+}
+
+struct disk_image *disk_image__open(const char *filename, bool readonly, bool direct)
+{
+       struct disk_image *disk;
+       struct stat st;
+       int fd, flags;
+
+       if (readonly)
+               flags = O_RDONLY;
+       else
+               flags = O_RDWR;
+       if (direct)
+               flags |= O_DIRECT;
+
+       if (stat(filename, &st) < 0)
+               return ERR_PTR(-errno);
+
+       /* block device? */
+       disk = blkdev__probe(filename, flags, &st);
+       if (!IS_ERR_OR_NULL(disk))
+               return disk;
+
+       fd = open(filename, flags);
+       if (fd < 0)
+               return ERR_PTR(fd);
+
+       /* QCOW image? */
+       disk = qcow_probe(fd, true);
+       if (!IS_ERR_OR_NULL(disk)) {
+               pr_warning("Forcing read-only support for QCOW");
+               return disk;
+       }
+
+       /* raw image? */
+       disk = raw_image__probe(fd, &st, readonly);
+       if (!IS_ERR_OR_NULL(disk))
+               return disk;
+
+       if (close(fd) < 0)
+               pr_warning("close() failed");
+
+       return ERR_PTR(-ENOSYS);
+}
+
+struct disk_image **disk_image__open_all(struct disk_image_params *params, int count)
+{
+       struct disk_image **disks;
+       const char *filename;
+       const char *wwpn;
+       const char *tpgt;
+       bool readonly;
+       bool direct;
+       void *err;
+       int i;
+
+       if (!count)
+               return ERR_PTR(-EINVAL);
+       if (count > MAX_DISK_IMAGES)
+               return ERR_PTR(-ENOSPC);
+
+       disks = calloc(count, sizeof(*disks));
+       if (!disks)
+               return ERR_PTR(-ENOMEM);
+
+       for (i = 0; i < count; i++) {
+               filename = params[i].filename;
+               readonly = params[i].readonly;
+               direct = params[i].direct;
+               wwpn = params[i].wwpn;
+               tpgt = params[i].tpgt;
+
+               if (wwpn) {
+                       disks[i] = malloc(sizeof(struct disk_image));
+                       if (!disks[i])
+                               return ERR_PTR(-ENOMEM);
+                       disks[i]->wwpn = wwpn;
+                       disks[i]->tpgt = tpgt;
+                       continue;
+               }
+
+               if (!filename)
+                       continue;
+
+               disks[i] = disk_image__open(filename, readonly, direct);
+               if (IS_ERR_OR_NULL(disks[i])) {
+                       pr_err("Loading disk image '%s' failed", filename);
+                       err = disks[i];
+                       goto error;
+               }
+       }
+
+       return disks;
+error:
+       for (i = 0; i < count; i++)
+               if (!IS_ERR_OR_NULL(disks[i]))
+                       disk_image__close(disks[i]);
+
+       free(disks);
+       return err;
+}
+
+int disk_image__flush(struct disk_image *disk)
+{
+       if (disk->ops->flush)
+               return disk->ops->flush(disk);
+
+       return fsync(disk->fd);
+}
+
+int disk_image__close(struct disk_image *disk)
+{
+       /* If there was no disk image then there's nothing to do: */
+       if (!disk)
+               return 0;
+
+       if (disk->ops->close)
+               return disk->ops->close(disk);
+
+       if (close(disk->fd) < 0)
+               pr_warning("close() failed");
+
+       free(disk);
+
+       return 0;
+}
+
+int disk_image__close_all(struct disk_image **disks, int count)
+{
+       while (count)
+               disk_image__close(disks[--count]);
+
+       free(disks);
+
+       return 0;
+}
+
+/*
+ * Fill iov with disk data, starting from sector 'sector'.
+ * Returns the number of bytes read.
+ */
+ssize_t disk_image__read(struct disk_image *disk, u64 sector,
+                        const struct iovec *iov, int iovcount, void *param)
+{
+       ssize_t total = 0;
+
+       if (debug_iodelay)
+               msleep(debug_iodelay);
+
+       if (disk->ops->read) {
+               total = disk->ops->read(disk, sector, iov, iovcount, param);
+               if (total < 0) {
+                       pr_info("disk_image__read error: total=%ld\n", (long)total);
+                       return total;
+               }
+       }
+
+       if (!disk->async && disk->disk_req_cb)
+               disk->disk_req_cb(param, total);
+
+       return total;
+}
+
+/*
+ * Write iov to disk, starting from sector 'sector'.
+ * Returns the number of bytes written.
+ */
+ssize_t disk_image__write(struct disk_image *disk, u64 sector,
+                         const struct iovec *iov, int iovcount, void *param)
+{
+       ssize_t total = 0;
+
+       if (debug_iodelay)
+               msleep(debug_iodelay);
+
+       if (disk->ops->write) {
+               /*
+                * Try writev based operation first
+                */
+
+               total = disk->ops->write(disk, sector, iov, iovcount, param);
+               if (total < 0) {
+                       pr_info("disk_image__write error: total=%ld\n", (long)total);
+                       return total;
+               }
+       } else {
+               /* Do nothing */
+       }
+
+       if (!disk->async && disk->disk_req_cb)
+               disk->disk_req_cb(param, total);
+
+       return total;
+}
+
+ssize_t disk_image__get_serial(struct disk_image *disk, void *buffer, ssize_t *len)
+{
+       struct stat st;
+       int r;
+
+       r = fstat(disk->fd, &st);
+       if (r)
+               return r;
+
+       *len = snprintf(buffer, *len, "%llu%llu%llu",
+                       (u64)st.st_dev, (u64)st.st_rdev, (u64)st.st_ino);
+       return *len;
+}
+
+void disk_image__set_callback(struct disk_image *disk,
+                             void (*disk_req_cb)(void *param, long len))
+{
+       disk->disk_req_cb = disk_req_cb;
+}
diff --git a/tools/kvm/disk/qcow.c b/tools/kvm/disk/qcow.c
new file mode 100644 (file)
index 0000000..ee2992e
--- /dev/null
@@ -0,0 +1,1529 @@
+#include "kvm/qcow.h"
+
+#include "kvm/disk-image.h"
+#include "kvm/read-write.h"
+#include "kvm/mutex.h"
+#include "kvm/util.h"
+
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <stdbool.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+#include <fcntl.h>
+#include <errno.h>
+#ifdef CONFIG_HAS_ZLIB
+#include <zlib.h>
+#endif
+
+#include <linux/err.h>
+#include <linux/byteorder.h>
+#include <linux/kernel.h>
+#include <linux/types.h>
+
+static int update_cluster_refcount(struct qcow *q, u64 clust_idx, u16 append);
+static int qcow_write_refcount_table(struct qcow *q);
+static u64 qcow_alloc_clusters(struct qcow *q, u64 size, int update_ref);
+static void  qcow_free_clusters(struct qcow *q, u64 clust_start, u64 size);
+
+static inline int qcow_pwrite_sync(int fd,
+       void *buf, size_t count, off_t offset)
+{
+       if (pwrite_in_full(fd, buf, count, offset) < 0)
+               return -1;
+
+       return fdatasync(fd);
+}
+
+static int l2_table_insert(struct rb_root *root, struct qcow_l2_table *new)
+{
+       struct rb_node **link = &(root->rb_node), *parent = NULL;
+       u64 offset = new->offset;
+
+       /* search the tree */
+       while (*link) {
+               struct qcow_l2_table *t;
+
+               t = rb_entry(*link, struct qcow_l2_table, node);
+               if (!t)
+                       goto error;
+
+               parent = *link;
+
+               if (t->offset > offset)
+                       link = &(*link)->rb_left;
+               else if (t->offset < offset)
+                       link = &(*link)->rb_right;
+               else
+                       goto out;
+       }
+
+       /* add new node */
+       rb_link_node(&new->node, parent, link);
+       rb_insert_color(&new->node, root);
+out:
+       return 0;
+error:
+       return -1;
+}
+
+static struct qcow_l2_table *l2_table_lookup(struct rb_root *root, u64 offset)
+{
+       struct rb_node *link = root->rb_node;
+
+       while (link) {
+               struct qcow_l2_table *t;
+
+               t = rb_entry(link, struct qcow_l2_table, node);
+               if (!t)
+                       goto out;
+
+               if (t->offset > offset)
+                       link = link->rb_left;
+               else if (t->offset < offset)
+                       link = link->rb_right;
+               else
+                       return t;
+       }
+out:
+       return NULL;
+}
+
+static void l1_table_free_cache(struct qcow_l1_table *l1t)
+{
+       struct rb_root *r = &l1t->root;
+       struct list_head *pos, *n;
+       struct qcow_l2_table *t;
+
+       list_for_each_safe(pos, n, &l1t->lru_list) {
+               /* Remove cache table from the list and RB tree */
+               list_del(pos);
+               t = list_entry(pos, struct qcow_l2_table, list);
+               rb_erase(&t->node, r);
+
+               /* Free the cached node */
+               free(t);
+       }
+}
+
+static int qcow_l2_cache_write(struct qcow *q, struct qcow_l2_table *c)
+{
+       struct qcow_header *header = q->header;
+       u64 size;
+
+       if (!c->dirty)
+               return 0;
+
+       size = 1 << header->l2_bits;
+
+       if (qcow_pwrite_sync(q->fd, c->table,
+               size * sizeof(u64), c->offset) < 0)
+               return -1;
+
+       c->dirty = 0;
+
+       return 0;
+}
+
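+/*
+ * Each cached L2 table lives in two structures at once: an RB tree keyed
+ * by file offset for fast lookup, and a list kept in LRU order so the
+ * least recently used table is evicted first when the cache is full.
+ */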
+static int cache_table(struct qcow *q, struct qcow_l2_table *c)
+{
+       struct qcow_l1_table *l1t = &q->table;
+       struct rb_root *r = &l1t->root;
+       struct qcow_l2_table *lru;
+
+       if (l1t->nr_cached == MAX_CACHE_NODES) {
+               /*
+                * The node at the head of the list is the least recently
+                * used node. Remove it from the list and replace it with
+                * a new node.
+                */
+               lru = list_first_entry(&l1t->lru_list, struct qcow_l2_table, list);
+
+               /* Remove the node from the cache */
+               rb_erase(&lru->node, r);
+               list_del_init(&lru->list);
+               l1t->nr_cached--;
+
+               /* Free the evicted node */
+               free(lru);
+       }
+
+       /* Add the new node to the RB tree for fast lookup */
+       if (l2_table_insert(r, c) < 0)
+               goto error;
+
+       /* Add to the LRU replacement list */
+       list_add_tail(&c->list, &l1t->lru_list);
+       l1t->nr_cached++;
+
+       return 0;
+error:
+       return -1;
+}
+
+static struct qcow_l2_table *l2_table_search(struct qcow *q, u64 offset)
+{
+       struct qcow_l1_table *l1t = &q->table;
+       struct qcow_l2_table *l2t;
+
+       l2t = l2_table_lookup(&l1t->root, offset);
+       if (!l2t)
+               return NULL;
+
+       /* Update the LRU state, by moving the searched node to list tail */
+       list_move_tail(&l2t->list, &l1t->lru_list);
+
+       return l2t;
+}
+
+/* Allocates a new node for caching L2 table */
+static struct qcow_l2_table *new_cache_table(struct qcow *q, u64 offset)
+{
+       struct qcow_header *header = q->header;
+       struct qcow_l2_table *c;
+       u64 l2t_sz;
+       u64 size;
+
+       l2t_sz = 1 << header->l2_bits;
+       size   = sizeof(*c) + l2t_sz * sizeof(u64);
+       c      = calloc(1, size);
+       if (!c)
+               goto out;
+
+       c->offset = offset;
+       RB_CLEAR_NODE(&c->node);
+       INIT_LIST_HEAD(&c->list);
+out:
+       return c;
+}
+
+static inline u64 get_l1_index(struct qcow *q, u64 offset)
+{
+       struct qcow_header *header = q->header;
+
+       return offset >> (header->l2_bits + header->cluster_bits);
+}
+
+static inline u64 get_l2_index(struct qcow *q, u64 offset)
+{
+       struct qcow_header *header = q->header;
+
+       return (offset >> (header->cluster_bits)) & ((1 << header->l2_bits)-1);
+}
+
+static inline u64 get_cluster_offset(struct qcow *q, u64 offset)
+{
+       struct qcow_header *header = q->header;
+
+       return offset & ((1 << header->cluster_bits)-1);
+}
+
+static struct qcow_l2_table *qcow_read_l2_table(struct qcow *q, u64 offset)
+{
+       struct qcow_header *header = q->header;
+       struct qcow_l2_table *l2t;
+       u64 size;
+
+       size = 1 << header->l2_bits;
+
+       /* search an entry for offset in cache */
+       l2t = l2_table_search(q, offset);
+       if (l2t)
+               return l2t;
+
+       /* allocate new node for caching l2 table */
+       l2t = new_cache_table(q, offset);
+       if (!l2t)
+               goto error;
+
+       /* table not cached: read from the disk */
+       if (pread_in_full(q->fd, l2t->table, size * sizeof(u64), offset) < 0)
+               goto error;
+
+       /* cache the table */
+       if (cache_table(q, l2t) < 0)
+               goto error;
+
+       return l2t;
+error:
+       free(l2t);
+       return NULL;
+}
+
+static int qcow_decompress_buffer(u8 *out_buf, int out_buf_size,
+       const u8 *buf, int buf_size)
+{
+#ifdef CONFIG_HAS_ZLIB
+       z_stream strm1, *strm = &strm1;
+       int ret, out_len;
+
+       memset(strm, 0, sizeof(*strm));
+
+       strm->next_in   = (u8 *)buf;
+       strm->avail_in  = buf_size;
+       strm->next_out  = out_buf;
+       strm->avail_out = out_buf_size;
+
+       ret = inflateInit2(strm, -12);
+       if (ret != Z_OK)
+               return -1;
+
+       ret = inflate(strm, Z_FINISH);
+       out_len = strm->next_out - out_buf;
+       if ((ret != Z_STREAM_END && ret != Z_BUF_ERROR) ||
+               out_len != out_buf_size) {
+               inflateEnd(strm);
+               return -1;
+       }
+
+       inflateEnd(strm);
+       return 0;
+#else
+       return -1;
+#endif
+}
+
+static ssize_t qcow1_read_cluster(struct qcow *q, u64 offset,
+       void *dst, u32 dst_len)
+{
+       struct qcow_header *header = q->header;
+       struct qcow_l1_table *l1t = &q->table;
+       struct qcow_l2_table *l2t;
+       u64 clust_offset;
+       u64 clust_start;
+       u64 l2t_offset;
+       size_t length;
+       u64 l2t_size;
+       u64 l1_idx;
+       u64 l2_idx;
+       int coffset;
+       int csize;
+
+       l1_idx = get_l1_index(q, offset);
+       if (l1_idx >= l1t->table_size)
+               return -1;
+
+       clust_offset = get_cluster_offset(q, offset);
+       if (clust_offset >= q->cluster_size)
+               return -1;
+
+       length = q->cluster_size - clust_offset;
+       if (length > dst_len)
+               length = dst_len;
+
+       mutex_lock(&q->mutex);
+
+       l2t_offset = be64_to_cpu(l1t->l1_table[l1_idx]);
+       if (!l2t_offset)
+               goto zero_cluster;
+
+       l2t_size = 1 << header->l2_bits;
+
+       /* read and cache level 2 table */
+       l2t = qcow_read_l2_table(q, l2t_offset);
+       if (!l2t)
+               goto out_error;
+
+       l2_idx = get_l2_index(q, offset);
+       if (l2_idx >= l2t_size)
+               goto out_error;
+
+       clust_start = be64_to_cpu(l2t->table[l2_idx]);
+       if (clust_start & QCOW1_OFLAG_COMPRESSED) {
+               coffset = clust_start & q->cluster_offset_mask;
+               csize   = clust_start >> (63 - q->header->cluster_bits);
+               csize   &= (q->cluster_size - 1);
+
+               if (pread_in_full(q->fd, q->cluster_data, csize,
+                                 coffset) < 0)
+                       goto out_error;
+
+               if (qcow_decompress_buffer(q->cluster_cache, q->cluster_size,
+                                       q->cluster_data, csize) < 0)
+                       goto out_error;
+
+               memcpy(dst, q->cluster_cache + clust_offset, length);
+               mutex_unlock(&q->mutex);
+       } else {
+               if (!clust_start)
+                       goto zero_cluster;
+
+               mutex_unlock(&q->mutex);
+
+               if (pread_in_full(q->fd, dst, length,
+                                 clust_start + clust_offset) < 0)
+                       return -1;
+       }
+
+       return length;
+
+zero_cluster:
+       mutex_unlock(&q->mutex);
+       memset(dst, 0, length);
+       return length;
+
+out_error:
+       mutex_unlock(&q->mutex);
+       length = -1;
+       return -1;
+}
+
+static ssize_t qcow2_read_cluster(struct qcow *q, u64 offset,
+       void *dst, u32 dst_len)
+{
+       struct qcow_header *header = q->header;
+       struct qcow_l1_table *l1t = &q->table;
+       struct qcow_l2_table *l2t;
+       u64 clust_offset;
+       u64 clust_start;
+       u64 l2t_offset;
+       size_t length;
+       u64 l2t_size;
+       u64 l1_idx;
+       u64 l2_idx;
+       int coffset;
+       int sector_offset;
+       int nb_csectors;
+       int csize;
+
+       l1_idx = get_l1_index(q, offset);
+       if (l1_idx >= l1t->table_size)
+               return -1;
+
+       clust_offset = get_cluster_offset(q, offset);
+       if (clust_offset >= q->cluster_size)
+               return -1;
+
+       length = q->cluster_size - clust_offset;
+       if (length > dst_len)
+               length = dst_len;
+
+       mutex_lock(&q->mutex);
+
+       l2t_offset = be64_to_cpu(l1t->l1_table[l1_idx]);
+
+       l2t_offset &= ~QCOW2_OFLAG_COPIED;
+       if (!l2t_offset)
+               goto zero_cluster;
+
+       l2t_size = 1 << header->l2_bits;
+
+       /* read and cache level 2 table */
+       l2t = qcow_read_l2_table(q, l2t_offset);
+       if (!l2t)
+               goto out_error;
+
+       l2_idx = get_l2_index(q, offset);
+       if (l2_idx >= l2t_size)
+               goto out_error;
+
+       clust_start = be64_to_cpu(l2t->table[l2_idx]);
+       if (clust_start & QCOW2_OFLAG_COMPRESSED) {
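+               /*
+                * A compressed cluster descriptor packs the host byte offset
+                * into its low bits and the compressed size, counted in
+                * 512-byte sectors, into the bits above csize_shift.
+                */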
+               coffset = clust_start & q->cluster_offset_mask;
+               nb_csectors = ((clust_start >> q->csize_shift)
+                       & q->csize_mask) + 1;
+               sector_offset = coffset & (SECTOR_SIZE - 1);
+               csize = nb_csectors * SECTOR_SIZE - sector_offset;
+
+               if (pread_in_full(q->fd, q->cluster_data,
+                                 nb_csectors * SECTOR_SIZE,
+                                 coffset & ~(SECTOR_SIZE - 1)) < 0) {
+                       goto out_error;
+               }
+
+               if (qcow_decompress_buffer(q->cluster_cache, q->cluster_size,
+                                       q->cluster_data + sector_offset,
+                                       csize) < 0) {
+                       goto out_error;
+               }
+
+               memcpy(dst, q->cluster_cache + clust_offset, length);
+               mutex_unlock(&q->mutex);
+       } else {
+               clust_start &= QCOW2_OFFSET_MASK;
+               if (!clust_start)
+                       goto zero_cluster;
+
+               mutex_unlock(&q->mutex);
+
+               if (pread_in_full(q->fd, dst, length,
+                                 clust_start + clust_offset) < 0)
+                       return -1;
+       }
+
+       return length;
+
+zero_cluster:
+       mutex_unlock(&q->mutex);
+       memset(dst, 0, length);
+       return length;
+
+out_error:
+       mutex_unlock(&q->mutex);
+       length = -1;
+       return -1;
+}
+
+static ssize_t qcow_read_sector_single(struct disk_image *disk, u64 sector,
+       void *dst, u32 dst_len)
+{
+       struct qcow *q = disk->priv;
+       struct qcow_header *header = q->header;
+       u32 nr_read;
+       u64 offset;
+       char *buf;
+       ssize_t nr;
+
+       buf = dst;
+       nr_read = 0;
+
+       while (nr_read < dst_len) {
+               offset = sector << SECTOR_SHIFT;
+               if (offset >= header->size)
+                       return -1;
+
+               if (q->version == QCOW1_VERSION)
+                       nr = qcow1_read_cluster(q, offset, buf,
+                               dst_len - nr_read);
+               else
+                       nr = qcow2_read_cluster(q, offset, buf,
+                               dst_len - nr_read);
+
+               if (nr <= 0)
+                       return -1;
+
+               nr_read += nr;
+               buf     += nr;
+               sector  += (nr >> SECTOR_SHIFT);
+       }
+
+       return dst_len;
+}
+
+static ssize_t qcow_read_sector(struct disk_image *disk, u64 sector,
+                               const struct iovec *iov, int iovcount, void *param)
+{
+       ssize_t nr, total = 0;
+
+       while (iovcount--) {
+               nr = qcow_read_sector_single(disk, sector, iov->iov_base, iov->iov_len);
+               if (nr != (ssize_t)iov->iov_len) {
+                       pr_info("qcow_read_sector error: nr=%ld iov_len=%ld\n", (long)nr, (long)iov->iov_len);
+                       return -1;
+               }
+
+               sector += iov->iov_len >> SECTOR_SHIFT;
+               total += nr;
+               iov++;
+       }
+
+       return total;
+}
+
+static void refcount_table_free_cache(struct qcow_refcount_table *rft)
+{
+       struct rb_root *r = &rft->root;
+       struct list_head *pos, *n;
+       struct qcow_refcount_block *t;
+
+       list_for_each_safe(pos, n, &rft->lru_list) {
+               list_del(pos);
+               t = list_entry(pos, struct qcow_refcount_block, list);
+               rb_erase(&t->node, r);
+
+               free(t);
+       }
+}
+
+static int refcount_block_insert(struct rb_root *root, struct qcow_refcount_block *new)
+{
+       struct rb_node **link = &(root->rb_node), *parent = NULL;
+       u64 offset = new->offset;
+
+       /* search the tree */
+       while (*link) {
+               struct qcow_refcount_block *t;
+
+               t = rb_entry(*link, struct qcow_refcount_block, node);
+               if (!t)
+                       goto error;
+
+               parent = *link;
+
+               if (t->offset > offset)
+                       link = &(*link)->rb_left;
+               else if (t->offset < offset)
+                       link = &(*link)->rb_right;
+               else
+                       goto out;
+       }
+
+       /* add new node */
+       rb_link_node(&new->node, parent, link);
+       rb_insert_color(&new->node, root);
+out:
+       return 0;
+error:
+       return -1;
+}
+
+static int write_refcount_block(struct qcow *q, struct qcow_refcount_block *rfb)
+{
+       if (!rfb->dirty)
+               return 0;
+
+       if (qcow_pwrite_sync(q->fd, rfb->entries,
+               rfb->size * sizeof(u16), rfb->offset) < 0)
+               return -1;
+
+       rfb->dirty = 0;
+
+       return 0;
+}
+
+static int cache_refcount_block(struct qcow *q, struct qcow_refcount_block *c)
+{
+       struct qcow_refcount_table *rft = &q->refcount_table;
+       struct rb_root *r = &rft->root;
+       struct qcow_refcount_block *lru;
+
+       if (rft->nr_cached == MAX_CACHE_NODES) {
+               lru = list_first_entry(&rft->lru_list, struct qcow_refcount_block, list);
+
+               rb_erase(&lru->node, r);
+               list_del_init(&lru->list);
+               rft->nr_cached--;
+
+               free(lru);
+       }
+
+       if (refcount_block_insert(r, c) < 0)
+               goto error;
+
+       list_add_tail(&c->list, &rft->lru_list);
+       rft->nr_cached++;
+
+       return 0;
+error:
+       return -1;
+}
+
+static struct qcow_refcount_block *new_refcount_block(struct qcow *q, u64 rfb_offset)
+{
+       struct qcow_refcount_block *rfb;
+
+       rfb = malloc(sizeof *rfb + q->cluster_size);
+       if (!rfb)
+               return NULL;
+
+       rfb->offset = rfb_offset;
+       rfb->size = q->cluster_size / sizeof(u16);
+       RB_CLEAR_NODE(&rfb->node);
+       INIT_LIST_HEAD(&rfb->list);
+
+       return rfb;
+}
+
+static struct qcow_refcount_block *refcount_block_lookup(struct rb_root *root, u64 offset)
+{
+       struct rb_node *link = root->rb_node;
+
+       while (link) {
+               struct qcow_refcount_block *t;
+
+               t = rb_entry(link, struct qcow_refcount_block, node);
+               if (!t)
+                       goto out;
+
+               if (t->offset > offset)
+                       link = link->rb_left;
+               else if (t->offset < offset)
+                       link = link->rb_right;
+               else
+                       return t;
+       }
+out:
+       return NULL;
+}
+
+static struct qcow_refcount_block *refcount_block_search(struct qcow *q, u64 offset)
+{
+       struct qcow_refcount_table *rft = &q->refcount_table;
+       struct qcow_refcount_block *rfb;
+
+       rfb = refcount_block_lookup(&rft->root, offset);
+       if (!rfb)
+               return NULL;
+
+       /* Update the LRU state, by moving the searched node to list tail */
+       list_move_tail(&rfb->list, &rft->lru_list);
+
+       return rfb;
+}
+
+static struct qcow_refcount_block *qcow_grow_refcount_block(struct qcow *q,
+       u64 clust_idx)
+{
+       struct qcow_header *header = q->header;
+       struct qcow_refcount_table *rft = &q->refcount_table;
+       struct qcow_refcount_block *rfb;
+       u64 new_block_offset;
+       u64 rft_idx;
+
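+       /*
+        * One refcount block is a cluster of u16 entries, so it covers
+        * 2^(cluster_bits - QCOW_REFCOUNT_BLOCK_SHIFT) clusters.
+        */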
+       rft_idx = clust_idx >> (header->cluster_bits -
+               QCOW_REFCOUNT_BLOCK_SHIFT);
+
+       if (rft_idx >= rft->rf_size) {
+               pr_warning("Don't support grow refcount block table");
+               return NULL;
+       }
+
+       new_block_offset = qcow_alloc_clusters(q, q->cluster_size, 0);
+       if (new_block_offset == (u64)-1)
+               return NULL;
+
+       rfb = new_refcount_block(q, new_block_offset);
+       if (!rfb)
+               return NULL;
+
+       memset(rfb->entries, 0x00, q->cluster_size);
+       rfb->dirty = 1;
+
+       /* write refcount block */
+       if (write_refcount_block(q, rfb) < 0)
+               goto free_rfb;
+
+       if (cache_refcount_block(q, rfb) < 0)
+               goto free_rfb;
+
+       rft->rf_table[rft_idx] = cpu_to_be64(new_block_offset);
+       if (update_cluster_refcount(q, new_block_offset >>
+                   header->cluster_bits, 1) < 0)
+               goto recover_rft;
+
+       if (qcow_write_refcount_table(q) < 0)
+               goto recover_rft;
+
+       return rfb;
+
+recover_rft:
+       rft->rf_table[rft_idx] = 0;
+free_rfb:
+       free(rfb);
+       return NULL;
+}
+
+static struct qcow_refcount_block *qcow_read_refcount_block(struct qcow *q, u64 clust_idx)
+{
+       struct qcow_header *header = q->header;
+       struct qcow_refcount_table *rft = &q->refcount_table;
+       struct qcow_refcount_block *rfb;
+       u64 rfb_offset;
+       u64 rft_idx;
+
+       rft_idx = clust_idx >> (header->cluster_bits - QCOW_REFCOUNT_BLOCK_SHIFT);
+       if (rft_idx >= rft->rf_size)
+               return ERR_PTR(-ENOSPC);
+
+       rfb_offset = be64_to_cpu(rft->rf_table[rft_idx]);
+       if (!rfb_offset)
+               return ERR_PTR(-ENOSPC);
+
+       rfb = refcount_block_search(q, rfb_offset);
+       if (rfb)
+               return rfb;
+
+       rfb = new_refcount_block(q, rfb_offset);
+       if (!rfb)
+               return NULL;
+
+       if (pread_in_full(q->fd, rfb->entries, rfb->size * sizeof(u16), rfb_offset) < 0)
+               goto error_free_rfb;
+
+       if (cache_refcount_block(q, rfb) < 0)
+               goto error_free_rfb;
+
+       return rfb;
+
+error_free_rfb:
+       free(rfb);
+
+       return NULL;
+}
+
+static u16 qcow_get_refcount(struct qcow *q, u64 clust_idx)
+{
+       struct qcow_refcount_block *rfb = NULL;
+       struct qcow_header *header = q->header;
+       u64 rfb_idx;
+
+       rfb = qcow_read_refcount_block(q, clust_idx);
+       if (PTR_ERR(rfb) == -ENOSPC)
+               return 0;
+       else if (IS_ERR_OR_NULL(rfb)) {
+               pr_warning("Error while reading refcount table");
+               return -1;
+       }
+
+       rfb_idx = clust_idx & (((1ULL <<
+               (header->cluster_bits - QCOW_REFCOUNT_BLOCK_SHIFT)) - 1));
+
+       if (rfb_idx >= rfb->size) {
+               pr_warning("L1: refcount block index out of bounds");
+               return -1;
+       }
+
+       return be16_to_cpu(rfb->entries[rfb_idx]);
+}
+
+static int update_cluster_refcount(struct qcow *q, u64 clust_idx, u16 append)
+{
+       struct qcow_refcount_block *rfb = NULL;
+       struct qcow_header *header = q->header;
+       u16 refcount;
+       u64 rfb_idx;
+
+       rfb = qcow_read_refcount_block(q, clust_idx);
+       if (PTR_ERR(rfb) == -ENOSPC) {
+               rfb = qcow_grow_refcount_block(q, clust_idx);
+               if (!rfb) {
+                       pr_warning("error while growing refcount table");
+                       return -1;
+               }
+       } else if (IS_ERR_OR_NULL(rfb)) {
+               pr_warning("error while reading refcount table");
+               return -1;
+       }
+
+       rfb_idx = clust_idx & (((1ULL <<
+               (header->cluster_bits - QCOW_REFCOUNT_BLOCK_SHIFT)) - 1));
+       if (rfb_idx >= rfb->size) {
+               pr_warning("refcount block index out of bounds");
+               return -1;
+       }
+
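+       /* u16 arithmetic wraps, so passing (u16)-1 as 'append' decrements */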
+       refcount = be16_to_cpu(rfb->entries[rfb_idx]) + append;
+       rfb->entries[rfb_idx] = cpu_to_be16(refcount);
+       rfb->dirty = 1;
+
+       /* write refcount block */
+       if (write_refcount_block(q, rfb) < 0) {
+               pr_warning("refcount block index out of bounds");
+               return -1;
+       }
+
+       /* update free_clust_idx since refcount becomes zero */
+       if (!refcount && clust_idx < q->free_clust_idx)
+               q->free_clust_idx = clust_idx;
+
+       return 0;
+}
+
+static void  qcow_free_clusters(struct qcow *q, u64 clust_start, u64 size)
+{
+       struct qcow_header *header = q->header;
+       u64 start, end, offset;
+
+       start = clust_start & ~(q->cluster_size - 1);
+       end = (clust_start + size - 1) & ~(q->cluster_size - 1);
+       for (offset = start; offset <= end; offset += q->cluster_size)
+               update_cluster_refcount(q, offset >> header->cluster_bits, -1);
+}
+
+/*
+ * Allocate clusters of the requested size: scan forward for a run of
+ * free clusters large enough to satisfy it. free_clust_idx starts at
+ * zero and records the position reached by the last scan.
+ */
+static u64 qcow_alloc_clusters(struct qcow *q, u64 size, int update_ref)
+{
+       struct qcow_header *header = q->header;
+       u16 clust_refcount;
+       u32 clust_idx = 0, i;
+       u64 clust_num;
+
+       clust_num = (size + (q->cluster_size - 1)) >> header->cluster_bits;
+
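+       /*
+        * Scan forward from free_clust_idx for clust_num consecutive
+        * clusters with a zero refcount; hitting an in-use cluster
+        * restarts the scan just past it.
+        */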
+again:
+       for (i = 0; i < clust_num; i++) {
+               clust_idx = q->free_clust_idx++;
+               clust_refcount = qcow_get_refcount(q, clust_idx);
+               if (clust_refcount == (u16)-1)
+                       return -1;
+               else if (clust_refcount > 0)
+                       goto again;
+       }
+
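+       /* clust_idx is the last cluster of the run; step one past it */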
+       clust_idx++;
+
+       if (update_ref)
+               for (i = 0; i < clust_num; i++)
+                       if (update_cluster_refcount(q,
+                               clust_idx - clust_num + i, 1))
+                               return -1;
+
+       return (clust_idx - clust_num) << header->cluster_bits;
+}
+
+static int qcow_write_l1_table(struct qcow *q)
+{
+       struct qcow_l1_table *l1t = &q->table;
+       struct qcow_header *header = q->header;
+
+       if (qcow_pwrite_sync(q->fd, l1t->l1_table,
+               l1t->table_size * sizeof(u64),
+               header->l1_table_offset) < 0)
+               return -1;
+
+       return 0;
+}
+
+/*
+ * Get the l2 table. If the table has already been copied, read it
+ * directly. Otherwise allocate a new cluster and copy the existing
+ * table (or zeroes) into it.
+ */
+static int get_cluster_table(struct qcow *q, u64 offset,
+       struct qcow_l2_table **result_l2t, u64 *result_l2_index)
+{
+       struct qcow_header *header = q->header;
+       struct qcow_l1_table *l1t = &q->table;
+       struct qcow_l2_table *l2t;
+       u64 l1t_idx;
+       u64 l2t_offset;
+       u64 l2t_idx;
+       u64 l2t_size;
+       u64 l2t_new_offset;
+
+       l2t_size = 1 << header->l2_bits;
+
+       l1t_idx = get_l1_index(q, offset);
+       if (l1t_idx >= l1t->table_size)
+               return -1;
+
+       l2t_idx = get_l2_index(q, offset);
+       if (l2t_idx >= l2t_size)
+               return -1;
+
+       l2t_offset = be64_to_cpu(l1t->l1_table[l1t_idx]);
+       if (l2t_offset & QCOW2_OFLAG_COPIED) {
+               l2t_offset &= ~QCOW2_OFLAG_COPIED;
+               l2t = qcow_read_l2_table(q, l2t_offset);
+               if (!l2t)
+                       goto error;
+       } else {
+               l2t_new_offset = qcow_alloc_clusters(q,
+                       l2t_size * sizeof(u64), 1);
+
+               if (l2t_new_offset == (u64)-1)
+                       goto error;
+
+               l2t = new_cache_table(q, l2t_new_offset);
+               if (!l2t)
+                       goto free_cluster;
+
+               if (l2t_offset) {
+                       l2t = qcow_read_l2_table(q, l2t_offset);
+                       if (!l2t)
+                               goto free_cache;
+               } else
+                       memset(l2t->table, 0x00, l2t_size * sizeof(u64));
+
+               /* write l2 table */
+               l2t->dirty = 1;
+               if (qcow_l2_cache_write(q, l2t) < 0)
+                       goto free_cache;
+
+               /* cache l2 table */
+               if (cache_table(q, l2t))
+                       goto free_cache;
+
+               /* update the l1 table */
+               l1t->l1_table[l1t_idx] = cpu_to_be64(l2t_new_offset
+                       | QCOW2_OFLAG_COPIED);
+               if (qcow_write_l1_table(q)) {
+                       pr_warning("Update l1 table error");
+                       goto free_cache;
+               }
+
+               /* free old cluster */
+               qcow_free_clusters(q, l2t_offset, q->cluster_size);
+       }
+
+       *result_l2t = l2t;
+       *result_l2_index = l2t_idx;
+
+       return 0;
+
+free_cache:
+       free(l2t);
+
+free_cluster:
+       qcow_free_clusters(q, l2t_new_offset, q->cluster_size);
+
+error:
+       return -1;
+}
+
+/*
+ * If the cluster has been copied, write data directly. If not,
+ * read the original data and write it to the new cluster with
+ * modification.
+ */
+static ssize_t qcow_write_cluster(struct qcow *q, u64 offset,
+               void *buf, u32 src_len)
+{
+       struct qcow_l2_table *l2t;
+       u64 clust_new_start;
+       u64 clust_start;
+       u64 clust_flags;
+       u64 clust_off;
+       u64 l2t_idx;
+       u64 len;
+
+       l2t = NULL;
+
+       clust_off = get_cluster_offset(q, offset);
+       if (clust_off >= q->cluster_size)
+               return -1;
+
+       len = q->cluster_size - clust_off;
+       if (len > src_len)
+               len = src_len;
+
+       mutex_lock(&q->mutex);
+
+       if (get_cluster_table(q, offset, &l2t, &l2t_idx)) {
+               pr_warning("Get l2 table error");
+               goto error;
+       }
+
+       clust_start = be64_to_cpu(l2t->table[l2t_idx]);
+       clust_flags = clust_start & QCOW2_OFLAGS_MASK;
+
+       clust_start &= QCOW2_OFFSET_MASK;
+       if (!(clust_flags & QCOW2_OFLAG_COPIED)) {
+               clust_new_start = qcow_alloc_clusters(q, q->cluster_size, 1);
+               if (clust_new_start == (u64)-1) {
+                       pr_warning("Cluster alloc error");
+                       goto error;
+               }
+
+               offset &= ~(q->cluster_size - 1);
+
+               /* if clust_start is not zero, read the original data */
+               if (clust_start) {
+                       mutex_unlock(&q->mutex);
+                       if (qcow2_read_cluster(q, offset, q->copy_buff,
+                               q->cluster_size) < 0) {
+                               pr_warning("Read copy cluster error");
+                               qcow_free_clusters(q, clust_new_start,
+                                       q->cluster_size);
+                               return -1;
+                       }
+                       mutex_lock(&q->mutex);
+               } else
+                       memset(q->copy_buff, 0x00, q->cluster_size);
+
+               memcpy(q->copy_buff + clust_off, buf, len);
+
+               /* Write actual data */
+               if (pwrite_in_full(q->fd, q->copy_buff, q->cluster_size,
+                       clust_new_start) < 0)
+                       goto free_cluster;
+
+               /* update l2 table */
+               l2t->table[l2t_idx] = cpu_to_be64(clust_new_start
+                       | QCOW2_OFLAG_COPIED);
+               l2t->dirty = 1;
+
+               if (qcow_l2_cache_write(q, l2t))
+                       goto free_cluster;
+
+               /* free old cluster */
+               if (clust_flags & QCOW2_OFLAG_COMPRESSED) {
+                       int size;
+                       size = ((clust_start >> q->csize_shift) &
+                               q->csize_mask) + 1;
+                       size *= SECTOR_SIZE;
+                       clust_start &= q->cluster_offset_mask;
+                       clust_start &= ~(u64)(SECTOR_SIZE - 1);
+
+                       qcow_free_clusters(q, clust_start, size);
+               } else if (clust_start)
+                       qcow_free_clusters(q, clust_start, q->cluster_size);
+
+       } else {
+               /* Write actual data */
+               if (pwrite_in_full(q->fd, buf, len,
+                       clust_start + clust_off) < 0)
+                       goto error;
+       }
+       mutex_unlock(&q->mutex);
+       return len;
+
+free_cluster:
+       qcow_free_clusters(q, clust_new_start, q->cluster_size);
+
+error:
+       mutex_unlock(&q->mutex);
+       return -1;
+}
+
+static ssize_t qcow_write_sector_single(struct disk_image *disk, u64 sector, void *src, u32 src_len)
+{
+       struct qcow *q = disk->priv;
+       struct qcow_header *header = q->header;
+       u32 nr_written;
+       char *buf;
+       u64 offset;
+       ssize_t nr;
+
+       buf             = src;
+       nr_written      = 0;
+       offset          = sector << SECTOR_SHIFT;
+
+       while (nr_written < src_len) {
+               if (offset >= header->size)
+                       return -1;
+
+               nr = qcow_write_cluster(q, offset, buf, src_len - nr_written);
+               if (nr < 0)
+                       return -1;
+
+               nr_written      += nr;
+               buf             += nr;
+               offset          += nr;
+       }
+
+       return nr_written;
+}
+
+static ssize_t qcow_write_sector(struct disk_image *disk, u64 sector,
+                               const struct iovec *iov, int iovcount, void *param)
+{
+       ssize_t nr, total = 0;
+
+       while (iovcount--) {
+               nr = qcow_write_sector_single(disk, sector, iov->iov_base, iov->iov_len);
+               if (nr != (ssize_t)iov->iov_len) {
+                       pr_info("qcow_write_sector error: nr=%ld iov_len=%ld\n", (long)nr, (long)iov->iov_len);
+                       return -1;
+               }
+
+               sector  += iov->iov_len >> SECTOR_SHIFT;
+               iov++;
+               total   += nr;
+       }
+
+       return total;
+}
+
+static int qcow_disk_flush(struct disk_image *disk)
+{
+       struct qcow *q = disk->priv;
+       struct qcow_refcount_table *rft;
+       struct list_head *pos, *n;
+       struct qcow_l1_table *l1t;
+
+       l1t = &q->table;
+       rft = &q->refcount_table;
+
+       mutex_lock(&q->mutex);
+
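+       /* Flush dirty refcount blocks and L2 tables before the L1 table */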
+       list_for_each_safe(pos, n, &rft->lru_list) {
+               struct qcow_refcount_block *c = list_entry(pos, struct qcow_refcount_block, list);
+
+               if (write_refcount_block(q, c) < 0)
+                       goto error_unlock;
+       }
+
+       list_for_each_safe(pos, n, &l1t->lru_list) {
+               struct qcow_l2_table *c = list_entry(pos, struct qcow_l2_table, list);
+
+               if (qcow_l2_cache_write(q, c) < 0)
+                       goto error_unlock;
+       }
+
+       if (qcow_write_l1_table(q) < 0)
+               goto error_unlock;
+
+       mutex_unlock(&q->mutex);
+
+       return fsync(disk->fd);
+
+error_unlock:
+       mutex_unlock(&q->mutex);
+       return -1;
+}
+
+static int qcow_disk_close(struct disk_image *disk)
+{
+       struct qcow *q;
+
+       if (!disk)
+               return 0;
+
+       q = disk->priv;
+
+       refcount_table_free_cache(&q->refcount_table);
+       l1_table_free_cache(&q->table);
+       free(q->copy_buff);
+       free(q->cluster_data);
+       free(q->cluster_cache);
+       free(q->refcount_table.rf_table);
+       free(q->table.l1_table);
+       free(q->header);
+       free(q);
+
+       return 0;
+}
+
+static struct disk_image_operations qcow_disk_readonly_ops = {
+       .read   = qcow_read_sector,
+       .close  = qcow_disk_close,
+};
+
+static struct disk_image_operations qcow_disk_ops = {
+       .read   = qcow_read_sector,
+       .write  = qcow_write_sector,
+       .flush  = qcow_disk_flush,
+       .close  = qcow_disk_close,
+};
+
+static int qcow_read_refcount_table(struct qcow *q)
+{
+       struct qcow_header *header = q->header;
+       struct qcow_refcount_table *rft = &q->refcount_table;
+
+       rft->rf_size = (header->refcount_table_size * q->cluster_size)
+               / sizeof(u64);
+
+       rft->rf_table = calloc(rft->rf_size, sizeof(u64));
+       if (!rft->rf_table)
+               return -1;
+
+       rft->root = RB_ROOT;
+       INIT_LIST_HEAD(&rft->lru_list);
+
+       return pread_in_full(q->fd, rft->rf_table, sizeof(u64) * rft->rf_size, header->refcount_table_offset);
+}
+
+static int qcow_write_refcount_table(struct qcow *q)
+{
+       struct qcow_header *header = q->header;
+       struct qcow_refcount_table *rft = &q->refcount_table;
+
+       return qcow_pwrite_sync(q->fd, rft->rf_table,
+               rft->rf_size * sizeof(u64), header->refcount_table_offset);
+}
+
+static int qcow_read_l1_table(struct qcow *q)
+{
+       struct qcow_header *header = q->header;
+       struct qcow_l1_table *table = &q->table;
+
+       table->table_size = header->l1_size;
+
+       table->l1_table = calloc(table->table_size, sizeof(u64));
+       if (!table->l1_table)
+               return -1;
+
+       return pread_in_full(q->fd, table->l1_table, sizeof(u64) * table->table_size, header->l1_table_offset);
+}
+
+static void *qcow2_read_header(int fd)
+{
+       struct qcow2_header_disk f_header;
+       struct qcow_header *header;
+
+       header = malloc(sizeof(struct qcow_header));
+       if (!header)
+               return NULL;
+
+       if (pread_in_full(fd, &f_header, sizeof(struct qcow2_header_disk), 0) < 0) {
+               free(header);
+               return NULL;
+       }
+
+       be32_to_cpus(&f_header.magic);
+       be32_to_cpus(&f_header.version);
+       be64_to_cpus(&f_header.backing_file_offset);
+       be32_to_cpus(&f_header.backing_file_size);
+       be32_to_cpus(&f_header.cluster_bits);
+       be64_to_cpus(&f_header.size);
+       be32_to_cpus(&f_header.crypt_method);
+       be32_to_cpus(&f_header.l1_size);
+       be64_to_cpus(&f_header.l1_table_offset);
+       be64_to_cpus(&f_header.refcount_table_offset);
+       be32_to_cpus(&f_header.refcount_table_clusters);
+       be32_to_cpus(&f_header.nb_snapshots);
+       be64_to_cpus(&f_header.snapshots_offset);
+
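+       /* An L2 table fills one cluster with 8-byte entries, hence cluster_bits - 3 */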
+       *header         = (struct qcow_header) {
+               .size                   = f_header.size,
+               .l1_table_offset        = f_header.l1_table_offset,
+               .l1_size                = f_header.l1_size,
+               .cluster_bits           = f_header.cluster_bits,
+               .l2_bits                = f_header.cluster_bits - 3,
+               .refcount_table_offset  = f_header.refcount_table_offset,
+               .refcount_table_size    = f_header.refcount_table_clusters,
+       };
+
+       return header;
+}
+
+static struct disk_image *qcow2_probe(int fd, bool readonly)
+{
+       struct disk_image *disk_image;
+       struct qcow_l1_table *l1t;
+       struct qcow_header *h;
+       struct qcow *q;
+
+       q = calloc(1, sizeof(struct qcow));
+       if (!q)
+               return NULL;
+
+       mutex_init(&q->mutex);
+       q->fd = fd;
+
+       l1t = &q->table;
+
+       l1t->root = RB_ROOT;
+       INIT_LIST_HEAD(&l1t->lru_list);
+
+       h = q->header = qcow2_read_header(fd);
+       if (!h)
+               goto free_qcow;
+
+       q->version = QCOW2_VERSION;
+       q->csize_shift = (62 - (q->header->cluster_bits - 8));
+       q->csize_mask = (1 << (q->header->cluster_bits - 8)) - 1;
+       q->cluster_offset_mask = (1LL << q->csize_shift) - 1;
+       q->cluster_size = 1 << q->header->cluster_bits;
+
+       q->copy_buff = malloc(q->cluster_size);
+       if (!q->copy_buff) {
+               pr_warning("copy buff malloc error");
+               goto free_header;
+       }
+
+       q->cluster_data = malloc(q->cluster_size);
+       if (!q->cluster_data) {
+               pr_warning("cluster data malloc error");
+               goto free_copy_buff;
+       }
+
+       q->cluster_cache = malloc(q->cluster_size);
+       if (!q->cluster_cache) {
+               pr_warning("cluster cache malloc error");
+               goto free_cluster_data;
+       }
+
+       if (qcow_read_l1_table(q) < 0)
+               goto free_cluster_cache;
+
+       if (qcow_read_refcount_table(q) < 0)
+               goto free_l1_table;
+
+       /*
+        * Do not use mmap; use read/write instead.
+        */
+       if (readonly)
+               disk_image = disk_image__new(fd, h->size, &qcow_disk_readonly_ops, DISK_IMAGE_REGULAR);
+       else
+               disk_image = disk_image__new(fd, h->size, &qcow_disk_ops, DISK_IMAGE_REGULAR);
+
+       if (IS_ERR_OR_NULL(disk_image))
+               goto free_refcount_table;
+
+       disk_image->async = 0;
+       disk_image->priv = q;
+
+       return disk_image;
+
+free_refcount_table:
+       free(q->refcount_table.rf_table);
+free_l1_table:
+       free(q->table.l1_table);
+free_cluster_cache:
+       free(q->cluster_cache);
+free_cluster_data:
+       free(q->cluster_data);
+free_copy_buff:
+       free(q->copy_buff);
+free_header:
+       free(q->header);
+free_qcow:
+       free(q);
+
+       return NULL;
+}
+
+static bool qcow2_check_image(int fd)
+{
+       struct qcow2_header_disk f_header;
+
+       if (pread_in_full(fd, &f_header, sizeof(struct qcow2_header_disk), 0) < 0)
+               return false;
+
+       be32_to_cpus(&f_header.magic);
+       be32_to_cpus(&f_header.version);
+
+       if (f_header.magic != QCOW_MAGIC)
+               return false;
+
+       if (f_header.version != QCOW2_VERSION)
+               return false;
+
+       return true;
+}
+
+static void *qcow1_read_header(int fd)
+{
+       struct qcow1_header_disk f_header;
+       struct qcow_header *header;
+
+       header = malloc(sizeof(struct qcow_header));
+       if (!header)
+               return NULL;
+
+       if (pread_in_full(fd, &f_header, sizeof(struct qcow1_header_disk), 0) < 0) {
+               free(header);
+               return NULL;
+       }
+
+       be32_to_cpus(&f_header.magic);
+       be32_to_cpus(&f_header.version);
+       be64_to_cpus(&f_header.backing_file_offset);
+       be32_to_cpus(&f_header.backing_file_size);
+       be32_to_cpus(&f_header.mtime);
+       be64_to_cpus(&f_header.size);
+       be32_to_cpus(&f_header.crypt_method);
+       be64_to_cpus(&f_header.l1_table_offset);
+
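+       /*
+        * A QCOW1 header does not store the L1 size; derive it from the
+        * image size and the bytes covered by one L2 table.
+        */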
+       *header         = (struct qcow_header) {
+               .size                   = f_header.size,
+               .l1_table_offset        = f_header.l1_table_offset,
+               .l1_size                = f_header.size / ((1 << f_header.l2_bits) * (1 << f_header.cluster_bits)),
+               .cluster_bits           = f_header.cluster_bits,
+               .l2_bits                = f_header.l2_bits,
+       };
+
+       return header;
+}
+
+static struct disk_image *qcow1_probe(int fd, bool readonly)
+{
+       struct disk_image *disk_image;
+       struct qcow_l1_table *l1t;
+       struct qcow_header *h;
+       struct qcow *q;
+
+       q = calloc(1, sizeof(struct qcow));
+       if (!q)
+               return NULL;
+
+       mutex_init(&q->mutex);
+       q->fd = fd;
+
+       l1t = &q->table;
+
+       l1t->root = RB_ROOT;
+       INIT_LIST_HEAD(&l1t->lru_list);
+
+       h = q->header = qcow1_read_header(fd);
+       if (!h)
+               goto free_qcow;
+
+       q->version = QCOW1_VERSION;
+       q->cluster_size = 1 << q->header->cluster_bits;
+       q->cluster_offset_mask = (1LL << (63 - q->header->cluster_bits)) - 1;
+       q->free_clust_idx = 0;
+
+       q->cluster_data = malloc(q->cluster_size);
+       if (!q->cluster_data) {
+               pr_warning("cluster data malloc error");
+               goto free_header;
+       }
+
+       q->cluster_cache = malloc(q->cluster_size);
+       if (!q->cluster_cache) {
+               pr_warning("cluster cache malloc error");
+               goto free_cluster_data;
+       }
+
+       if (qcow_read_l1_table(q) < 0)
+               goto free_cluster_cache;
+
+       /*
+        * Do not use mmap; use read/write instead.
+        */
+       if (readonly)
+               disk_image = disk_image__new(fd, h->size, &qcow_disk_readonly_ops, DISK_IMAGE_REGULAR);
+       else
+               disk_image = disk_image__new(fd, h->size, &qcow_disk_ops, DISK_IMAGE_REGULAR);
+
+       if (IS_ERR_OR_NULL(disk_image))
+               goto free_l1_table;
+
+       disk_image->async = 1;
+       disk_image->priv = q;
+
+       return disk_image;
+
+free_l1_table:
+       free(q->table.l1_table);
+free_cluster_cache:
+       free(q->cluster_cache);
+free_cluster_data:
+       free(q->cluster_data);
+free_header:
+       free(q->header);
+free_qcow:
+       free(q);
+
+       return NULL;
+}
+
+static bool qcow1_check_image(int fd)
+{
+       struct qcow1_header_disk f_header;
+
+       if (pread_in_full(fd, &f_header, sizeof(struct qcow1_header_disk), 0) < 0)
+               return false;
+
+       be32_to_cpus(&f_header.magic);
+       be32_to_cpus(&f_header.version);
+
+       if (f_header.magic != QCOW_MAGIC)
+               return false;
+
+       if (f_header.version != QCOW1_VERSION)
+               return false;
+
+       return true;
+}
+
+struct disk_image *qcow_probe(int fd, bool readonly)
+{
+       if (qcow1_check_image(fd))
+               return qcow1_probe(fd, readonly);
+
+       if (qcow2_check_image(fd))
+               return qcow2_probe(fd, readonly);
+
+       return NULL;
+}
diff --git a/tools/kvm/disk/raw.c b/tools/kvm/disk/raw.c
new file mode 100644 (file)
index 0000000..93b2b4e
--- /dev/null
@@ -0,0 +1,141 @@
+#include "kvm/disk-image.h"
+
+#include <linux/err.h>
+
+#ifdef CONFIG_HAS_AIO
+#include <libaio.h>
+#endif
+
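+/*
+ * With AIO support, requests are submitted asynchronously and their
+ * completion is signalled through disk->evt; otherwise plain
+ * preadv/pwritev are used.
+ */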
+ssize_t raw_image__read(struct disk_image *disk, u64 sector, const struct iovec *iov,
+                               int iovcount, void *param)
+{
+       u64 offset = sector << SECTOR_SHIFT;
+
+#ifdef CONFIG_HAS_AIO
+       struct iocb iocb;
+
+       return aio_preadv(disk->ctx, &iocb, disk->fd, iov, iovcount, offset,
+                               disk->evt, param);
+#else
+       return preadv_in_full(disk->fd, iov, iovcount, offset);
+#endif
+}
+
+ssize_t raw_image__write(struct disk_image *disk, u64 sector, const struct iovec *iov,
+                               int iovcount, void *param)
+{
+       u64 offset = sector << SECTOR_SHIFT;
+
+#ifdef CONFIG_HAS_AIO
+       struct iocb iocb;
+
+       return aio_pwritev(disk->ctx, &iocb, disk->fd, iov, iovcount, offset,
+                               disk->evt, param);
+#else
+       return pwritev_in_full(disk->fd, iov, iovcount, offset);
+#endif
+}
+
+ssize_t raw_image__read_mmap(struct disk_image *disk, u64 sector, const struct iovec *iov,
+                               int iovcount, void *param)
+{
+       u64 offset = sector << SECTOR_SHIFT;
+       ssize_t total = 0;
+
+       while (iovcount--) {
+               memcpy(iov->iov_base, disk->priv + offset, iov->iov_len);
+
+               sector  += iov->iov_len >> SECTOR_SHIFT;
+               offset  += iov->iov_len;
+               total   += iov->iov_len;
+               iov++;
+       }
+
+       return total;
+}
+
+ssize_t raw_image__write_mmap(struct disk_image *disk, u64 sector, const struct iovec *iov,
+                               int iovcount, void *param)
+{
+       u64 offset = sector << SECTOR_SHIFT;
+       ssize_t total = 0;
+
+       while (iovcount--) {
+               memcpy(disk->priv + offset, iov->iov_base, iov->iov_len);
+
+               sector  += iov->iov_len >> SECTOR_SHIFT;
+               offset  += iov->iov_len;
+               total   += iov->iov_len;
+               iov++;
+       }
+
+       return total;
+}
+
+int raw_image__close(struct disk_image *disk)
+{
+       int ret = 0;
+
+       if (disk->priv != MAP_FAILED)
+               ret = munmap(disk->priv, disk->size);
+
+       close(disk->evt);
+
+#ifdef CONFIG_HAS_AIO
+       io_destroy(disk->ctx);
+#endif
+
+       return ret;
+}
+
+/*
+ * multiple buffer based disk image operations
+ */
+static struct disk_image_operations raw_image_regular_ops = {
+       .read   = raw_image__read,
+       .write  = raw_image__write,
+};
+
+struct disk_image_operations ro_ops = {
+       .read   = raw_image__read_mmap,
+       .write  = raw_image__write_mmap,
+       .close  = raw_image__close,
+};
+
+struct disk_image_operations ro_ops_nowrite = {
+       .read   = raw_image__read,
+};
+
+struct disk_image *raw_image__probe(int fd, struct stat *st, bool readonly)
+{
+       struct disk_image *disk;
+
+       if (readonly) {
+               /*
+                * Use mmap's MAP_PRIVATE to implement non-persistent write
+                * FIXME: This does not work on 32-bit host.
+                */
+               disk = disk_image__new(fd, st->st_size, &ro_ops, DISK_IMAGE_MMAP);
+               if (IS_ERR_OR_NULL(disk)) {
+                       disk = disk_image__new(fd, st->st_size, &ro_ops_nowrite, DISK_IMAGE_REGULAR);
+#ifdef CONFIG_HAS_AIO
+                       if (!IS_ERR_OR_NULL(disk))
+                               disk->async = 1;
+#endif
+               }
+
+               return disk;
+       } else {
+               /*
+                * Use read/write instead of mmap
+                */
+               disk = disk_image__new(fd, st->st_size, &raw_image_regular_ops, DISK_IMAGE_REGULAR);
+#ifdef CONFIG_HAS_AIO
+               if (!IS_ERR_OR_NULL(disk))
+                       disk->async = 1;
+#endif
+               return disk;
+       }
+}
diff --git a/tools/kvm/framebuffer.c b/tools/kvm/framebuffer.c
new file mode 100644 (file)
index 0000000..e15b717
--- /dev/null
@@ -0,0 +1,75 @@
+#include "kvm/framebuffer.h"
+
+#include <linux/kernel.h>
+#include <linux/list.h>
+#include <stdlib.h>
+#include <sys/mman.h>
+#include <errno.h>
+
+static LIST_HEAD(framebuffers);
+
+struct framebuffer *fb__register(struct framebuffer *fb)
+{
+       INIT_LIST_HEAD(&fb->node);
+       list_add(&fb->node, &framebuffers);
+
+       return fb;
+}
+
+int fb__attach(struct framebuffer *fb, struct fb_target_operations *ops)
+{
+       if (fb->nr_targets >= FB_MAX_TARGETS)
+               return -ENOSPC;
+
+       fb->targets[fb->nr_targets++] = ops;
+
+       return 0;
+}
+
+static int start_targets(struct framebuffer *fb)
+{
+       unsigned long i;
+
+       for (i = 0; i < fb->nr_targets; i++) {
+               struct fb_target_operations *ops = fb->targets[i];
+               int err = 0;
+
+               if (ops->start)
+                       err = ops->start(fb);
+
+               if (err)
+                       return err;
+       }
+
+       return 0;
+}
+
+int fb__start(void)
+{
+       struct framebuffer *fb;
+
+       list_for_each_entry(fb, &framebuffers, node) {
+               int err;
+
+               err = start_targets(fb);
+               if (err)
+                       return err;
+       }
+
+       return 0;
+}
+
+void fb__stop(void)
+{
+       struct framebuffer *fb;
+
+       list_for_each_entry(fb, &framebuffers, node) {
+               u32 i;
+
+               for (i = 0; i < fb->nr_targets; i++)
+                       if (fb->targets[i]->stop)
+                               fb->targets[i]->stop(fb);
+
+               munmap(fb->mem, fb->mem_size);
+       }
+}
diff --git a/tools/kvm/guest/init.c b/tools/kvm/guest/init.c
new file mode 100644 (file)
index 0000000..ece48fd
--- /dev/null
@@ -0,0 +1,72 @@
+/*
+ * This is a simple init for shared rootfs guests. This part should be limited
+ * to doing mounts and running stage 2 of the init process.
+ */
+#include <sys/mount.h>
+#include <sys/stat.h>
+#include <sys/wait.h>
+#include <sys/ioctl.h>
+#include <sys/reboot.h>
+#include <string.h>
+#include <unistd.h>
+#include <stdio.h>
+#include <errno.h>
+#include <linux/reboot.h>
+
+static int run_process(char *filename)
+{
+       char *new_argv[] = { filename, NULL };
+       char *new_env[] = { "TERM=linux", "DISPLAY=192.168.33.1:0",
+                               "HOME=/virt/home", NULL };
+
+       return execve(filename, new_argv, new_env);
+}
+
+static int run_process_sandbox(char *filename)
+{
+       char *new_argv[] = { filename, "/virt/sandbox.sh", NULL };
+       char *new_env[] = { "TERM=linux", "HOME=/virt/home", NULL };
+
+       return execve(filename, new_argv, new_env);
+}
+
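+/*
+ * Mount the 9p host share and the pseudo-filesystems the stage 2 init
+ * expects to find.
+ */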
+static void do_mounts(void)
+{
+       mount("hostfs", "/host", "9p", MS_RDONLY, "trans=virtio,version=9p2000.L");
+       mount("", "/sys", "sysfs", 0, NULL);
+       mount("proc", "/proc", "proc", 0, NULL);
+       mount("devtmpfs", "/dev", "devtmpfs", 0, NULL);
+       mkdir("/dev/pts", 0755);
+       mount("devpts", "/dev/pts", "devpts", 0, NULL);
+}
+
+int main(int argc, char *argv[])
+{
+       pid_t child;
+       int status;
+
+       puts("Mounting...");
+
+       do_mounts();
+
+       /* get session leader */
+       setsid();
+
+       /* set controlling terminal */
+       ioctl(0, TIOCSCTTY, 1);
+
+       child = fork();
+       if (child < 0) {
+               printf("Fatal: fork() failed with %d\n", child);
+               return 0;
+       } else if (child == 0) {
+               if (access("/virt/sandbox.sh", R_OK) == 0)
+                       run_process_sandbox("/bin/sh");
+               else
+                       run_process("/bin/sh");
+       } else {
+               waitpid(child, &status, 0);
+       }
+
+       reboot(LINUX_REBOOT_CMD_RESTART);
+
+       printf("Init failed: %s\n", strerror(errno));
+
+       return 0;
+}
diff --git a/tools/kvm/guest_compat.c b/tools/kvm/guest_compat.c
new file mode 100644 (file)
index 0000000..fd4704b
--- /dev/null
@@ -0,0 +1,99 @@
+#include "kvm/guest_compat.h"
+
+#include "kvm/mutex.h"
+
+#include <linux/kernel.h>
+#include <linux/list.h>
+
+#include <errno.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+struct compat_message {
+       int id;
+       char *title;
+       char *desc;
+
+       struct list_head list;
+};
+
+static int id;
+static DEFINE_MUTEX(compat_mtx);
+static LIST_HEAD(messages);
+
+static void compat__free(struct compat_message *msg)
+{
+       free(msg->title);
+       free(msg->desc);
+       free(msg);
+}
+
+int compat__add_message(const char *title, const char *desc)
+{
+       struct compat_message *msg;
+       int msg_id;
+
+       msg = malloc(sizeof(*msg));
+       if (msg == NULL)
+               goto cleanup;
+
+       msg->title = strdup(title);
+       msg->desc = strdup(desc);
+
+       if (msg->title == NULL || msg->desc == NULL)
+               goto cleanup;
+
+       mutex_lock(&compat_mtx);
+
+       msg->id = msg_id = id++;
+       list_add_tail(&msg->list, &messages);
+
+       mutex_unlock(&compat_mtx);
+
+       return msg_id;
+
+cleanup:
+       if (msg)
+               compat__free(msg);
+
+       return -ENOMEM;
+}
+
+int compat__remove_message(int msg_id)
+{
+       struct compat_message *pos, *n;
+
+       mutex_lock(&compat_mtx);
+
+       list_for_each_entry_safe(pos, n, &messages, list) {
+               if (pos->id == msg_id) {
+                       list_del(&pos->list);
+                       compat__free(pos);
+
+                       mutex_unlock(&compat_mtx);
+
+                       return 0;
+               }
+       }
+
+       mutex_unlock(&compat_mtx);
+
+       return -ENOENT;
+}
+
+int compat__print_all_messages(void)
+{
+       mutex_lock(&compat_mtx);
+
+       while (!list_empty(&messages)) {
+               struct compat_message *msg;
+
+               msg = list_first_entry(&messages, struct compat_message, list);
+
+               printf("\n  # KVM compatibility warning.\n\t%s\n\t%s\n",
+                       msg->title, msg->desc);
+
+               list_del(&msg->list);
+               compat__free(msg);
+       }
+
+       mutex_unlock(&compat_mtx);
+
+       return 0;
+}
diff --git a/tools/kvm/hw/i8042.c b/tools/kvm/hw/i8042.c
new file mode 100644 (file)
index 0000000..3a36425
--- /dev/null
@@ -0,0 +1,348 @@
+#include "kvm/read-write.h"
+#include "kvm/ioport.h"
+#include "kvm/mutex.h"
+#include "kvm/util.h"
+#include "kvm/term.h"
+#include "kvm/kvm.h"
+#include "kvm/i8042.h"
+#include "kvm/kvm-cpu.h"
+
+#include <stdint.h>
+
+/*
+ * IRQs
+ */
+#define KBD_IRQ                        1
+#define AUX_IRQ                        12
+
+/*
+ * Registers
+ */
+#define I8042_DATA_REG         0x60
+#define I8042_COMMAND_REG      0x64
+
+/*
+ * Commands
+ */
+#define I8042_CMD_CTL_RCTR     0x20
+#define I8042_CMD_CTL_WCTR     0x60
+#define I8042_CMD_AUX_LOOP     0xD3
+#define I8042_CMD_AUX_SEND     0xD4
+#define I8042_CMD_AUX_TEST     0xA9
+#define I8042_CMD_AUX_DISABLE  0xA7
+#define I8042_CMD_AUX_ENABLE   0xA8
+#define I8042_CMD_SYSTEM_RESET 0xFE
+
+#define RESPONSE_ACK           0xFA
+
+#define MODE_DISABLE_AUX       0x20
+
+#define AUX_ENABLE_REPORTING   0x20
+#define AUX_SCALING_FLAG       0x10
+#define AUX_DEFAULT_RESOLUTION 0x2
+#define AUX_DEFAULT_SAMPLE     100
+
+/*
+ * Status register bits
+ */
+#define I8042_STR_AUXDATA      0x20
+#define I8042_STR_KEYLOCK      0x10
+#define I8042_STR_CMDDAT       0x08
+#define I8042_STR_MUXERR       0x04
+#define I8042_STR_OBF          0x01
+
+#define KBD_MODE_KBD_INT       0x01
+#define KBD_MODE_SYS           0x02
+
+#define QUEUE_SIZE             128
+
+/*
+ * This represents the current state of the PS/2 keyboard system,
+ * including the AUX device (the mouse)
+ */
+struct kbd_state {
+       struct kvm              *kvm;
+
+       char                    kq[QUEUE_SIZE]; /* Keyboard queue */
+       int                     kread, kwrite;  /* Indexes into the queue */
+       int                     kcount;         /* number of elements in queue */
+
+       char                    mq[QUEUE_SIZE];
+       int                     mread, mwrite;
+       int                     mcount;
+
+       u8                      mstatus;        /* Mouse status byte */
+       u8                      mres;           /* Current mouse resolution */
+       u8                      msample;        /* Current mouse samples/second */
+
+       u8                      mode;           /* i8042 mode register */
+       u8                      status;         /* i8042 status register */
+       /*
+        * Some commands (on port 0x64) have arguments;
+        * we store the command here while we wait for the argument
+        */
+       u32                     write_cmd;
+};
+
+static struct kbd_state                state;
+
+/*
+ * If there are packets to be read, set the appropriate IRQs high
+ */
+static void kbd_update_irq(void)
+{
+       u8 klevel = 0;
+       u8 mlevel = 0;
+
+       /* First, clear the kbd and aux output buffer full bits */
+       state.status &= ~(I8042_STR_OBF | I8042_STR_AUXDATA);
+
+       if (state.kcount > 0) {
+               state.status |= I8042_STR_OBF;
+               klevel = 1;
+       }
+
+       /* Keyboard has higher priority than mouse */
+       if (klevel == 0 && state.mcount != 0) {
+               state.status |= I8042_STR_OBF | I8042_STR_AUXDATA;
+               mlevel = 1;
+       }
+
+       kvm__irq_line(state.kvm, KBD_IRQ, klevel);
+       kvm__irq_line(state.kvm, AUX_IRQ, mlevel);
+}
+
+/*
+ * Add a byte to the mouse queue, then set IRQs
+ */
+void mouse_queue(u8 c)
+{
+       if (state.mcount >= QUEUE_SIZE)
+               return;
+
+       state.mq[state.mwrite++ % QUEUE_SIZE] = c;
+
+       state.mcount++;
+       kbd_update_irq();
+}
+
+/*
+ * Add a byte to the keyboard queue, then set IRQs
+ */
+void kbd_queue(u8 c)
+{
+       if (state.kcount >= QUEUE_SIZE)
+               return;
+
+       state.kq[state.kwrite++ % QUEUE_SIZE] = c;
+
+       state.kcount++;
+       kbd_update_irq();
+}
+
+static void kbd_write_command(struct kvm *kvm, u8 val)
+{
+       switch (val) {
+       case I8042_CMD_CTL_RCTR:
+               kbd_queue(state.mode);
+               break;
+       case I8042_CMD_CTL_WCTR:
+       case I8042_CMD_AUX_SEND:
+       case I8042_CMD_AUX_LOOP:
+               state.write_cmd = val;
+               break;
+       case I8042_CMD_AUX_TEST:
+               /* 0 means we're a normal PS/2 mouse */
+               mouse_queue(0);
+               break;
+       case I8042_CMD_AUX_DISABLE:
+               state.mode |= MODE_DISABLE_AUX;
+               break;
+       case I8042_CMD_AUX_ENABLE:
+               state.mode &= ~MODE_DISABLE_AUX;
+               break;
+       case I8042_CMD_SYSTEM_RESET:
+               kvm_cpu__reboot();
+               break;
+       default:
+               break;
+       }
+}
+
+/*
+ * Called when the OS reads from port 0x60 (PS/2 data)
+ */
+static u32 kbd_read_data(void)
+{
+       u32 ret;
+       int i;
+
+       if (state.kcount != 0) {
+               /* Keyboard data gets read first */
+               ret = state.kq[state.kread++ % QUEUE_SIZE];
+               state.kcount--;
+               kvm__irq_line(state.kvm, KBD_IRQ, 0);
+               kbd_update_irq();
+       } else if (state.mcount > 0) {
+               /* Followed by the mouse */
+               ret = state.mq[state.mread++ % QUEUE_SIZE];
+               state.mcount--;
+               kvm__irq_line(state.kvm, AUX_IRQ, 0);
+               kbd_update_irq();
+       } else {
+               /* Queue is empty; repeat the last byte that was read */
+               i = (state.kread + QUEUE_SIZE - 1) % QUEUE_SIZE;
+               ret = state.kq[i];
+       }
+       return ret;
+}
+
+/*
+ * Called when the OS read from port 0x64, the command port
+ */
+static u32 kbd_read_status(void)
+{
+       return (u32)state.status;
+}
+
+/*
+ * Called when the OS writes to port 0x60 (data port)
+ * Things written here are generally arguments to commands previously
+ * written to port 0x64 and stored in state.write_cmd
+ */
+static void kbd_write_data(u32 val)
+{
+       switch (state.write_cmd) {
+       case I8042_CMD_CTL_WCTR:
+               state.mode = val;
+               kbd_update_irq();
+               break;
+       case I8042_CMD_AUX_LOOP:
+               mouse_queue(val);
+               mouse_queue(RESPONSE_ACK);
+               break;
+       case I8042_CMD_AUX_SEND:
+               /* The OS wants to send a command to the mouse */
+               mouse_queue(RESPONSE_ACK);
+               switch (val) {
+               case 0xe6:
+                       /* set scaling = 1:1 */
+                       state.mstatus &= ~AUX_SCALING_FLAG;
+                       break;
+               case 0xe8:
+                       /* set resolution */
+                       state.mres = val;
+                       break;
+               case 0xe9:
+                       /* Report mouse status/config */
+                       mouse_queue(state.mstatus);
+                       mouse_queue(state.mres);
+                       mouse_queue(state.msample);
+                       break;
+               case 0xf2:
+                       /* send ID */
+                       mouse_queue(0); /* normal mouse */
+                       break;
+               case 0xf3:
+                       /* set sample rate */
+                       state.msample = val;
+                       break;
+               case 0xf4:
+                       /* enable reporting */
+                       state.mstatus |= AUX_ENABLE_REPORTING;
+                       break;
+               case 0xf5:
+                       state.mstatus &= ~AUX_ENABLE_REPORTING;
+                       break;
+               case 0xf6:
+                       /* set defaults, just fall through to reset */
+               case 0xff:
+                       /* reset */
+                       state.mstatus = 0x0;
+                       state.mres = AUX_DEFAULT_RESOLUTION;
+                       state.msample = AUX_DEFAULT_SAMPLE;
+                       break;
+               default:
+                       break;
+               }
+               break;
+       case 0:
+               /* Just send the ID */
+               kbd_queue(RESPONSE_ACK);
+               kbd_queue(0xab);
+               kbd_queue(0x41);
+               kbd_update_irq();
+               break;
+       default:
+               /* Yeah whatever */
+               break;
+       }
+       state.write_cmd = 0;
+}
+
+static void kbd_reset(void)
+{
+       state = (struct kbd_state) {
+               .status         = I8042_STR_MUXERR | I8042_STR_CMDDAT | I8042_STR_KEYLOCK, /* 0x1c */
+               .mode           = KBD_MODE_KBD_INT | KBD_MODE_SYS, /* 0x3 */
+               .mres           = AUX_DEFAULT_RESOLUTION,
+               .msample        = AUX_DEFAULT_SAMPLE,
+       };
+}
+
+/*
+ * Called when the OS has written to one of the keyboard's ports (0x60 or 0x64)
+ */
+static bool kbd_in(struct ioport *ioport, struct kvm *kvm, u16 port, void *data, int size)
+{
+       switch (port) {
+       case I8042_COMMAND_REG: {
+               u8 value = kbd_read_status();
+               ioport__write8(data, value);
+               break;
+       }
+       case I8042_DATA_REG: {
+               u32 value = kbd_read_data();
+               ioport__write32(data, value);
+               break;
+       }
+       default:
+               return false;
+       }
+
+       return true;
+}
+
+static bool kbd_out(struct ioport *ioport, struct kvm *kvm, u16 port, void *data, int size)
+{
+       switch (port) {
+       case I8042_COMMAND_REG: {
+               u8 value = ioport__read8(data);
+               kbd_write_command(kvm, value);
+               break;
+       }
+       case I8042_DATA_REG: {
+               u32 value = ioport__read32(data);
+               kbd_write_data(value);
+               break;
+       }
+       default:
+               return false;
+       }
+
+       return true;
+}
+
+static struct ioport_operations kbd_ops = {
+       .io_in          = kbd_in,
+       .io_out         = kbd_out,
+};
+
+void kbd__init(struct kvm *kvm)
+{
+       kbd_reset();
+       state.kvm = kvm;
+       ioport__register(I8042_DATA_REG, &kbd_ops, 2, NULL);
+       ioport__register(I8042_COMMAND_REG, &kbd_ops, 2, NULL);
+}
diff --git a/tools/kvm/hw/pci-shmem.c b/tools/kvm/hw/pci-shmem.c
new file mode 100644 (file)
index 0000000..ac2d264
--- /dev/null
@@ -0,0 +1,268 @@
+#include "kvm/pci-shmem.h"
+#include "kvm/virtio-pci-dev.h"
+#include "kvm/irq.h"
+#include "kvm/kvm.h"
+#include "kvm/pci.h"
+#include "kvm/util.h"
+#include "kvm/ioport.h"
+#include "kvm/ioeventfd.h"
+
+#include <linux/kvm.h>
+#include <linux/byteorder.h>
+#include <sys/ioctl.h>
+#include <sys/eventfd.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+#include <sys/mman.h>
+
+static struct pci_device_header pci_shmem_pci_device = {
+       .vendor_id      = cpu_to_le16(PCI_VENDOR_ID_REDHAT_QUMRANET),
+       .device_id      = cpu_to_le16(0x1110),
+       .header_type    = PCI_HEADER_TYPE_NORMAL,
+       .class[2]       = 0xFF, /* misc pci device */
+       .status         = cpu_to_le16(PCI_STATUS_CAP_LIST),
+       .capabilities   = (void *)&pci_shmem_pci_device.msix - (void *)&pci_shmem_pci_device,
+       .msix.cap       = PCI_CAP_ID_MSIX,
+       .msix.ctrl      = cpu_to_le16(1),
+       .msix.table_offset = cpu_to_le32(1),            /* Use BAR 1 */
+       .msix.pba_offset = cpu_to_le32(0x1001),         /* Use BAR 1 */
+};
+
+/* registers for the Inter-VM shared memory device */
+enum ivshmem_registers {
+       INTRMASK = 0,
+       INTRSTATUS = 4,
+       IVPOSITION = 8,
+       DOORBELL = 12,
+};
+
+static struct shmem_info *shmem_region;
+static u16 ivshmem_registers;
+static int local_fd;
+static u32 local_id;
+static u64 msix_block;
+static u64 msix_pba;
+static struct msix_table msix_table[2];
+
+int pci_shmem__register_mem(struct shmem_info *si)
+{
+       if (shmem_region == NULL) {
+               shmem_region = si;
+       } else {
+               pr_warning("only single shmem currently avail. ignoring.\n");
+               free(si);
+       }
+       return 0;
+}
+
+static bool shmem_pci__io_in(struct ioport *ioport, struct kvm *kvm, u16 port, void *data, int size)
+{
+       u16 offset = port - ivshmem_registers;
+
+       switch (offset) {
+       case INTRMASK:
+               break;
+       case INTRSTATUS:
+               break;
+       case IVPOSITION:
+               ioport__write32(data, local_id);
+               break;
+       case DOORBELL:
+               break;
+       }
+
+       return true;
+}
+
+static bool shmem_pci__io_out(struct ioport *ioport, struct kvm *kvm, u16 port, void *data, int size)
+{
+       u16 offset = port - ivshmem_registers;
+
+       switch (offset) {
+       case INTRMASK:
+               break;
+       case INTRSTATUS:
+               break;
+       case IVPOSITION:
+               break;
+       case DOORBELL:
+               break;
+       }
+
+       return true;
+}
+
+static struct ioport_operations shmem_pci__io_ops = {
+       .io_in  = shmem_pci__io_in,
+       .io_out = shmem_pci__io_out,
+};
+
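+/*
+ * MMIO handler for the MSI-X BAR: the first 0x1000 bytes back the
+ * MSI-X table, the rest backs the pending bit array (PBA).
+ */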
+static void callback_mmio_msix(u64 addr, u8 *data, u32 len, u8 is_write, void *ptr)
+{
+       void *mem;
+
+       if (addr - msix_block < 0x1000)
+               mem = &msix_table;
+       else
+               mem = &msix_pba;
+
+       if (is_write)
+               memcpy(mem + addr - msix_block, data, len);
+       else
+               memcpy(data, mem + addr - msix_block, len);
+}
+
+/*
+ * Return an irqfd which can be used by other guests to signal this guest
+ * whenever they need to poke it
+ */
+int pci_shmem__get_local_irqfd(struct kvm *kvm)
+{
+       int fd, gsi, r;
+       struct kvm_irqfd irqfd;
+
+       if (local_fd == 0) {
+               fd = eventfd(0, 0);
+               if (fd < 0)
+                       return fd;
+
+               if (pci_shmem_pci_device.msix.ctrl & cpu_to_le16(PCI_MSIX_FLAGS_ENABLE)) {
+                       gsi = irq__add_msix_route(kvm, &msix_table[0].msg);
+               } else {
+                       gsi = pci_shmem_pci_device.irq_line;
+               }
+
+               irqfd = (struct kvm_irqfd) {
+                       .fd = fd,
+                       .gsi = gsi,
+               };
+
+               r = ioctl(kvm->vm_fd, KVM_IRQFD, &irqfd);
+               if (r < 0)
+                       return r;
+
+               local_fd = fd;
+       }
+
+       return local_fd;
+}
+
+/*
+ * Connect a new client to ivshmem by adding the appropriate datamatch
+ * to the DOORBELL
+ */
+int pci_shmem__add_client(struct kvm *kvm, u32 id, int fd)
+{
+       struct kvm_ioeventfd ioevent;
+
+       ioevent = (struct kvm_ioeventfd) {
+               .addr           = ivshmem_registers + DOORBELL,
+               .len            = sizeof(u32),
+               .datamatch      = id,
+               .fd             = fd,
+               .flags          = KVM_IOEVENTFD_FLAG_PIO | KVM_IOEVENTFD_FLAG_DATAMATCH,
+       };
+
+       return ioctl(kvm->vm_fd, KVM_IOEVENTFD, &ioevent);
+}
+
+/*
+ * Remove a client connected to ivshmem by removing the appropriate datamatch
+ * from the DOORBELL
+ */
+int pci_shmem__remove_client(struct kvm *kvm, u32 id)
+{
+       struct kvm_ioeventfd ioevent;
+
+       ioevent = (struct kvm_ioeventfd) {
+               .addr           = ivshmem_registers + DOORBELL,
+               .len            = sizeof(u32),
+               .datamatch      = id,
+               .flags          = KVM_IOEVENTFD_FLAG_PIO
+                               | KVM_IOEVENTFD_FLAG_DATAMATCH
+                               | KVM_IOEVENTFD_FLAG_DEASSIGN,
+       };
+
+       return ioctl(kvm->vm_fd, KVM_IOEVENTFD, &ioevent);
+}
+
+static void *setup_shmem(const char *key, size_t len, int creating)
+{
+       int fd;
+       int rtn;
+       void *mem;
+       int flag = O_RDWR;
+
+       if (creating)
+               flag |= O_CREAT;
+
+       fd = shm_open(key, flag, S_IRUSR | S_IWUSR);
+       if (fd < 0) {
+               pr_warning("Failed to open shared memory file %s\n", key);
+               return NULL;
+       }
+
+       if (creating) {
+               rtn = ftruncate(fd, (off_t) len);
+               if (rtn < 0)
+                       pr_warning("Can't ftruncate(fd,%zu)\n", len);
+       }
+       mem = mmap(NULL, len,
+                  PROT_READ | PROT_WRITE, MAP_SHARED | MAP_NORESERVE, fd, 0);
+       if (mem == MAP_FAILED) {
+               pr_warning("Failed to mmap shared memory file");
+               mem = NULL;
+       }
+       close(fd);
+
+       return mem;
+}
+
+int pci_shmem__init(struct kvm *kvm)
+{
+       u8 dev, line, pin;
+       char *mem;
+       int r;
+
+       if (shmem_region == NULL)
+               return 0;
+
+       /* Register good old INTx */
+       if (irq__register_device(PCI_DEVICE_ID_PCI_SHMEM, &dev, &pin, &line) < 0)
+               return 0;
+
+       pci_shmem_pci_device.irq_pin = pin;
+       pci_shmem_pci_device.irq_line = line;
+
+       /* Register I/O port space for the ivshmem registers (BAR 0) */
+       r = ioport__register(IOPORT_EMPTY, &shmem_pci__io_ops, IOPORT_SIZE, NULL);
+       if (r < 0)
+               return r;
+       ivshmem_registers = (u16)r;
+
+       /* Register MMIO space for the MSI-X table and PBA (BAR 1) */
+       msix_block = pci_get_io_space_block(0x1010);
+       kvm__register_mmio(kvm, msix_block, 0x1010, false, callback_mmio_msix, NULL);
+
+       /*
+        * This registers 3 BARs:
+        *
+        * 0 - ivshmem registers
+        * 1 - MSI-X MMIO space
+        * 2 - Shared memory block
+        */
+       pci_shmem_pci_device.bar[0] = cpu_to_le32(ivshmem_registers | PCI_BASE_ADDRESS_SPACE_IO);
+       /* the I/O BAR covers the register ports, not the shared memory */
+       pci_shmem_pci_device.bar_size[0] = IOPORT_SIZE;
+       pci_shmem_pci_device.bar[1] = cpu_to_le32(msix_block | PCI_BASE_ADDRESS_SPACE_MEMORY);
+       pci_shmem_pci_device.bar_size[1] = 0x1010;
+       pci_shmem_pci_device.bar[2] = cpu_to_le32(shmem_region->phys_addr | PCI_BASE_ADDRESS_SPACE_MEMORY);
+       pci_shmem_pci_device.bar_size[2] = shmem_region->size;
+
+       pci__register(&pci_shmem_pci_device, dev);
+
+       /* Open shared memory and plug it into the guest */
+       mem = setup_shmem(shmem_region->handle, shmem_region->size,
+                               shmem_region->create);
+       if (mem == NULL)
+               return 0;
+       kvm__register_mem(kvm, shmem_region->phys_addr, shmem_region->size,
+                         mem);
+       return 1;
+}
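
For illustration, a minimal sketch of how the irqfd and doorbell plumbing above fits together. connect_peers() is hypothetical (in practice the eventfd travels between lkvm processes over a socket); only pci_shmem__get_local_irqfd() and pci_shmem__add_client() come from this file:

    /*
     * Guest A exports its irqfd; guest B's DOORBELL gets a datamatch on
     * A's id, so writing A's id to B's DOORBELL interrupts A.
     */
    int connect_peers(struct kvm *a, struct kvm *b, u32 a_id)
    {
            int fd = pci_shmem__get_local_irqfd(a);

            if (fd < 0)
                    return fd;

            return pci_shmem__add_client(b, a_id, fd);
    }
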
diff --git a/tools/kvm/hw/rtc.c b/tools/kvm/hw/rtc.c
new file mode 100644 (file)
index 0000000..b4f9f1f
--- /dev/null
@@ -0,0 +1,137 @@
+#include "kvm/rtc.h"
+
+#include "kvm/ioport.h"
+#include "kvm/kvm.h"
+
+#include <time.h>
+
+/*
+ * MC146818 RTC registers
+ */
+#define RTC_SECONDS                    0x00
+#define RTC_SECONDS_ALARM              0x01
+#define RTC_MINUTES                    0x02
+#define RTC_MINUTES_ALARM              0x03
+#define RTC_HOURS                      0x04
+#define RTC_HOURS_ALARM                        0x05
+#define RTC_DAY_OF_WEEK                        0x06
+#define RTC_DAY_OF_MONTH               0x07
+#define RTC_MONTH                      0x08
+#define RTC_YEAR                       0x09
+
+#define RTC_REG_A                      0x0A
+#define RTC_REG_B                      0x0B
+#define RTC_REG_C                      0x0C
+#define RTC_REG_D                      0x0D
+
+struct rtc_device {
+       u8                      cmos_idx;
+       u8                      cmos_data[128];
+};
+
+static struct rtc_device       rtc;
+
+static inline unsigned char bin2bcd(unsigned val)
+{
+       return ((val / 10) << 4) + val % 10;
+}
+
+static bool cmos_ram_data_in(struct ioport *ioport, struct kvm *kvm, u16 port, void *data, int size)
+{
+       struct tm *tm;
+       time_t ti;
+
+       time(&ti);
+
+       tm = gmtime(&ti);
+
+       switch (rtc.cmos_idx) {
+       case RTC_SECONDS:
+               ioport__write8(data, bin2bcd(tm->tm_sec));
+               break;
+       case RTC_MINUTES:
+               ioport__write8(data, bin2bcd(tm->tm_min));
+               break;
+       case RTC_HOURS:
+               ioport__write8(data, bin2bcd(tm->tm_hour));
+               break;
+       case RTC_DAY_OF_WEEK:
+               ioport__write8(data, bin2bcd(tm->tm_wday + 1));
+               break;
+       case RTC_DAY_OF_MONTH:
+               ioport__write8(data, bin2bcd(tm->tm_mday));
+               break;
+       case RTC_MONTH:
+               ioport__write8(data, bin2bcd(tm->tm_mon + 1));
+               break;
+       case RTC_YEAR:
+               /* tm_year counts from 1900; the RTC stores two BCD digits */
+               ioport__write8(data, bin2bcd(tm->tm_year % 100));
+               break;
+       default:
+               ioport__write8(data, rtc.cmos_data[rtc.cmos_idx]);
+               break;
+       }
+
+       return true;
+}
+
+static bool cmos_ram_data_out(struct ioport *ioport, struct kvm *kvm, u16 port, void *data, int size)
+{
+       switch (rtc.cmos_idx) {
+       case RTC_REG_C:
+       case RTC_REG_D:
+               /* Read-only */
+               break;
+       default:
+               rtc.cmos_data[rtc.cmos_idx] = ioport__read8(data);
+               break;
+       }
+
+       return true;
+}
+
+static struct ioport_operations cmos_ram_data_ioport_ops = {
+       .io_out         = cmos_ram_data_out,
+       .io_in          = cmos_ram_data_in,
+};
+
+static bool cmos_ram_index_out(struct ioport *ioport, struct kvm *kvm, u16 port, void *data, int size)
+{
+       u8 value = ioport__read8(data);
+
+       kvm->nmi_disabled       = value & (1UL << 7);
+       rtc.cmos_idx            = value & ~(1UL << 7);
+
+       return true;
+}
+
+static struct ioport_operations cmos_ram_index_ioport_ops = {
+       .io_out         = cmos_ram_index_out,
+};
+
+int rtc__init(struct kvm *kvm)
+{
+       int r = 0;
+
+       /* PORT 0070-007F - CMOS RAM/RTC (REAL TIME CLOCK) */
+       r = ioport__register(0x0070, &cmos_ram_index_ioport_ops, 1, NULL);
+       if (r < 0)
+               return r;
+
+       r = ioport__register(0x0071, &cmos_ram_data_ioport_ops, 1, NULL);
+       if (r < 0) {
+               /* roll back the index port that did register */
+               ioport__unregister(0x0070);
+               return r;
+       }
+
+       return r;
+}
+
+int rtc__exit(struct kvm *kvm)
+{
+       /* PORT 0070-007F - CMOS RAM/RTC (REAL TIME CLOCK) */
+       ioport__unregister(0x0070);
+       ioport__unregister(0x0071);
+
+       return 0;
+}
\ No newline at end of file
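
The index/data handlers above emulate the classic CMOS access protocol; a guest-side sketch of one read (assumed typical x86 port I/O, not part of this patch):

    /*
     * Select a CMOS register through port 0x70 (bit 7 gates NMIs), then
     * read it back through port 0x71, which lands in cmos_ram_data_in().
     */
    static unsigned char cmos_read(unsigned char idx)
    {
            outb(idx & 0x7f, 0x70);         /* keep NMIs enabled */
            return inb(0x71);
    }
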
diff --git a/tools/kvm/hw/serial.c b/tools/kvm/hw/serial.c
new file mode 100644 (file)
index 0000000..956307c
--- /dev/null
@@ -0,0 +1,446 @@
+#include "kvm/8250-serial.h"
+
+#include "kvm/read-write.h"
+#include "kvm/ioport.h"
+#include "kvm/mutex.h"
+#include "kvm/util.h"
+#include "kvm/term.h"
+#include "kvm/kvm.h"
+
+#include <linux/types.h>
+#include <linux/serial_reg.h>
+
+#include <pthread.h>
+
+/*
+ * This fakes a U6_16550A. The fifo len needs to be 64 as the kernel
+ * expects that for autodetection.
+ */
+#define FIFO_LEN               64
+#define FIFO_MASK              (FIFO_LEN - 1)
+
+#define UART_IIR_TYPE_BITS     0xc0
+
+struct serial8250_device {
+       pthread_mutex_t         mutex;
+       u8                      id;
+
+       u16                     iobase;
+       u8                      irq;
+       u8                      irq_state;
+       int                     txcnt;
+       int                     rxcnt;
+       int                     rxdone;
+       char                    txbuf[FIFO_LEN];
+       char                    rxbuf[FIFO_LEN];
+
+       u8                      dll;
+       u8                      dlm;
+       u8                      iir;
+       u8                      ier;
+       u8                      fcr;
+       u8                      lcr;
+       u8                      mcr;
+       u8                      lsr;
+       u8                      msr;
+       u8                      scr;
+};
+
+#define SERIAL_REGS_SETTING \
+       .iir                    = UART_IIR_NO_INT, \
+       .lsr                    = UART_LSR_TEMT | UART_LSR_THRE, \
+       .msr                    = UART_MSR_DCD | UART_MSR_DSR | UART_MSR_CTS, \
+       .mcr                    = UART_MCR_OUT2,
+
+static struct serial8250_device devices[] = {
+       /* ttyS0 */
+       [0]     = {
+               .mutex                  = PTHREAD_MUTEX_INITIALIZER,
+
+               .id                     = 0,
+               .iobase                 = 0x3f8,
+               .irq                    = 4,
+
+               SERIAL_REGS_SETTING
+       },
+       /* ttyS1 */
+       [1]     = {
+               .mutex                  = PTHREAD_MUTEX_INITIALIZER,
+
+               .id                     = 1,
+               .iobase                 = 0x2f8,
+               .irq                    = 3,
+
+               SERIAL_REGS_SETTING
+       },
+       /* ttyS2 */
+       [2]     = {
+               .mutex                  = PTHREAD_MUTEX_INITIALIZER,
+
+               .id                     = 2,
+               .iobase                 = 0x3e8,
+               .irq                    = 4,
+
+               SERIAL_REGS_SETTING
+       },
+       /* ttyS3 */
+       [3]     = {
+               .mutex                  = PTHREAD_MUTEX_INITIALIZER,
+
+               .id                     = 3,
+               .iobase                 = 0x2e8,
+               .irq                    = 3,
+
+               SERIAL_REGS_SETTING
+       },
+};
+
+static void serial8250_flush_tx(struct serial8250_device *dev)
+{
+       dev->lsr |= UART_LSR_TEMT | UART_LSR_THRE;
+
+       if (dev->txcnt) {
+               term_putc(CONSOLE_8250, dev->txbuf, dev->txcnt, dev->id);
+               dev->txcnt = 0;
+       }
+}
+
+static void serial8250_update_irq(struct kvm *kvm, struct serial8250_device *dev)
+{
+       u8 iir = 0;
+
+       /* Handle clear rx (the self-clearing FIFO-clear bits live in FCR) */
+       if (dev->fcr & UART_FCR_CLEAR_RCVR) {
+               dev->fcr &= ~UART_FCR_CLEAR_RCVR;
+               dev->rxcnt = dev->rxdone = 0;
+               dev->lsr &= ~UART_LSR_DR;
+       }
+
+       /* Handle clear tx */
+       if (dev->fcr & UART_FCR_CLEAR_XMIT) {
+               dev->fcr &= ~UART_FCR_CLEAR_XMIT;
+               dev->txcnt = 0;
+               dev->lsr |= UART_LSR_TEMT | UART_LSR_THRE;
+       }
+
+       /* Data ready and rcv interrupt enabled ? */
+       if ((dev->ier & UART_IER_RDI) && (dev->lsr & UART_LSR_DR))
+               iir |= UART_IIR_RDI;
+
+       /* Transmitter empty and interrupt enabled ? */
+       if ((dev->ier & UART_IER_THRI) && (dev->lsr & UART_LSR_TEMT))
+               iir |= UART_IIR_THRI;
+
+       /* Now update the irq line, if necessary */
+       if (!iir) {
+               dev->iir = UART_IIR_NO_INT;
+               if (dev->irq_state)
+                       kvm__irq_line(kvm, dev->irq, 0);
+       } else {
+               dev->iir = iir;
+               if (!dev->irq_state)
+                       kvm__irq_line(kvm, dev->irq, 1);
+       }
+       dev->irq_state = iir;
+
+       /*
+        * If the kernel disabled the tx interrupt, we know that there
+        * is nothing more to transmit, so we can reset our tx logic
+        * here.
+        */
+       if (!(dev->ier & UART_IER_THRI))
+               serial8250_flush_tx(dev);
+}
+
+#define SYSRQ_PENDING_NONE             0
+
+static int sysrq_pending;
+
+static void serial8250__sysrq(struct kvm *kvm, struct serial8250_device *dev)
+{
+       dev->lsr |= UART_LSR_DR | UART_LSR_BI;
+       dev->rxbuf[dev->rxcnt++] = sysrq_pending;
+       sysrq_pending   = SYSRQ_PENDING_NONE;
+}
+
+static void serial8250__receive(struct kvm *kvm, struct serial8250_device *dev,
+                               bool handle_sysrq)
+{
+       int c;
+
+       /*
+        * If the guest transmitted a full fifo, we clear the
+        * TEMT/THRE bits to let the kernel escape from the 8250
+        * interrupt handler. We come here only once a ms, so that
+        * should give the kernel the desired pause. That also flushes
+        * the tx fifo to the terminal.
+        */
+       serial8250_flush_tx(dev);
+
+       if (dev->mcr & UART_MCR_LOOP)
+               return;
+
+       if ((dev->lsr & UART_LSR_DR) || dev->rxcnt)
+               return;
+
+       if (handle_sysrq && sysrq_pending) {
+               serial8250__sysrq(kvm, dev);
+               return;
+       }
+
+       while (term_readable(CONSOLE_8250, dev->id) &&
+              dev->rxcnt < FIFO_LEN) {
+
+               c = term_getc(CONSOLE_8250, dev->id);
+
+               if (c < 0)
+                       break;
+               dev->rxbuf[dev->rxcnt++] = c;
+               dev->lsr |= UART_LSR_DR;
+       }
+}
+
+void serial8250__update_consoles(struct kvm *kvm)
+{
+       unsigned int i;
+
+       for (i = 0; i < ARRAY_SIZE(devices); i++) {
+               struct serial8250_device *dev = &devices[i];
+
+               mutex_lock(&dev->mutex);
+
+               /* Restrict sysrq injection to the first port */
+               serial8250__receive(kvm, dev, i == 0);
+
+               serial8250_update_irq(kvm, dev);
+
+               mutex_unlock(&dev->mutex);
+       }
+}
+
+void serial8250__inject_sysrq(struct kvm *kvm, char sysrq)
+{
+       sysrq_pending = sysrq;
+}
+
+static struct serial8250_device *find_device(u16 port)
+{
+       unsigned int i;
+
+       for (i = 0; i < ARRAY_SIZE(devices); i++) {
+               struct serial8250_device *dev = &devices[i];
+
+               if (dev->iobase == (port & ~0x7))
+                       return dev;
+       }
+       return NULL;
+}
+
+static bool serial8250_out(struct ioport *ioport, struct kvm *kvm, u16 port,
+                          void *data, int size)
+{
+       struct serial8250_device *dev;
+       u16 offset;
+       bool ret = true;
+       char *addr = data;
+
+       dev = find_device(port);
+       if (!dev)
+               return false;
+
+       mutex_lock(&dev->mutex);
+
+       offset = port - dev->iobase;
+
+       switch (offset) {
+       case UART_TX:
+               if (dev->lcr & UART_LCR_DLAB) {
+                       dev->dll = ioport__read8(data);
+                       break;
+               }
+
+               /* Loopback mode */
+               if (dev->mcr & UART_MCR_LOOP) {
+                       if (dev->rxcnt < FIFO_LEN) {
+                               dev->rxbuf[dev->rxcnt++] = *addr;
+                               dev->lsr |= UART_LSR_DR;
+                       }
+                       break;
+               }
+
+               if (dev->txcnt < FIFO_LEN) {
+                       dev->txbuf[dev->txcnt++] = *addr;
+                       dev->lsr &= ~UART_LSR_TEMT;
+                       if (dev->txcnt == FIFO_LEN / 2)
+                               dev->lsr &= ~UART_LSR_THRE;
+               } else {
+                       /* Should never happen */
+                       dev->lsr &= ~(UART_LSR_TEMT | UART_LSR_THRE);
+               }
+               break;
+       case UART_IER:
+               if (!(dev->lcr & UART_LCR_DLAB))
+                       dev->ier = ioport__read8(data) & 0x0f;
+               else
+                       dev->dlm = ioport__read8(data);
+               break;
+       case UART_FCR:
+               dev->fcr = ioport__read8(data);
+               break;
+       case UART_LCR:
+               dev->lcr = ioport__read8(data);
+               break;
+       case UART_MCR:
+               dev->mcr = ioport__read8(data);
+               break;
+       case UART_LSR:
+               /* Factory test */
+               break;
+       case UART_MSR:
+               /* Not used */
+               break;
+       case UART_SCR:
+               dev->scr = ioport__read8(data);
+               break;
+       default:
+               ret = false;
+               break;
+       }
+
+       serial8250_update_irq(kvm, dev);
+
+       mutex_unlock(&dev->mutex);
+
+       return ret;
+}
+
+static void serial8250_rx(struct serial8250_device *dev, void *data)
+{
+       if (dev->rxdone == dev->rxcnt)
+               return;
+
+       /* Break issued ? */
+       if (dev->lsr & UART_LSR_BI) {
+               dev->lsr &= ~UART_LSR_BI;
+               ioport__write8(data, 0);
+               return;
+       }
+
+       ioport__write8(data, dev->rxbuf[dev->rxdone++]);
+       if (dev->rxcnt == dev->rxdone) {
+               dev->lsr &= ~UART_LSR_DR;
+               dev->rxcnt = dev->rxdone = 0;
+       }
+}
+
+static bool serial8250_in(struct ioport *ioport, struct kvm *kvm, u16 port, void *data, int size)
+{
+       struct serial8250_device *dev;
+       u16 offset;
+       bool ret = true;
+
+       dev = find_device(port);
+       if (!dev)
+               return false;
+
+       mutex_lock(&dev->mutex);
+
+       offset = port - dev->iobase;
+
+       switch (offset) {
+       case UART_RX:
+               if (dev->lcr & UART_LCR_DLAB)
+                       ioport__write8(data, dev->dll);
+               else
+                       serial8250_rx(dev, data);
+               break;
+       case UART_IER:
+               if (dev->lcr & UART_LCR_DLAB)
+                       ioport__write8(data, dev->dlm);
+               else
+                       ioport__write8(data, dev->ier);
+               break;
+       case UART_IIR:
+               ioport__write8(data, dev->iir | UART_IIR_TYPE_BITS);
+               break;
+       case UART_LCR:
+               ioport__write8(data, dev->lcr);
+               break;
+       case UART_MCR:
+               ioport__write8(data, dev->mcr);
+               break;
+       case UART_LSR:
+               ioport__write8(data, dev->lsr);
+               break;
+       case UART_MSR:
+               ioport__write8(data, dev->msr);
+               break;
+       case UART_SCR:
+               ioport__write8(data, dev->scr);
+               break;
+       default:
+               ret = false;
+               break;
+       }
+
+       serial8250_update_irq(kvm, dev);
+
+       mutex_unlock(&dev->mutex);
+
+       return ret;
+}
+
+static struct ioport_operations serial8250_ops = {
+       .io_in          = serial8250_in,
+       .io_out         = serial8250_out,
+};
+
+static int serial8250__device_init(struct kvm *kvm, struct serial8250_device *dev)
+{
+       int r;
+
+       r = ioport__register(dev->iobase, &serial8250_ops, 8, NULL);
+       kvm__irq_line(kvm, dev->irq, 0);
+
+       return r;
+}
+
+int serial8250__init(struct kvm *kvm)
+{
+       unsigned int i, j;
+       int r = 0;
+
+       for (i = 0; i < ARRAY_SIZE(devices); i++) {
+               struct serial8250_device *dev = &devices[i];
+
+               r = serial8250__device_init(kvm, dev);
+               if (r < 0)
+                       goto cleanup;
+       }
+
+       return r;
+cleanup:
+       /* unregister only the devices that registered successfully */
+       for (j = 0; j < i; j++) {
+               struct serial8250_device *dev = &devices[j];
+
+               ioport__unregister(dev->iobase);
+       }
+
+       return r;
+}
+
+int serial8250__exit(struct kvm *kvm)
+{
+       unsigned int i;
+       int r;
+
+       for (i = 0; i < ARRAY_SIZE(devices); i++) {
+               struct serial8250_device *dev = &devices[i];
+
+               r = ioport__unregister(dev->iobase);
+               if (r < 0)
+                       return r;
+       }
+
+       return 0;
+}
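
As the comment in serial8250__receive() notes, this device model relies on being polled roughly once per millisecond; a hedged sketch of the assumed call sites (the tick source itself is not part of this file):

    /* periodic tick: flush tx fifos to the terminal, refill rx fifos */
    static void periodic_tick(struct kvm *kvm)
    {
            serial8250__update_consoles(kvm);
    }

    /* from an input handler: queue a sysrq for delivery on ttyS0 */
    static void on_sysrq_key(struct kvm *kvm)
    {
            serial8250__inject_sysrq(kvm, 'b');
    }
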
diff --git a/tools/kvm/hw/vesa.c b/tools/kvm/hw/vesa.c
new file mode 100644 (file)
index 0000000..757f0a2
--- /dev/null
@@ -0,0 +1,85 @@
+#include "kvm/vesa.h"
+
+#include "kvm/virtio-pci-dev.h"
+#include "kvm/framebuffer.h"
+#include "kvm/kvm-cpu.h"
+#include "kvm/ioport.h"
+#include "kvm/util.h"
+#include "kvm/irq.h"
+#include "kvm/kvm.h"
+#include "kvm/pci.h"
+
+#include <linux/byteorder.h>
+#include <sys/mman.h>
+#include <linux/err.h>
+#include <sys/types.h>
+#include <sys/ioctl.h>
+#include <inttypes.h>
+#include <unistd.h>
+
+static bool vesa_pci_io_in(struct ioport *ioport, struct kvm *kvm, u16 port, void *data, int size)
+{
+       return true;
+}
+
+static bool vesa_pci_io_out(struct ioport *ioport, struct kvm *kvm, u16 port, void *data, int size)
+{
+       return true;
+}
+
+static struct ioport_operations vesa_io_ops = {
+       .io_in                  = vesa_pci_io_in,
+       .io_out                 = vesa_pci_io_out,
+};
+
+static struct pci_device_header vesa_pci_device = {
+       .vendor_id              = cpu_to_le16(PCI_VENDOR_ID_REDHAT_QUMRANET),
+       .device_id              = cpu_to_le16(PCI_DEVICE_ID_VESA),
+       .header_type            = PCI_HEADER_TYPE_NORMAL,
+       .revision_id            = 0,
+       .class[2]               = 0x03,
+       .subsys_vendor_id       = cpu_to_le16(PCI_SUBSYSTEM_VENDOR_ID_REDHAT_QUMRANET),
+       .subsys_id              = cpu_to_le16(PCI_SUBSYSTEM_ID_VESA),
+       .bar[1]                 = cpu_to_le32(VESA_MEM_ADDR | PCI_BASE_ADDRESS_SPACE_MEMORY),
+       .bar_size[1]            = VESA_MEM_SIZE,
+};
+
+static struct framebuffer vesafb;
+
+struct framebuffer *vesa__init(struct kvm *kvm)
+{
+       u16 vesa_base_addr;
+       u8 dev, line, pin;
+       char *mem;
+       int r;
+
+       r = irq__register_device(PCI_DEVICE_ID_VESA, &dev, &pin, &line);
+       if (r < 0)
+               return ERR_PTR(r);
+
+       r = ioport__register(IOPORT_EMPTY, &vesa_io_ops, IOPORT_SIZE, NULL);
+       if (r < 0)
+               return ERR_PTR(r);
+
+       vesa_pci_device.irq_pin         = pin;
+       vesa_pci_device.irq_line        = line;
+       vesa_base_addr                  = (u16)r;
+       vesa_pci_device.bar[0]          = cpu_to_le32(vesa_base_addr | PCI_BASE_ADDRESS_SPACE_IO);
+       pci__register(&vesa_pci_device, dev);
+
+       mem = mmap(NULL, VESA_MEM_SIZE, PROT_RW, MAP_ANON_NORESERVE, -1, 0);
+       if (mem == MAP_FAILED)
+               return ERR_PTR(-errno);
+
+       kvm__register_mem(kvm, VESA_MEM_ADDR, VESA_MEM_SIZE, mem);
+
+       vesafb = (struct framebuffer) {
+               .width                  = VESA_WIDTH,
+               .height                 = VESA_HEIGHT,
+               .depth                  = VESA_BPP,
+               .mem                    = mem,
+               .mem_addr               = VESA_MEM_ADDR,
+               .mem_size               = VESA_MEM_SIZE,
+       };
+       return fb__register(&vesafb);
+}
diff --git a/tools/kvm/include/asm/hweight.h b/tools/kvm/include/asm/hweight.h
new file mode 100644 (file)
index 0000000..1a43977
--- /dev/null
@@ -0,0 +1,8 @@
+#ifndef _KVM_ASM_HWEIGHT_H_
+#define _KVM_ASM_HWEIGHT_H_
+
+#include <linux/types.h>
+
+unsigned int hweight32(unsigned int w);
+unsigned long hweight64(__u64 w);
+
+#endif /* _KVM_ASM_HWEIGHT_H_ */
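
These declarations are satisfied elsewhere in the tree; for reference, a sketch of the standard parallel bit count that hweight32() conventionally reduces to (the tool's actual implementation may differ):

    /* e.g. hweight32(0x000000f1) == 5: sums bits in ever wider fields */
    unsigned int hweight32(unsigned int w)
    {
            w = w - ((w >> 1) & 0x55555555);
            w = (w & 0x33333333) + ((w >> 2) & 0x33333333);
            w = (w + (w >> 4)) & 0x0f0f0f0f;
            return (w * 0x01010101) >> 24;
    }
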
diff --git a/tools/kvm/include/bios/memcpy.h b/tools/kvm/include/bios/memcpy.h
new file mode 100644 (file)
index 0000000..e021044
--- /dev/null
@@ -0,0 +1,9 @@
+#ifndef KVM_BIOS_MEMCPY_H
+#define KVM_BIOS_MEMCPY_H
+
+#include <linux/types.h>
+#include <stddef.h>
+
+void memcpy16(u16 dst_seg, void *dst, u16 src_seg, const void *src, size_t len);
+
+#endif /* KVM_BIOS_MEMCPY_H */
diff --git a/tools/kvm/include/kvm/8250-serial.h b/tools/kvm/include/kvm/8250-serial.h
new file mode 100644 (file)
index 0000000..e954551
--- /dev/null
@@ -0,0 +1,11 @@
+#ifndef KVM__8250_SERIAL_H
+#define KVM__8250_SERIAL_H
+
+struct kvm;
+
+int serial8250__init(struct kvm *kvm);
+int serial8250__exit(struct kvm *kvm);
+void serial8250__update_consoles(struct kvm *kvm);
+void serial8250__inject_sysrq(struct kvm *kvm, char sysrq);
+
+#endif /* KVM__8250_SERIAL_H */
diff --git a/tools/kvm/include/kvm/apic.h b/tools/kvm/include/kvm/apic.h
new file mode 100644 (file)
index 0000000..2129997
--- /dev/null
@@ -0,0 +1,17 @@
+#ifndef KVM_APIC_H_
+#define KVM_APIC_H_
+
+#include <asm/apicdef.h>
+
+/*
+ * APIC, IOAPIC stuff
+ */
+#define APIC_BASE_ADDR_STEP    0x00400000
+#define IOAPIC_BASE_ADDR_STEP  0x00100000
+
+#define APIC_ADDR(apic)                (APIC_DEFAULT_PHYS_BASE + apic * APIC_BASE_ADDR_STEP)
+#define IOAPIC_ADDR(ioapic)    (IO_APIC_DEFAULT_PHYS_BASE + ioapic * IOAPIC_BASE_ADDR_STEP)
+
+#define KVM_APIC_VERSION       0x14 /* xAPIC */
+
+#endif /* KVM_APIC_H_ */
diff --git a/tools/kvm/include/kvm/brlock.h b/tools/kvm/include/kvm/brlock.h
new file mode 100644 (file)
index 0000000..bd1d882
--- /dev/null
@@ -0,0 +1,41 @@
+#ifndef KVM__BRLOCK_H
+#define KVM__BRLOCK_H
+
+#include "kvm/kvm.h"
+#include "kvm/barrier.h"
+
+/*
+ * brlock is a lock which is very cheap for reads, but very expensive
+ * for writes.
+ * This lock will be used when updates are very rare and reads are common.
+ * This lock is currently implemented by stopping the guest while
+ * performing the updates. We assume that the only threads which read from
+ * the locked data are VCPU threads, and that the only writer is not a VCPU thread.
+ */
+
+#ifndef barrier
+#define barrier()              __asm__ __volatile__("": : :"memory")
+#endif
+
+#ifdef KVM_BRLOCK_DEBUG
+
+#include "kvm/rwsem.h"
+
+DECLARE_RWSEM(brlock_sem);
+
+#define br_read_lock()         down_read(&brlock_sem)
+#define br_read_unlock()       up_read(&brlock_sem)
+
+#define br_write_lock()                down_write(&brlock_sem)
+#define br_write_unlock()      up_write(&brlock_sem)
+
+#else
+
+#define br_read_lock()         barrier()
+#define br_read_unlock()       barrier()
+
+#define br_write_lock()                kvm__pause()
+#define br_write_unlock()      kvm__continue()
+#endif
+
+#endif
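
Since the lock has no lock object, callers just bracket their accesses; a sketch under the header's assumptions, with shared_config standing in for some hypothetical guest-visible state:

    struct cfg { u64 mmio_base; };
    static struct cfg shared_config;        /* hypothetical shared state */

    /* read side, called from a VCPU thread: effectively free */
    u64 get_mmio_base(void)
    {
            u64 base;

            br_read_lock();
            base = shared_config.mmio_base;
            br_read_unlock();

            return base;
    }

    /* write side, from a non-VCPU thread: stops the guest while updating */
    void set_mmio_base(u64 base)
    {
            br_write_lock();
            shared_config.mmio_base = base;
            br_write_unlock();
    }
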
diff --git a/tools/kvm/include/kvm/builtin-balloon.h b/tools/kvm/include/kvm/builtin-balloon.h
new file mode 100644 (file)
index 0000000..77ee656
--- /dev/null
@@ -0,0 +1,9 @@
+#ifndef KVM__BALLOON_H
+#define KVM__BALLOON_H
+
+#include <kvm/util.h>
+
+int kvm_cmd_balloon(int argc, const char **argv, const char *prefix);
+void kvm_balloon_help(void) NORETURN;
+
+#endif
diff --git a/tools/kvm/include/kvm/builtin-debug.h b/tools/kvm/include/kvm/builtin-debug.h
new file mode 100644 (file)
index 0000000..efa0268
--- /dev/null
@@ -0,0 +1,20 @@
+#ifndef KVM__DEBUG_H
+#define KVM__DEBUG_H
+
+#include <kvm/util.h>
+#include <linux/types.h>
+
+#define KVM_DEBUG_CMD_TYPE_DUMP        (1 << 0)
+#define KVM_DEBUG_CMD_TYPE_NMI (1 << 1)
+#define KVM_DEBUG_CMD_TYPE_SYSRQ (1 << 2)
+
+struct debug_cmd_params {
+       u32 dbg_type;
+       u32 cpu;
+       char sysrq;
+};
+
+int kvm_cmd_debug(int argc, const char **argv, const char *prefix);
+void kvm_debug_help(void) NORETURN;
+
+#endif
diff --git a/tools/kvm/include/kvm/builtin-help.h b/tools/kvm/include/kvm/builtin-help.h
new file mode 100644 (file)
index 0000000..2946743
--- /dev/null
@@ -0,0 +1,6 @@
+#ifndef __KVM_HELP_H__
+#define __KVM_HELP_H__
+
+int kvm_cmd_help(int argc, const char **argv, const char *prefix);
+
+#endif
diff --git a/tools/kvm/include/kvm/builtin-list.h b/tools/kvm/include/kvm/builtin-list.h
new file mode 100644 (file)
index 0000000..47029ca
--- /dev/null
@@ -0,0 +1,10 @@
+#ifndef KVM__LIST_H
+#define KVM__LIST_H
+
+#include <kvm/util.h>
+
+int kvm_cmd_list(int argc, const char **argv, const char *prefix);
+void kvm_list_help(void) NORETURN;
+int get_vmstate(int sock);
+
+#endif
diff --git a/tools/kvm/include/kvm/builtin-pause.h b/tools/kvm/include/kvm/builtin-pause.h
new file mode 100644 (file)
index 0000000..84aaee3
--- /dev/null
@@ -0,0 +1,9 @@
+#ifndef KVM__PAUSE_H
+#define KVM__PAUSE_H
+
+#include <kvm/util.h>
+
+int kvm_cmd_pause(int argc, const char **argv, const char *prefix);
+void kvm_pause_help(void) NORETURN;
+
+#endif
diff --git a/tools/kvm/include/kvm/builtin-resume.h b/tools/kvm/include/kvm/builtin-resume.h
new file mode 100644 (file)
index 0000000..7de999b
--- /dev/null
@@ -0,0 +1,9 @@
+#ifndef KVM__RESUME_H
+#define KVM__RESUME_H
+
+#include <kvm/util.h>
+
+int kvm_cmd_resume(int argc, const char **argv, const char *prefix);
+void kvm_resume_help(void) NORETURN;
+
+#endif
diff --git a/tools/kvm/include/kvm/builtin-run.h b/tools/kvm/include/kvm/builtin-run.h
new file mode 100644 (file)
index 0000000..91521a5
--- /dev/null
@@ -0,0 +1,11 @@
+#ifndef __KVM_RUN_H__
+#define __KVM_RUN_H__
+
+#include <kvm/util.h>
+
+int kvm_cmd_run(int argc, const char **argv, const char *prefix);
+void kvm_run_help(void) NORETURN;
+
+void kvm_run_set_wrapper_sandbox(void);
+
+#endif
diff --git a/tools/kvm/include/kvm/builtin-sandbox.h b/tools/kvm/include/kvm/builtin-sandbox.h
new file mode 100644 (file)
index 0000000..98cd6be
--- /dev/null
@@ -0,0 +1,6 @@
+#ifndef KVM__SANDBOX_H
+#define KVM__SANDBOX_H
+
+int kvm_cmd_sandbox(int argc, const char **argv, const char *prefix);
+
+#endif
diff --git a/tools/kvm/include/kvm/builtin-setup.h b/tools/kvm/include/kvm/builtin-setup.h
new file mode 100644 (file)
index 0000000..4a8d7ee
--- /dev/null
@@ -0,0 +1,11 @@
+#ifndef KVM__SETUP_H
+#define KVM__SETUP_H
+
+#include <kvm/util.h>
+
+int kvm_cmd_setup(int argc, const char **argv, const char *prefix);
+void kvm_setup_help(void) NORETURN;
+int kvm_setup_create_new(const char *guestfs_name);
+void kvm_setup_resolv(const char *guestfs_name);
+
+#endif
diff --git a/tools/kvm/include/kvm/builtin-stat.h b/tools/kvm/include/kvm/builtin-stat.h
new file mode 100644 (file)
index 0000000..4fecb37
--- /dev/null
@@ -0,0 +1,9 @@
+#ifndef KVM__STAT_H
+#define KVM__STAT_H
+
+#include <kvm/util.h>
+
+int kvm_cmd_stat(int argc, const char **argv, const char *prefix);
+void kvm_stat_help(void) NORETURN;
+
+#endif
diff --git a/tools/kvm/include/kvm/builtin-stop.h b/tools/kvm/include/kvm/builtin-stop.h
new file mode 100644 (file)
index 0000000..b26b275
--- /dev/null
@@ -0,0 +1,9 @@
+#ifndef KVM__STOP_H
+#define KVM__STOP_H
+
+#include <kvm/util.h>
+
+int kvm_cmd_stop(int argc, const char **argv, const char *prefix);
+void kvm_stop_help(void) NORETURN;
+
+#endif
diff --git a/tools/kvm/include/kvm/builtin-version.h b/tools/kvm/include/kvm/builtin-version.h
new file mode 100644 (file)
index 0000000..83cac4d
--- /dev/null
@@ -0,0 +1,6 @@
+#ifndef KVM__VERSION_H
+#define KVM__VERSION_H
+
+int kvm_cmd_version(int argc, const char **argv, const char *prefix);
+
+#endif
diff --git a/tools/kvm/include/kvm/compiler.h b/tools/kvm/include/kvm/compiler.h
new file mode 100644 (file)
index 0000000..2013a83
--- /dev/null
@@ -0,0 +1,10 @@
+#ifndef KVM_COMPILER_H_
+#define KVM_COMPILER_H_
+
+#ifndef __compiletime_error
+# define __compiletime_error(message)
+#endif
+
+#define notrace __attribute__((no_instrument_function))
+
+#endif /* KVM_COMPILER_H_ */
diff --git a/tools/kvm/include/kvm/disk-image.h b/tools/kvm/include/kvm/disk-image.h
new file mode 100644 (file)
index 0000000..83fc725
--- /dev/null
@@ -0,0 +1,95 @@
+#ifndef KVM__DISK_IMAGE_H
+#define KVM__DISK_IMAGE_H
+
+#include "kvm/read-write.h"
+#include "kvm/util.h"
+
+#include <linux/types.h>
+#include <linux/fs.h>  /* for BLKGETSIZE64 */
+#include <sys/ioctl.h>
+#include <sys/types.h>
+#include <sys/mman.h>
+#include <sys/stat.h>
+#include <stdbool.h>
+#include <sys/uio.h>
+#include <stddef.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <fcntl.h>
+
+#define SECTOR_SHIFT           9
+#define SECTOR_SIZE            (1UL << SECTOR_SHIFT)
+
+enum {
+       DISK_IMAGE_REGULAR,
+       DISK_IMAGE_MMAP,
+};
+
+#define MAX_DISK_IMAGES         4
+
+struct disk_image;
+
+struct disk_image_operations {
+       ssize_t (*read)(struct disk_image *disk, u64 sector, const struct iovec *iov,
+                       int iovcount, void *param);
+       ssize_t (*write)(struct disk_image *disk, u64 sector, const struct iovec *iov,
+                       int iovcount, void *param);
+       int (*flush)(struct disk_image *disk);
+       int (*close)(struct disk_image *disk);
+};
+
+struct disk_image_params {
+       const char *filename;
+       /*
+        * wwpn == World Wide Port Number
+        * tpgt == Target Portal Group Tag
+        */
+       const char *wwpn;
+       const char *tpgt;
+       bool readonly;
+       bool direct;
+};
+
+struct disk_image {
+       int                             fd;
+       u64                             size;
+       struct disk_image_operations    *ops;
+       void                            *priv;
+       void                            *disk_req_cb_param;
+       void                            (*disk_req_cb)(void *param, long len);
+       bool                            async;
+       int                             evt;
+#ifdef CONFIG_HAS_AIO
+       io_context_t                    ctx;
+#endif
+       const char                      *wwpn;
+       const char                      *tpgt;
+};
+
+struct disk_image *disk_image__open(const char *filename, bool readonly, bool direct);
+struct disk_image **disk_image__open_all(struct disk_image_params *params, int count);
+struct disk_image *disk_image__new(int fd, u64 size, struct disk_image_operations *ops, int mmap);
+int disk_image__close(struct disk_image *disk);
+int disk_image__close_all(struct disk_image **disks, int count);
+int disk_image__flush(struct disk_image *disk);
+ssize_t disk_image__read(struct disk_image *disk, u64 sector, const struct iovec *iov,
+                               int iovcount, void *param);
+ssize_t disk_image__write(struct disk_image *disk, u64 sector, const struct iovec *iov,
+                               int iovcount, void *param);
+ssize_t disk_image__get_serial(struct disk_image *disk, void *buffer, ssize_t *len);
+
+struct disk_image *raw_image__probe(int fd, struct stat *st, bool readonly);
+struct disk_image *blkdev__probe(const char *filename, int flags, struct stat *st);
+
+ssize_t raw_image__read(struct disk_image *disk, u64 sector,
+                               const struct iovec *iov, int iovcount, void *param);
+ssize_t raw_image__write(struct disk_image *disk, u64 sector,
+                               const struct iovec *iov, int iovcount, void *param);
+ssize_t raw_image__read_mmap(struct disk_image *disk, u64 sector,
+                               const struct iovec *iov, int iovcount, void *param);
+ssize_t raw_image__write_mmap(struct disk_image *disk, u64 sector,
+                               const struct iovec *iov, int iovcount, void *param);
+int raw_image__close(struct disk_image *disk);
+void disk_image__set_callback(struct disk_image *disk, void (*disk_req_cb)(void *param, long len));
+#endif /* KVM__DISK_IMAGE_H */
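
struct disk_image_operations is the backend vtable; a hedged sketch of the smallest possible backend (zero_read() and the size are invented, and string.h is assumed for memset()):

    /* a read-only disk that returns zeroes for every sector */
    static ssize_t zero_read(struct disk_image *disk, u64 sector,
                             const struct iovec *iov, int iovcount, void *param)
    {
            ssize_t total = 0;
            int i;

            for (i = 0; i < iovcount; i++) {
                    memset(iov[i].iov_base, 0, iov[i].iov_len);
                    total += iov[i].iov_len;
            }

            return total;
    }

    static struct disk_image_operations zero_ops = {
            .read = zero_read,
    };

    /* disk = disk_image__new(-1, 1ULL << 30, &zero_ops, DISK_IMAGE_REGULAR); */
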
diff --git a/tools/kvm/include/kvm/e820.h b/tools/kvm/include/kvm/e820.h
new file mode 100644 (file)
index 0000000..15f62cc
--- /dev/null
@@ -0,0 +1,13 @@
+#ifndef KVM_E820_H
+#define KVM_E820_H
+
+#include <linux/types.h>
+#include <kvm/bios.h>
+
+#define SMAP    0x534d4150      /* ASCII "SMAP" */
+
+struct biosregs;
+
+extern bioscall void e820_query_map(struct biosregs *regs);
+
+#endif /* KVM_E820_H */
diff --git a/tools/kvm/include/kvm/framebuffer.h b/tools/kvm/include/kvm/framebuffer.h
new file mode 100644 (file)
index 0000000..dc5022c
--- /dev/null
@@ -0,0 +1,35 @@
+#ifndef KVM__FRAMEBUFFER_H
+#define KVM__FRAMEBUFFER_H
+
+#include <linux/types.h>
+#include <linux/list.h>
+
+struct framebuffer;
+
+struct fb_target_operations {
+       int (*start)(struct framebuffer *fb);
+       int (*stop)(struct framebuffer *fb);
+};
+
+#define FB_MAX_TARGETS                 2
+
+struct framebuffer {
+       struct list_head                node;
+
+       u32                             width;
+       u32                             height;
+       u8                              depth;
+       char                            *mem;
+       u64                             mem_addr;
+       u64                             mem_size;
+
+       unsigned long                   nr_targets;
+       struct fb_target_operations     *targets[FB_MAX_TARGETS];
+};
+
+struct framebuffer *fb__register(struct framebuffer *fb);
+int fb__attach(struct framebuffer *fb, struct fb_target_operations *ops);
+int fb__start(void);
+void fb__stop(void);
+
+#endif /* KVM__FRAMEBUFFER_H */
diff --git a/tools/kvm/include/kvm/guest_compat.h b/tools/kvm/include/kvm/guest_compat.h
new file mode 100644 (file)
index 0000000..ae7abbd
--- /dev/null
@@ -0,0 +1,9 @@
+#ifndef KVM__GUEST_COMPAT_H
+#define KVM__GUEST_COMPAT_H
+
+int compat__print_all_messages(void);
+int compat__remove_message(int id);
+int compat__add_message(const char *title, const char *description);
+
+#endif
\ No newline at end of file
diff --git a/tools/kvm/include/kvm/i8042.h b/tools/kvm/include/kvm/i8042.h
new file mode 100644 (file)
index 0000000..13f18e2
--- /dev/null
@@ -0,0 +1,12 @@
+#ifndef KVM__PCKBD_H
+#define KVM__PCKBD_H
+
+#include <linux/types.h>
+
+struct kvm;
+
+void mouse_queue(u8 c);
+void kbd_queue(u8 c);
+void kbd__init(struct kvm *kvm);
+
+#endif
diff --git a/tools/kvm/include/kvm/ioeventfd.h b/tools/kvm/include/kvm/ioeventfd.h
new file mode 100644 (file)
index 0000000..d71fa40
--- /dev/null
@@ -0,0 +1,28 @@
+#ifndef KVM__IOEVENTFD_H
+#define KVM__IOEVENTFD_H
+
+#include <linux/types.h>
+#include <linux/list.h>
+#include <sys/eventfd.h>
+#include "kvm/util.h"
+
+struct kvm;
+
+struct ioevent {
+       u64                     io_addr;
+       u8                      io_len;
+       void                    (*fn)(struct kvm *kvm, void *ptr);
+       struct kvm              *fn_kvm;
+       void                    *fn_ptr;
+       int                     fd;
+       u64                     datamatch;
+
+       struct list_head        list;
+};
+
+int ioeventfd__init(struct kvm *kvm);
+int ioeventfd__exit(struct kvm *kvm);
+int ioeventfd__add_event(struct ioevent *ioevent, bool is_pio, bool poll_in_userspace);
+int ioeventfd__del_event(u64 addr, u64 datamatch);
+
+#endif
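
A sketch of registering one event, assuming (as virtio-style callers do) that the caller creates the eventfd itself; notify() and port 0x6200 are illustrative only:

    static void notify(struct kvm *kvm, void *ptr)
    {
            /* the guest wrote the matched value; kick the device thread */
    }

    static int watch_kick_register(struct kvm *kvm)
    {
            struct ioevent ev = {
                    .io_addr        = 0x6200,
                    .io_len         = sizeof(u16),
                    .fn             = notify,
                    .fn_kvm         = kvm,
                    .datamatch      = 1,
                    .fd             = eventfd(0, 0),
            };

            return ioeventfd__add_event(&ev, true, false);
    }
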
diff --git a/tools/kvm/include/kvm/ioport.h b/tools/kvm/include/kvm/ioport.h
new file mode 100644 (file)
index 0000000..ced8cf5
--- /dev/null
@@ -0,0 +1,69 @@
+#ifndef KVM__IOPORT_H
+#define KVM__IOPORT_H
+
+#include "kvm/rbtree-interval.h"
+
+#include <stdbool.h>
+#include <limits.h>
+#include <asm/types.h>
+#include <linux/types.h>
+#include <linux/byteorder.h>
+
+/* some ports we reserve for our own use */
+#define IOPORT_DBG                     0xe0
+#define IOPORT_START                   0x6200
+#define IOPORT_SIZE                    0x400
+
+#define IOPORT_EMPTY                   USHRT_MAX
+
+struct kvm;
+
+struct ioport {
+       struct rb_int_node              node;
+       struct ioport_operations        *ops;
+       void                            *priv;
+};
+
+struct ioport_operations {
+       bool (*io_in)(struct ioport *ioport, struct kvm *kvm, u16 port, void *data, int size);
+       bool (*io_out)(struct ioport *ioport, struct kvm *kvm, u16 port, void *data, int size);
+};
+
+void ioport__setup_arch(void);
+
+int ioport__register(u16 port, struct ioport_operations *ops, int count, void *param);
+int ioport__unregister(u16 port);
+int ioport__init(struct kvm *kvm);
+int ioport__exit(struct kvm *kvm);
+
+static inline u8 ioport__read8(u8 *data)
+{
+       return *data;
+}
+
+/* On BE platforms, PCI I/O is byteswapped, i.e. LE, so swap back. */
+static inline u16 ioport__read16(u16 *data)
+{
+       return le16_to_cpu(*data);
+}
+
+static inline u32 ioport__read32(u32 *data)
+{
+       return le32_to_cpu(*data);
+}
+
+static inline void ioport__write8(u8 *data, u8 value)
+{
+       *data            = value;
+}
+
+static inline void ioport__write16(u16 *data, u16 value)
+{
+       *data            = cpu_to_le16(value);
+}
+
+static inline void ioport__write32(u32 *data, u32 value)
+{
+       *data            = cpu_to_le32(value);
+}
+
+#endif /* KVM__IOPORT_H */
diff --git a/tools/kvm/include/kvm/irq.h b/tools/kvm/include/kvm/irq.h
new file mode 100644 (file)
index 0000000..43fecaf
--- /dev/null
@@ -0,0 +1,33 @@
+#ifndef KVM__IRQ_H
+#define KVM__IRQ_H
+
+#include <linux/types.h>
+#include <linux/rbtree.h>
+#include <linux/list.h>
+#include <linux/kvm.h>
+
+#include "kvm/msi.h"
+
+struct kvm;
+
+struct irq_line {
+       u8                      line;
+       struct list_head        node;
+};
+
+struct pci_dev {
+       struct rb_node          node;
+       u32                     id;
+       u8                      pin;
+       struct list_head        lines;
+};
+
+int irq__register_device(u32 dev, u8 *num, u8 *pin, u8 *line);
+
+struct rb_node *irq__get_pci_tree(void);
+
+int irq__init(struct kvm *kvm);
+int irq__exit(struct kvm *kvm);
+int irq__add_msix_route(struct kvm *kvm, struct msi_msg *msg);
+
+#endif
diff --git a/tools/kvm/include/kvm/kvm-cmd.h b/tools/kvm/include/kvm/kvm-cmd.h
new file mode 100644 (file)
index 0000000..0a73bce
--- /dev/null
@@ -0,0 +1,17 @@
+#ifndef __KVM_CMD_H__
+#define __KVM_CMD_H__
+
+struct cmd_struct {
+       const char *cmd;
+       int (*fn)(int, const char **, const char *);
+       void (*help)(void);
+       int option;
+};
+
+extern struct cmd_struct kvm_commands[];
+struct cmd_struct *kvm_get_command(struct cmd_struct *command,
+                const char *cmd);
+
+int handle_command(struct cmd_struct *command, int argc, const char **argv);
+
+#endif
diff --git a/tools/kvm/include/kvm/kvm-cpu.h b/tools/kvm/include/kvm/kvm-cpu.h
new file mode 100644 (file)
index 0000000..d4448f6
--- /dev/null
@@ -0,0 +1,24 @@
+#ifndef KVM__KVM_CPU_H
+#define KVM__KVM_CPU_H
+
+#include "kvm/kvm-cpu-arch.h"
+#include <stdbool.h>
+
+struct kvm_cpu *kvm_cpu__init(struct kvm *kvm, unsigned long cpu_id);
+void kvm_cpu__delete(struct kvm_cpu *vcpu);
+void kvm_cpu__reset_vcpu(struct kvm_cpu *vcpu);
+void kvm_cpu__setup_cpuid(struct kvm_cpu *vcpu);
+void kvm_cpu__enable_singlestep(struct kvm_cpu *vcpu);
+void kvm_cpu__run(struct kvm_cpu *vcpu);
+void kvm_cpu__reboot(void);
+int kvm_cpu__start(struct kvm_cpu *cpu);
+bool kvm_cpu__handle_exit(struct kvm_cpu *vcpu);
+
+int kvm_cpu__get_debug_fd(void);
+void kvm_cpu__set_debug_fd(int fd);
+void kvm_cpu__show_code(struct kvm_cpu *vcpu);
+void kvm_cpu__show_registers(struct kvm_cpu *vcpu);
+void kvm_cpu__show_page_tables(struct kvm_cpu *vcpu);
+void kvm_cpu__arch_nmi(struct kvm_cpu *cpu);
+
+#endif /* KVM__KVM_CPU_H */
diff --git a/tools/kvm/include/kvm/kvm-ipc.h b/tools/kvm/include/kvm/kvm-ipc.h
new file mode 100644 (file)
index 0000000..aefffa4
--- /dev/null
@@ -0,0 +1,24 @@
+#ifndef KVM__IPC_H_
+#define KVM__IPC_H_
+
+#include <linux/types.h>
+
+enum {
+       KVM_IPC_BALLOON = 1,
+       KVM_IPC_DEBUG   = 2,
+       KVM_IPC_STAT    = 3,
+       KVM_IPC_PAUSE   = 4,
+       KVM_IPC_RESUME  = 5,
+       KVM_IPC_STOP    = 6,
+       KVM_IPC_PID     = 7,
+       KVM_IPC_VMSTATE = 8,
+};
+
+int kvm_ipc__register_handler(u32 type, void (*cb)(int fd, u32 type, u32 len, u8 *msg));
+int kvm_ipc__start(int sock);
+int kvm_ipc__stop(void);
+
+int kvm_ipc__send(int fd, u32 type);
+int kvm_ipc__send_msg(int fd, u32 type, u32 len, u8 *msg);
+
+#endif
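
Tying the IPC surface together, an assumed client-side flow for pausing a named instance; pause_instance() is hypothetical, while its two callees are declared in this patch (kvm__get_sock_by_instance() in kvm.h below):

    static int pause_instance(const char *name)
    {
            int sock = kvm__get_sock_by_instance(name);

            if (sock < 0)
                    return sock;

            return kvm_ipc__send(sock, KVM_IPC_PAUSE);
    }
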
diff --git a/tools/kvm/include/kvm/kvm.h b/tools/kvm/include/kvm/kvm.h
new file mode 100644 (file)
index 0000000..22a1b0e
--- /dev/null
@@ -0,0 +1,93 @@
+#ifndef KVM__KVM_H
+#define KVM__KVM_H
+
+#include "kvm/kvm-arch.h"
+
+#include <stdbool.h>
+#include <linux/types.h>
+#include <time.h>
+#include <signal.h>
+
+#define SIGKVMEXIT             (SIGRTMIN + 0)
+#define SIGKVMPAUSE            (SIGRTMIN + 1)
+
+#define KVM_PID_FILE_PATH      "/.lkvm/"
+#define HOME_DIR               getenv("HOME")
+#define KVM_BINARY_NAME                "lkvm"
+
+#define PAGE_SIZE (sysconf(_SC_PAGE_SIZE))
+
+#define DEFINE_KVM_EXT(ext)            \
+       .name = #ext,                   \
+       .code = ext
+
+enum {
+       KVM_VMSTATE_RUNNING,
+       KVM_VMSTATE_PAUSED,
+};
+
+struct kvm_ext {
+       const char *name;
+       int code;
+};
+
+void kvm__set_dir(const char *fmt, ...);
+const char *kvm__get_dir(void);
+
+struct kvm *kvm__init(const char *kvm_dev, const char *hugetlbfs_path, u64 ram_size, const char *name);
+int kvm__recommended_cpus(struct kvm *kvm);
+int kvm__max_cpus(struct kvm *kvm);
+void kvm__init_ram(struct kvm *kvm);
+int kvm__exit(struct kvm *kvm);
+bool kvm__load_firmware(struct kvm *kvm, const char *firmware_filename);
+bool kvm__load_kernel(struct kvm *kvm, const char *kernel_filename,
+                       const char *initrd_filename, const char *kernel_cmdline, u16 vidmode);
+void kvm__start_timer(struct kvm *kvm);
+void kvm__stop_timer(struct kvm *kvm);
+void kvm__irq_line(struct kvm *kvm, int irq, int level);
+void kvm__irq_trigger(struct kvm *kvm, int irq);
+bool kvm__emulate_io(struct kvm *kvm, u16 port, void *data, int direction, int size, u32 count);
+bool kvm__emulate_mmio(struct kvm *kvm, u64 phys_addr, u8 *data, u32 len, u8 is_write);
+int kvm__register_mem(struct kvm *kvm, u64 guest_phys, u64 size, void *userspace_addr);
+int kvm__register_mmio(struct kvm *kvm, u64 phys_addr, u64 phys_addr_len, bool coalesce,
+                       void (*mmio_fn)(u64 addr, u8 *data, u32 len, u8 is_write, void *ptr),
+                       void *ptr);
+bool kvm__deregister_mmio(struct kvm *kvm, u64 phys_addr);
+void kvm__pause(void);
+void kvm__continue(void);
+void kvm__notify_paused(void);
+int kvm__get_sock_by_instance(const char *name);
+int kvm__enumerate_instances(int (*callback)(const char *name, int pid));
+void kvm__remove_socket(const char *name);
+
+void kvm__arch_set_cmdline(char *cmdline, bool video);
+void kvm__arch_init(struct kvm *kvm, const char *hugetlbfs_path, u64 ram_size);
+void kvm__arch_delete_ram(struct kvm *kvm);
+int kvm__arch_setup_firmware(struct kvm *kvm);
+int kvm__arch_free_firmware(struct kvm *kvm);
+bool kvm__arch_cpu_supports_vm(void);
+void kvm__arch_periodic_poll(struct kvm *kvm);
+
+int load_flat_binary(struct kvm *kvm, int fd_kernel, int fd_initrd, const char *kernel_cmdline);
+bool load_bzimage(struct kvm *kvm, int fd_kernel, int fd_initrd, const char *kernel_cmdline, u16 vidmode);
+
+/*
+ * Debugging
+ */
+void kvm__dump_mem(struct kvm *kvm, unsigned long addr, unsigned long size);
+
+extern const char *kvm_exit_reasons[];
+
+static inline bool host_ptr_in_ram(struct kvm *kvm, void *p)
+{
+       return kvm->ram_start <= p && p < (kvm->ram_start + kvm->ram_size);
+}
+
+static inline void *guest_flat_to_host(struct kvm *kvm, unsigned long offset)
+{
+       return kvm->ram_start + offset;
+}
+
+bool kvm__supports_extension(struct kvm *kvm, unsigned int extension);
+
+#endif /* KVM__KVM_H */
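
guest_flat_to_host() and host_ptr_in_ram() pair naturally when a guest-physical address has to be validated before the host touches it; a small assumed-usage sketch:

    /* translate a guest-physical address, refusing anything outside RAM */
    static void *guest_ptr_checked(struct kvm *kvm, unsigned long gpa)
    {
            void *p = guest_flat_to_host(kvm, gpa);

            return host_ptr_in_ram(kvm, p) ? p : NULL;
    }
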
diff --git a/tools/kvm/include/kvm/msi.h b/tools/kvm/include/kvm/msi.h
new file mode 100644 (file)
index 0000000..885eb5b
--- /dev/null
@@ -0,0 +1,10 @@
+#ifndef LKVM_MSI_H
+#define LKVM_MSI_H
+
+#include <linux/types.h>
+
+struct msi_msg {
+       u32     address_lo;     /* low 32 bits of msi message address */
+       u32     address_hi;     /* high 32 bits of msi message address */
+       u32     data;           /* 16 bits of msi message data */
+};
+
+#endif /* LKVM_MSI_H */
diff --git a/tools/kvm/include/kvm/mutex.h b/tools/kvm/include/kvm/mutex.h
new file mode 100644 (file)
index 0000000..3286cea
--- /dev/null
@@ -0,0 +1,33 @@
+#ifndef KVM__MUTEX_H
+#define KVM__MUTEX_H
+
+#include <pthread.h>
+
+#include "kvm/util.h"
+
+/*
+ * Kernel-alike mutex API - to make it easier for kernel developers
+ * to write user-space code! :-)
+ */
+
+#define DEFINE_MUTEX(mutex) pthread_mutex_t mutex = PTHREAD_MUTEX_INITIALIZER
+
+static inline void mutex_init(pthread_mutex_t *mutex)
+{
+       if (pthread_mutex_init(mutex, NULL) != 0)
+               die("unexpected pthread_mutex_init() failure!");
+}
+
+static inline void mutex_lock(pthread_mutex_t *mutex)
+{
+       if (pthread_mutex_lock(mutex) != 0)
+               die("unexpected pthread_mutex_lock() failure!");
+}
+
+static inline void mutex_unlock(pthread_mutex_t *mutex)
+{
+       if (pthread_mutex_unlock(mutex) != 0)
+               die("unexpected pthread_mutex_unlock() failure!");
+}
+
+#endif /* KVM__MUTEX_H */
diff --git a/tools/kvm/include/kvm/parse-options.h b/tools/kvm/include/kvm/parse-options.h
new file mode 100644 (file)
index 0000000..7886ff7
--- /dev/null
@@ -0,0 +1,214 @@
+#ifndef __PARSE_OPTIONS_H__
+#define __PARSE_OPTIONS_H__
+
+#include <inttypes.h>
+#include <kvm/util.h>
+
+enum parse_opt_type {
+       /* special types */
+       OPTION_END,
+       OPTION_ARGUMENT,
+       OPTION_GROUP,
+       /* options with no arguments */
+       OPTION_BIT,
+       OPTION_BOOLEAN,
+       OPTION_INCR,
+       OPTION_SET_UINT,
+       OPTION_SET_PTR,
+       /* options with arguments (usually) */
+       OPTION_STRING,
+       OPTION_INTEGER,
+       OPTION_LONG,
+       OPTION_CALLBACK,
+       OPTION_U64,
+       OPTION_UINTEGER,
+};
+
+enum parse_opt_flags {
+       PARSE_OPT_KEEP_DASHDASH = 1,
+       PARSE_OPT_STOP_AT_NON_OPTION = 2,
+       PARSE_OPT_KEEP_ARGV0 = 4,
+       PARSE_OPT_KEEP_UNKNOWN = 8,
+       PARSE_OPT_NO_INTERNAL_HELP = 16,
+};
+
+enum parse_opt_option_flags {
+       PARSE_OPT_OPTARG  = 1,
+       PARSE_OPT_NOARG   = 2,
+       PARSE_OPT_NONEG   = 4,
+       PARSE_OPT_HIDDEN  = 8,
+       PARSE_OPT_LASTARG_DEFAULT = 16,
+};
+
+struct option;
+typedef int parse_opt_cb(const struct option *, const char *arg, int unset);
+/*
+ * `type`::
+ *   holds the type of the option, you must have an OPTION_END last in your
+ *   array.
+ *
+ * `short_name`::
+ *   the character to use as a short option name, '\0' if none.
+ *
+ * `long_name`::
+ *   the long option name, without the leading dashes, NULL if none.
+ *
+ * `value`::
+ *   stores pointers to the values to be filled.
+ *
+ * `argh`::
+ *   token to explain the kind of argument this option wants. Keep it
+ *   homogeneous across the repository.
+ *
+ * `help`::
+ *   the short help associated to what the option does.
+ *   Must never be NULL (except for OPTION_END).
+ *   OPTION_GROUP uses this pointer to store the group header.
+ *
+ * `flags`::
+ *   mask of parse_opt_option_flags.
+ *   PARSE_OPT_OPTARG: says that the argument is optional (not for BOOLEANs)
+ *   PARSE_OPT_NOARG: says that this option takes no argument, for CALLBACKs
+ *   PARSE_OPT_NONEG: says that this option cannot be negated
+ *   PARSE_OPT_HIDDEN: this option is skipped in the default usage, and only
+ *                    shown in the long one.
+ *
+ * `callback`::
+ *   pointer to the callback to use for OPTION_CALLBACK.
+ *
+ * `defval`::
+ *   default value to fill (*->value) with for PARSE_OPT_OPTARG.
+ *   OPTION_{BIT,SET_UINT,SET_PTR} store the {mask,integer,pointer} to put in
+ *   the value when met.
+ *   CALLBACKS can use it like they want.
+ */
+struct option {
+       enum parse_opt_type type;
+       int short_name;
+       const char *long_name;
+       void *value;
+       const char *argh;
+       const char *help;
+
+       int flags;
+       parse_opt_cb *callback;
+       intptr_t defval;
+};
+
+#define BUILD_BUG_ON_ZERO(e) (sizeof(struct { int:-!!(e); }))
+#define check_vtype(v, type) \
+       (BUILD_BUG_ON_ZERO(!__builtin_types_compatible_p(typeof(v), type)) + v)
+
+#define OPT_INTEGER(s, l, v, h)             \
+{                                           \
+       .type = OPTION_INTEGER,             \
+       .short_name = (s),                  \
+       .long_name = (l),                   \
+       .value = check_vtype(v, int *),     \
+       .help = (h)                         \
+}
+
+#define OPT_U64(s, l, v, h)                 \
+{                                           \
+       .type = OPTION_U64,                 \
+       .short_name = (s),                  \
+       .long_name = (l),                   \
+       .value = check_vtype(v, u64 *),     \
+       .help = (h)                         \
+}
+
+#define OPT_STRING(s, l, v, a, h)           \
+{                                           \
+       .type = OPTION_STRING,              \
+       .short_name = (s),                  \
+       .long_name = (l),                   \
+       .value = check_vtype(v, const char **), (a), \
+       .help = (h)                         \
+}
+
+#define OPT_BOOLEAN(s, l, v, h)             \
+{                                           \
+       .type = OPTION_BOOLEAN,             \
+       .short_name = (s),                  \
+       .long_name = (l),                   \
+       .value = check_vtype(v, bool *),    \
+       .help = (h)                         \
+}
+
+#define OPT_INCR(s, l, v, h)                \
+{                                           \
+       .type = OPTION_INCR,                \
+       .short_name = (s),                  \
+       .long_name = (l),                   \
+       .value = check_vtype(v, int *),     \
+       .help = (h)                         \
+}
+
+#define OPT_GROUP(h)                        \
+{                                           \
+       .type = OPTION_GROUP,               \
+       .help = (h)                         \
+}
+
+#define OPT_CALLBACK(s, l, v, a, h, f)      \
+{                                          \
+       .type = OPTION_CALLBACK,            \
+       .short_name = (s),                  \
+       .long_name = (l),                   \
+       .value = (v),                       \
+       (a),                                \
+       .help = (h),                        \
+       .callback = (f)                     \
+}
+
+#define OPT_CALLBACK_NOOPT(s, l, v, a, h, f) \
+{                                          \
+       .type = OPTION_CALLBACK,            \
+       .short_name = (s),                  \
+       .long_name = (l),                   \
+       .value = (v),                       \
+       (a),                                \
+       .help = (h),                        \
+       .callback = (f),                    \
+       .flags = PARSE_OPT_NOARG            \
+}
+
+#define OPT_CALLBACK_DEFAULT(s, l, v, a, h, f, d) \
+{                                          \
+       .type = OPTION_CALLBACK,            \
+       .short_name = (s),                  \
+       .long_name = (l),                   \
+       .value = (v), (a),                  \
+       .help = (h),                        \
+       .callback = (f),                    \
+       .defval = (intptr_t)d,              \
+       .flags = PARSE_OPT_LASTARG_DEFAULT  \
+}
+
+#define OPT_END() { .type = OPTION_END }
+
+enum {
+       PARSE_OPT_HELP = -1,
+       PARSE_OPT_DONE,
+       PARSE_OPT_UNKNOWN,
+};
+
+/*
+ * It's okay for the caller to consume argv/argc in the usual way.
+ * Other fields of that structure are private to parse-options and should not
+ * be modified in any way.
+ */
+struct parse_opt_ctx_t {
+       const char **argv;
+       const char **out;
+       int argc, cpidx;
+       const char *opt;
+       int flags;
+};
+
+/* global functions */
+void usage_with_options(const char * const *usagestr,
+               const struct option *opts) NORETURN;
+int parse_options(int argc, const char **argv, const struct option *options,
+               const char * const usagestr[], int flags);
+#endif
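
To make the field comments above concrete, a minimal option table; the command and its two options are invented for illustration:

    static const char * const example_usage[] = {
            "lkvm example [options]",
            NULL
    };

    int example_parse(int argc, const char **argv)
    {
            static const char *kernel;
            static bool verbose;
            const struct option options[] = {
                    OPT_STRING('k', "kernel", &kernel, "path", "kernel image"),
                    OPT_BOOLEAN('v', "verbose", &verbose, "be verbose"),
                    OPT_END()
            };

            return parse_options(argc, argv, options, example_usage, 0);
    }
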
diff --git a/tools/kvm/include/kvm/pci-shmem.h b/tools/kvm/include/kvm/pci-shmem.h
new file mode 100644 (file)
index 0000000..599ab37
--- /dev/null
@@ -0,0 +1,28 @@
+#ifndef KVM__PCI_SHMEM_H
+#define KVM__PCI_SHMEM_H
+
+#include <linux/types.h>
+#include <linux/list.h>
+
+#define SHMEM_DEFAULT_SIZE (16 << MB_SHIFT)
+#define SHMEM_DEFAULT_ADDR (0xc8000000)
+#define SHMEM_DEFAULT_HANDLE "/kvm_shmem"
+
+struct kvm;
+struct shmem_info;
+
+struct shmem_info {
+       u64 phys_addr;
+       u64 size;
+       char *handle;
+       int create;
+};
+
+int pci_shmem__init(struct kvm *kvm);
+int pci_shmem__register_mem(struct shmem_info *si);
+
+int pci_shmem__get_local_irqfd(struct kvm *kvm);
+int pci_shmem__add_client(struct kvm *kvm, u32 id, int fd);
+int pci_shmem__remove_client(struct kvm *kvm, u32 id);
+
+#endif
diff --git a/tools/kvm/include/kvm/pci.h b/tools/kvm/include/kvm/pci.h
new file mode 100644 (file)
index 0000000..26639b5
--- /dev/null
@@ -0,0 +1,95 @@
+#ifndef KVM__PCI_H
+#define KVM__PCI_H
+
+#include <linux/types.h>
+#include <linux/kvm.h>
+#include <linux/pci_regs.h>
+#include <endian.h>
+
+#include "kvm/kvm.h"
+#include "kvm/msi.h"
+
+#define PCI_MAX_DEVICES                256
+/*
+ * PCI Configuration Mechanism #1 I/O ports. See Section 3.7.4.1.
+ * ("Configuration Mechanism #1") of the PCI Local Bus Specification 2.1 for
+ * details.
+ */
+#define PCI_CONFIG_ADDRESS     0xcf8
+#define PCI_CONFIG_DATA                0xcfc
+#define PCI_CONFIG_BUS_FORWARD 0xcfa
+#define PCI_IO_SIZE            0x100
+
+union pci_config_address {
+       struct {
+#if __BYTE_ORDER == __LITTLE_ENDIAN
+               unsigned        reg_offset      : 2;            /* 1  .. 0  */
+               unsigned        register_number : 6;            /* 7  .. 2  */
+               unsigned        function_number : 3;            /* 10 .. 8  */
+               unsigned        device_number   : 5;            /* 15 .. 11 */
+               unsigned        bus_number      : 8;            /* 23 .. 16 */
+               unsigned        reserved        : 7;            /* 30 .. 24 */
+               unsigned        enable_bit      : 1;            /* 31       */
+#else
+               unsigned        enable_bit      : 1;            /* 31       */
+               unsigned        reserved        : 7;            /* 30 .. 24 */
+               unsigned        bus_number      : 8;            /* 23 .. 16 */
+               unsigned        device_number   : 5;            /* 15 .. 11 */
+               unsigned        function_number : 3;            /* 10 .. 8  */
+               unsigned        register_number : 6;            /* 7  .. 2  */
+               unsigned        reg_offset      : 2;            /* 1  .. 0  */
+#endif
+       };
+       u32 w;
+};
+
+struct msix_table {
+       struct msi_msg msg;
+       u32 ctrl;
+};
+
+struct msix_cap {
+       u8 cap;
+       u8 next;
+       u16 ctrl;
+       u32 table_offset;
+       u32 pba_offset;
+};
+
+struct pci_device_header {
+       u16             vendor_id;
+       u16             device_id;
+       u16             command;
+       u16             status;
+       u8              revision_id;
+       u8              class[3];
+       u8              cacheline_size;
+       u8              latency_timer;
+       u8              header_type;
+       u8              bist;
+       u32             bar[6];
+       u32             card_bus;
+       u16             subsys_vendor_id;
+       u16             subsys_id;
+       u32             exp_rom_bar;
+       u8              capabilities;
+       u8              reserved1[3];
+       u32             reserved2;
+       u8              irq_line;
+       u8              irq_pin;
+       u8              min_gnt;
+       u8              max_lat;
+       struct msix_cap msix;
+       u8              empty[136]; /* Rest of PCI config space */
+       u32             bar_size[6];
+} __attribute__((packed));
+
+int pci__init(struct kvm *kvm);
+int pci__exit(struct kvm *kvm);
+int pci__register(struct pci_device_header *dev, u8 dev_num);
+struct pci_device_header *pci__find_dev(u8 dev_num);
+u32 pci_get_io_space_block(u32 size);
+void pci__config_wr(struct kvm *kvm, union pci_config_address addr, void *data, int size);
+void pci__config_rd(struct kvm *kvm, union pci_config_address addr, void *data, int size);
+
+#endif /* KVM__PCI_H */
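As a worked example of Configuration Mechanism #1 described above: the guest writes an address dword to port 0xCF8 and then accesses the selected register through 0xCFC. Decoding that dword with the union is straightforward; the value below is illustrative.

union pci_config_address addr;

addr.w = 0x80001808;	/* enable=1, bus 0, device 3, function 0, register 2 */

if (addr.enable_bit) {
	u8  bus = addr.bus_number;		/* 0 */
	u8  dev = addr.device_number;		/* 3 */
	u8  fn  = addr.function_number;		/* 0 */
	u16 off = addr.register_number << 2;	/* config-space byte offset 0x08 */
	/* ... dispatch to pci__config_rd()/pci__config_wr() ... */
}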
diff --git a/tools/kvm/include/kvm/qcow.h b/tools/kvm/include/kvm/qcow.h
new file mode 100644 (file)
index 0000000..e032a1e
--- /dev/null
@@ -0,0 +1,133 @@
+#ifndef KVM__QCOW_H
+#define KVM__QCOW_H
+
+#include "kvm/mutex.h"
+
+#include <linux/types.h>
+#include <stdbool.h>
+#include <linux/rbtree.h>
+#include <linux/list.h>
+
+#define QCOW_MAGIC             (('Q' << 24) | ('F' << 16) | ('I' << 8) | 0xfb)
+
+#define QCOW1_VERSION          1
+#define QCOW2_VERSION          2
+
+#define QCOW1_OFLAG_COMPRESSED (1ULL << 63)
+
+#define QCOW2_OFLAG_COPIED     (1ULL << 63)
+#define QCOW2_OFLAG_COMPRESSED (1ULL << 62)
+
+#define QCOW2_OFLAGS_MASK      (QCOW2_OFLAG_COPIED|QCOW2_OFLAG_COMPRESSED)
+
+#define QCOW2_OFFSET_MASK      (~QCOW2_OFLAGS_MASK)
+
+#define MAX_CACHE_NODES         32
+
+struct qcow_l2_table {
+       u64                             offset;
+       struct rb_node                  node;
+       struct list_head                list;
+       u8                              dirty;
+       u64                             table[];
+};
+
+struct qcow_l1_table {
+       u32                             table_size;
+       u64                             *l1_table;
+
+       /* Level2 caching data structures */
+       struct rb_root                  root;
+       struct list_head                lru_list;
+       int                             nr_cached;
+};
+
+#define QCOW_REFCOUNT_BLOCK_SHIFT      1
+
+struct qcow_refcount_block {
+       u64                             offset;
+       struct rb_node                  node;
+       struct list_head                list;
+       u64                             size;
+       u8                              dirty;
+       u16                             entries[];
+};
+
+struct qcow_refcount_table {
+       u32                             rf_size;
+       u64                             *rf_table;
+
+       /* Refcount block caching data structures */
+       struct rb_root                  root;
+       struct list_head                lru_list;
+       int                             nr_cached;
+};
+
+struct qcow_header {
+       u64                             size;   /* in bytes */
+       u64                             l1_table_offset;
+       u32                             l1_size;
+       u8                              cluster_bits;
+       u8                              l2_bits;
+       u64                             refcount_table_offset;
+       u32                             refcount_table_size;
+};
+
+struct qcow {
+       pthread_mutex_t                 mutex;
+       struct qcow_header              *header;
+       struct qcow_l1_table            table;
+       struct qcow_refcount_table      refcount_table;
+       int                             fd;
+       int                             csize_shift;
+       int                             csize_mask;
+       u32                             version;
+       u64                             cluster_size;
+       u64                             cluster_offset_mask;
+       u64                             free_clust_idx;
+       void                            *cluster_cache;
+       void                            *cluster_data;
+       void                            *copy_buff;
+};
+
+struct qcow1_header_disk {
+       u32                             magic;
+       u32                             version;
+
+       u64                             backing_file_offset;
+       u32                             backing_file_size;
+       u32                             mtime;
+
+       u64                             size;   /* in bytes */
+
+       u8                              cluster_bits;
+       u8                              l2_bits;
+       u32                             crypt_method;
+
+       u64                             l1_table_offset;
+};
+
+struct qcow2_header_disk {
+       u32                             magic;
+       u32                             version;
+
+       u64                             backing_file_offset;
+       u32                             backing_file_size;
+
+       u32                             cluster_bits;
+       u64                             size;   /* in bytes */
+       u32                             crypt_method;
+
+       u32                             l1_size;
+       u64                             l1_table_offset;
+
+       u64                             refcount_table_offset;
+       u32                             refcount_table_clusters;
+
+       u32                             nb_snapshots;
+       u64                             snapshots_offset;
+};
+
+struct disk_image *qcow_probe(int fd, bool readonly);
+
+#endif /* KVM__QCOW_H */
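A short sketch of the arithmetic these fields imply: cluster_bits fixes the cluster size, and the top bits of an L2 table entry carry the QCOW2_OFLAG_* flags. The values below are illustrative, and the derivation mirrors what the qcow code presumably computes into struct qcow.

u8  cluster_bits = 16;				/* 64 KiB clusters, a common default */
u64 cluster_size = 1ULL << cluster_bits;	/* 65536 */
u64 cluster_offset_mask = cluster_size - 1;

u64 l2_entry   = 0x8000000000050000ULL;			/* illustrative L2 entry */
u64 host_off   = l2_entry & QCOW2_OFFSET_MASK;		/* 0x50000 */
int copied     = !!(l2_entry & QCOW2_OFLAG_COPIED);	/* 1 */
int compressed = !!(l2_entry & QCOW2_OFLAG_COMPRESSED);	/* 0 */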
diff --git a/tools/kvm/include/kvm/rbtree-interval.h b/tools/kvm/include/kvm/rbtree-interval.h
new file mode 100644 (file)
index 0000000..13245ba
--- /dev/null
@@ -0,0 +1,28 @@
+#ifndef KVM__INTERVAL_RBTREE_H
+#define KVM__INTERVAL_RBTREE_H
+
+#include <linux/rbtree.h>
+#include <linux/types.h>
+
+#define RB_INT_INIT(l, h) (struct rb_int_node){.low = l, .high = h}
+#define rb_int(n) rb_entry(n, struct rb_int_node, node)
+
+struct rb_int_node {
+       struct rb_node  node;
+       u64             low;
+       u64             high;
+
+       /* max_high stores the highest 'high' of its two children. */
+       u64             max_high;
+};
+
+/* Return the rb_int_node interval in which 'point' is located. */
+struct rb_int_node *rb_int_search_single(struct rb_root *root, u64 point);
+
+/* Return the rb_int_node whose interval contains the low..high range. */
+struct rb_int_node *rb_int_search_range(struct rb_root *root, u64 low, u64 high);
+
+int rb_int_insert(struct rb_root *root, struct rb_int_node *data);
+void rb_int_erase(struct rb_root *root, struct rb_int_node *node);
+
+#endif
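Usage sketch for the interval-tree API above; the intervals are illustrative. In practice the node is embedded in a larger struct and recovered with rb_int() plus rb_entry(), exactly as ioport.c later in this series does with its ioport_node() wrapper.

struct rb_root root = RB_ROOT;
struct rb_int_node a = RB_INT_INIT(0x100, 0x1ff);
struct rb_int_node b = RB_INT_INIT(0x200, 0x2ff);

rb_int_insert(&root, &a);
rb_int_insert(&root, &b);

struct rb_int_node *hit = rb_int_search_single(&root, 0x180);	/* returns &a */

rb_int_erase(&root, &a);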
diff --git a/tools/kvm/include/kvm/read-write.h b/tools/kvm/include/kvm/read-write.h
new file mode 100644 (file)
index 0000000..67571f9
--- /dev/null
@@ -0,0 +1,43 @@
+#ifndef KVM_READ_WRITE_H
+#define KVM_READ_WRITE_H
+
+#include <sys/types.h>
+#include <sys/uio.h>
+#include <unistd.h>
+
+#ifdef CONFIG_HAS_AIO
+#include <libaio.h>
+#endif
+
+ssize_t xread(int fd, void *buf, size_t count);
+ssize_t xwrite(int fd, const void *buf, size_t count);
+
+ssize_t read_in_full(int fd, void *buf, size_t count);
+ssize_t write_in_full(int fd, const void *buf, size_t count);
+
+ssize_t xpread(int fd, void *buf, size_t count, off_t offset);
+ssize_t xpwrite(int fd, const void *buf, size_t count, off_t offset);
+
+ssize_t pread_in_full(int fd, void *buf, size_t count, off_t offset);
+ssize_t pwrite_in_full(int fd, const void *buf, size_t count, off_t offset);
+
+ssize_t xreadv(int fd, const struct iovec *iov, int iovcnt);
+ssize_t xwritev(int fd, const struct iovec *iov, int iovcnt);
+
+ssize_t readv_in_full(int fd, const struct iovec *iov, int iovcnt);
+ssize_t writev_in_full(int fd, const struct iovec *iov, int iovcnt);
+
+ssize_t xpreadv(int fd, const struct iovec *iov, int iovcnt, off_t offset);
+ssize_t xpwritev(int fd, const struct iovec *iov, int iovcnt, off_t offset);
+
+ssize_t preadv_in_full(int fd, const struct iovec *iov, int iovcnt, off_t offset);
+ssize_t pwritev_in_full(int fd, const struct iovec *iov, int iovcnt, off_t offset);
+
+#ifdef CONFIG_HAS_AIO
+int aio_preadv(io_context_t ctx, struct iocb *iocb, int fd, const struct iovec *iov, int iovcnt,
+               off_t offset, int ev, void *param);
+int aio_pwritev(io_context_t ctx, struct iocb *iocb, int fd, const struct iovec *iov, int iovcnt,
+               off_t offset, int ev, void *param);
+#endif
+
+#endif /* KVM_READ_WRITE_H */
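The declarations suggest two families: x*() helpers that restart on EINTR but may still return short, and *_in_full() helpers that loop until the full count. A minimal sketch of the latter contract, assuming exactly that behavior (this is not the tree's implementation):

#include <errno.h>
#include <unistd.h>

static ssize_t read_in_full_sketch(int fd, void *buf, size_t count)
{
	char *p = buf;
	size_t left = count;

	while (left) {
		ssize_t n = read(fd, p, left);

		if (n < 0 && errno == EINTR)
			continue;		/* restart, xread()-style */
		if (n < 0)
			return -1;		/* hard error */
		if (n == 0)
			break;			/* EOF: return a short count */
		p += n;
		left -= n;
	}
	return count - left;
}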
diff --git a/tools/kvm/include/kvm/rtc.h b/tools/kvm/include/kvm/rtc.h
new file mode 100644 (file)
index 0000000..6aa9299
--- /dev/null
@@ -0,0 +1,9 @@
+#ifndef KVM__RTC_H
+#define KVM__RTC_H
+
+struct kvm;
+
+int rtc__init(struct kvm *kvm);
+int rtc__exit(struct kvm *kvm);
+
+#endif /* KVM__RTC_H */
diff --git a/tools/kvm/include/kvm/rwsem.h b/tools/kvm/include/kvm/rwsem.h
new file mode 100644 (file)
index 0000000..75a22f8
--- /dev/null
@@ -0,0 +1,39 @@
+#ifndef KVM__RWSEM_H
+#define KVM__RWSEM_H
+
+#include <pthread.h>
+
+#include "kvm/util.h"
+
+/*
+ * Kernel-like rwsem API - to make it easier for kernel developers
+ * to write user-space code! :-)
+ */
+
+#define DECLARE_RWSEM(sem) pthread_rwlock_t sem = PTHREAD_RWLOCK_INITIALIZER
+
+static inline void down_read(pthread_rwlock_t *rwsem)
+{
+       if (pthread_rwlock_rdlock(rwsem) != 0)
+               die("unexpected pthread_rwlock_rdlock() failure!");
+}
+
+static inline void down_write(pthread_rwlock_t *rwsem)
+{
+       if (pthread_rwlock_wrlock(rwsem) != 0)
+               die("unexpected pthread_rwlock_wrlock() failure!");
+}
+
+static inline void up_read(pthread_rwlock_t *rwsem)
+{
+       if (pthread_rwlock_unlock(rwsem) != 0)
+               die("unexpected pthread_rwlock_unlock() failure!");
+}
+
+static inline void up_write(pthread_rwlock_t *rwsem)
+{
+       if (pthread_rwlock_unlock(rwsem) != 0)
+               die("unexpected pthread_rwlock_unlock() failure!");
+}
+
+#endif /* KVM__RWSEM_H */
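Usage follows the kernel idiom directly; a small sketch with a hypothetical shared table:

static DECLARE_RWSEM(table_sem);

static void lookup_entry(void)
{
	down_read(&table_sem);
	/* ... read the shared table ... */
	up_read(&table_sem);
}

static void update_entry(void)
{
	down_write(&table_sem);
	/* ... modify the shared table ... */
	up_write(&table_sem);
}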
diff --git a/tools/kvm/include/kvm/sdl.h b/tools/kvm/include/kvm/sdl.h
new file mode 100644 (file)
index 0000000..36e5986
--- /dev/null
@@ -0,0 +1,22 @@
+#ifndef KVM__SDL_H
+#define KVM__SDL_H
+
+#include "kvm/util.h"
+
+struct framebuffer;
+
+#ifdef CONFIG_HAS_SDL
+int sdl__init(struct framebuffer *fb);
+int sdl__exit(struct framebuffer *fb);
+#else
+static inline int sdl__init(struct framebuffer *fb)
+{
+       die("SDL support not compiled in. (install the SDL-dev[el] package)");
+}
+static inline int sdl__exit(struct framebuffer *fb)
+{
+       die("SDL support not compiled in. (install the SDL-dev[el] package)");
+}
+#endif
+
+#endif /* KVM__SDL_H */
diff --git a/tools/kvm/include/kvm/segment.h b/tools/kvm/include/kvm/segment.h
new file mode 100644 (file)
index 0000000..9387a82
--- /dev/null
@@ -0,0 +1,21 @@
+#ifndef KVM_SEGMENT_H
+#define KVM_SEGMENT_H
+
+#include <linux/types.h>
+
+static inline u32 segment_to_flat(u16 selector, u16 offset)
+{
+       return ((u32)selector << 4) + (u32) offset;
+}
+
+static inline u16 flat_to_seg16(u32 address)
+{
+       return address >> 4;
+}
+
+static inline u16 flat_to_off16(u32 address, u32 segment)
+{
+       return address - (segment << 4);
+}
+
+#endif /* KVM_SEGMENT_H */
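A worked example of the real-mode address math above. Note that the seg16/off16 encoding is not unique, so a round trip need not reproduce the original segment:offset pair.

u32 flat = segment_to_flat(0x1000, 0x5678);	/* 0x10000 + 0x5678 = 0x15678 */
u16 seg  = flat_to_seg16(flat);			/* 0x15678 >> 4   = 0x1567  */
u16 off  = flat_to_off16(flat, seg);		/* 0x15678 - 0x15670 = 0x0008 */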
diff --git a/tools/kvm/include/kvm/strbuf.h b/tools/kvm/include/kvm/strbuf.h
new file mode 100644 (file)
index 0000000..2beefbc
--- /dev/null
@@ -0,0 +1,20 @@
+#ifndef __STRBUF_H__
+#define __STRBUF_H__
+
+#include <sys/types.h>
+#include <string.h>
+
+int prefixcmp(const char *str, const char *prefix);
+
+extern size_t strlcat(char *dest, const char *src, size_t count);
+extern size_t strlcpy(char *dest, const char *src, size_t size);
+
+/* some inline functions */
+
+static inline const char *skip_prefix(const char *str, const char *prefix)
+{
+       size_t len = strlen(prefix);
+       return strncmp(str, prefix, len) ? NULL : str + len;
+}
+
+#endif
diff --git a/tools/kvm/include/kvm/symbol.h b/tools/kvm/include/kvm/symbol.h
new file mode 100644 (file)
index 0000000..725bbaf
--- /dev/null
@@ -0,0 +1,30 @@
+#ifndef KVM__SYMBOL_H
+#define KVM__SYMBOL_H
+
+#include <stddef.h>
+#include <string.h>
+
+struct kvm;
+
+#define SYMBOL_DEFAULT_UNKNOWN "<unknown>"
+
+#ifdef CONFIG_HAS_BFD
+
+int symbol_init(struct kvm *kvm);
+int symbol_exit(struct kvm *kvm);
+char *symbol_lookup(struct kvm *kvm, unsigned long addr, char *sym, size_t size);
+
+#else
+
+static inline int symbol_init(struct kvm *kvm) { return 0; }
+static inline char *symbol_lookup(struct kvm *kvm, unsigned long addr, char *sym, size_t size)
+{
+       char *s = strncpy(sym, SYMBOL_DEFAULT_UNKNOWN, size);
+       sym[size - 1] = '\0';
+       return s;
+}
+static inline int symbol_exit(struct kvm *kvm) { return 0; }
+
+#endif
+
+#endif /* KVM__SYMBOL_H */
diff --git a/tools/kvm/include/kvm/term.h b/tools/kvm/include/kvm/term.h
new file mode 100644 (file)
index 0000000..a6a9822
--- /dev/null
@@ -0,0 +1,20 @@
+#ifndef KVM__TERM_H
+#define KVM__TERM_H
+
+#include <sys/uio.h>
+#include <stdbool.h>
+
+#define CONSOLE_8250   1
+#define CONSOLE_VIRTIO 2
+#define CONSOLE_HV     3
+
+int term_putc_iov(int who, struct iovec *iov, int iovcnt, int term);
+int term_getc_iov(int who, struct iovec *iov, int iovcnt, int term);
+int term_putc(int who, char *addr, int cnt, int term);
+int term_getc(int who, int term);
+
+bool term_readable(int who, int term);
+void term_set_tty(int term);
+void term_init(void);
+
+#endif /* KVM__TERM_H */
diff --git a/tools/kvm/include/kvm/threadpool.h b/tools/kvm/include/kvm/threadpool.h
new file mode 100644 (file)
index 0000000..768239f
--- /dev/null
@@ -0,0 +1,37 @@
+#ifndef KVM__THREADPOOL_H
+#define KVM__THREADPOOL_H
+
+#include "kvm/mutex.h"
+
+#include <linux/list.h>
+
+struct kvm;
+
+typedef void (*kvm_thread_callback_fn_t)(struct kvm *kvm, void *data);
+
+struct thread_pool__job {
+       kvm_thread_callback_fn_t        callback;
+       struct kvm                      *kvm;
+       void                            *data;
+
+       int                             signalcount;
+       pthread_mutex_t                 mutex;
+
+       struct list_head                queue;
+};
+
+static inline void thread_pool__init_job(struct thread_pool__job *job, struct kvm *kvm, kvm_thread_callback_fn_t callback, void *data)
+{
+       *job = (struct thread_pool__job) {
+               .kvm            = kvm,
+               .callback       = callback,
+               .data           = data,
+               .mutex          = PTHREAD_MUTEX_INITIALIZER,
+       };
+}
+
+int thread_pool__init(unsigned long thread_count);
+
+void thread_pool__do_job(struct thread_pool__job *job);
+
+#endif
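A sketch of the intended flow: initialize a job once, then signal it each time there is work. The callback, the device functions, and the NULL data are hypothetical; only init_job/do_job come from the header above.

static void my_work(struct kvm *kvm, void *data)
{
	/* runs on a pool thread, once per queued signal */
}

static struct thread_pool__job job;

static void my_dev_setup(struct kvm *kvm)
{
	thread_pool__init_job(&job, kvm, my_work, NULL);
}

static void my_dev_notify(void)
{
	thread_pool__do_job(&job);	/* wake the pool to run my_work() */
}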
diff --git a/tools/kvm/include/kvm/types.h b/tools/kvm/include/kvm/types.h
new file mode 100644 (file)
index 0000000..0cbc5fb
--- /dev/null
@@ -0,0 +1,7 @@
+#ifndef KVM_TYPES_H
+#define KVM_TYPES_H
+
+/* FIXME: include/linux/if_tun.h and include/linux/if_ether.h complain */
+#define __be16 u16
+
+#endif /* KVM_TYPES_H */
diff --git a/tools/kvm/include/kvm/uip.h b/tools/kvm/include/kvm/uip.h
new file mode 100644 (file)
index 0000000..9af0110
--- /dev/null
@@ -0,0 +1,360 @@
+#ifndef KVM__UIP_H
+#define KVM__UIP_H
+
+#include "linux/types.h"
+#include "kvm/mutex.h"
+
+#include <netinet/in.h>
+#include <sys/uio.h>
+
+#define UIP_BUF_STATUS_FREE    0
+#define UIP_BUF_STATUS_INUSE   1
+#define UIP_BUF_STATUS_USED    2
+
+#define UIP_ETH_P_IP           0x0800
+#define UIP_ETH_P_ARP          0x0806
+
+#define UIP_IP_VER_4           0x40
+#define UIP_IP_HDR_LEN         0x05
+#define UIP_IP_TTL             0x40
+#define UIP_IP_P_UDP           0x11
+#define UIP_IP_P_TCP           0x06
+#define UIP_IP_P_ICMP          0x01
+
+#define UIP_TCP_HDR_LEN                0x50
+#define UIP_TCP_WIN_SIZE       14600
+#define UIP_TCP_FLAG_FIN       1
+#define UIP_TCP_FLAG_SYN       2
+#define UIP_TCP_FLAG_RST       4
+#define UIP_TCP_FLAG_PSH       8
+#define UIP_TCP_FLAG_ACK       16
+#define UIP_TCP_FLAG_URG       32
+
+#define UIP_BOOTP_VENDOR_SPECIFIC_LEN  64
+#define UIP_BOOTP_MAX_PAYLOAD_LEN      300
+#define UIP_DHCP_VENDOR_SPECIFIC_LEN   312
+#define UIP_DHCP_PORT_SERVER           67
+#define UIP_DHCP_PORT_CLIENT           68
+#define UIP_DHCP_MACPAD_LEN            10
+#define UIP_DHCP_HOSTNAME_LEN          64
+#define UIP_DHCP_FILENAME_LEN          128
+#define UIP_DHCP_MAGIC_COOKIE          0x63825363
+#define UIP_DHCP_MAGIC_COOKIE_LEN      4
+#define UIP_DHCP_LEASE_TIME            0x00003840
+#define UIP_DHCP_MAX_PAYLOAD_LEN       (UIP_BOOTP_MAX_PAYLOAD_LEN - UIP_BOOTP_VENDOR_SPECIFIC_LEN +  UIP_DHCP_VENDOR_SPECIFIC_LEN)
+#define UIP_DHCP_OPTION_LEN            (UIP_DHCP_VENDOR_SPECIFIC_LEN - UIP_DHCP_MAGIC_COOKIE_LEN)
+#define UIP_DHCP_DISCOVER              1
+#define UIP_DHCP_OFFER                 2
+#define UIP_DHCP_REQUEST               3
+#define UIP_DHCP_ACK                   5
+#define UIP_DHCP_MAX_DNS_SERVER_NR     3
+#define UIP_DHCP_MAX_DOMAIN_NAME_LEN   256
+#define UIP_DHCP_TAG_MSG_TYPE          53
+#define UIP_DHCP_TAG_MSG_TYPE_LEN      1
+#define UIP_DHCP_TAG_SERVER_ID         54
+#define UIP_DHCP_TAG_SERVER_ID_LEN     4
+#define UIP_DHCP_TAG_LEASE_TIME                51
+#define UIP_DHCP_TAG_LEASE_TIME_LEN    4
+#define UIP_DHCP_TAG_SUBMASK           1
+#define UIP_DHCP_TAG_SUBMASK_LEN       4
+#define UIP_DHCP_TAG_ROUTER            3
+#define UIP_DHCP_TAG_ROUTER_LEN                4
+#define UIP_DHCP_TAG_ROOT              17
+#define UIP_DHCP_TAG_ROOT_LEN          4
+#define UIP_DHCP_TAG_DNS_SERVER                6
+#define UIP_DHCP_TAG_DNS_SERVER_LEN    4
+#define UIP_DHCP_TAG_DOMAIN_NAME       15
+#define UIP_DHCP_TAG_END               255
+
+/*
+ * IP packet maximum len == 64 KBytes
+ * IP header == 20 Bytes
+ * TCP header == 20 Bytes
+ * UDP header == 8 Bytes
+ */
+#define UIP_MAX_TCP_PAYLOAD    (64*1024 - 20 - 20 - 1)
+#define UIP_MAX_UDP_PAYLOAD    (64*1024 - 20 -  8 - 1)
+
+struct uip_eth_addr {
+       u8 addr[6];
+};
+
+struct uip_eth {
+       struct uip_eth_addr dst;
+       struct uip_eth_addr src;
+       u16 type;
+} __attribute__((packed));
+
+struct uip_arp {
+       struct uip_eth eth;
+       u16 hwtype;
+       u16 proto;
+       u8 hwlen;
+       u8 protolen;
+       u16 op;
+       struct uip_eth_addr smac;
+       u32 sip;
+       struct uip_eth_addr dmac;
+       u32 dip;
+} __attribute__((packed));
+
+struct uip_ip {
+       struct uip_eth eth;
+       u8 vhl;
+       u8 tos;
+       /*
+        * len = IP hdr +  IP payload
+        */
+       u16 len;
+       u16 id;
+       u16 flgfrag;
+       u8 ttl;
+       u8 proto;
+       u16 csum;
+       u32 sip;
+       u32 dip;
+} __attribute__((packed));
+
+struct uip_icmp {
+       struct uip_ip ip;
+       u8 type;
+       u8 code;
+       u16 csum;
+       u16 id;
+       u16 seq;
+} __attribute__((packed));
+
+struct uip_udp {
+       /*
+        * FIXME: IP Options (IP hdr len > 20 bytes) are not supported
+        */
+       struct uip_ip ip;
+       u16 sport;
+       u16 dport;
+       /*
+        * len = UDP hdr +  UDP payload
+        */
+       u16 len;
+       u16 csum;
+       u8 payload[0];
+} __attribute__((packed));
+
+struct uip_tcp {
+       /*
+        * FIXME: IP Options (IP hdr len > 20 bytes) are not supported
+        */
+       struct uip_ip ip;
+       u16 sport;
+       u16 dport;
+       u32 seq;
+       u32 ack;
+       u8  off;
+       u8  flg;
+       u16 win;
+       u16 csum;
+       u16 urgent;
+} __attribute__((packed));
+
+struct uip_pseudo_hdr {
+       u32 sip;
+       u32 dip;
+       u8 zero;
+       u8 proto;
+       u16 len;
+} __attribute__((packed));
+
+struct uip_dhcp {
+       struct uip_udp udp;
+       u8 msg_type;
+       u8 hardware_type;
+       u8 hardware_len;
+       u8 hops;
+       u32 id;
+       u16 time;
+       u16 flg;
+       u32 client_ip;
+       u32 your_ip;
+       u32 server_ip;
+       u32 agent_ip;
+       struct uip_eth_addr client_mac;
+       u8 pad[UIP_DHCP_MACPAD_LEN];
+       u8 server_hostname[UIP_DHCP_HOSTNAME_LEN];
+       u8 boot_filename[UIP_DHCP_FILENAME_LEN];
+       u32 magic_cookie;
+       u8 option[UIP_DHCP_OPTION_LEN];
+} __attribute__((packed));
+
+struct uip_info {
+       struct list_head udp_socket_head;
+       struct list_head tcp_socket_head;
+       pthread_mutex_t udp_socket_lock;
+       pthread_mutex_t tcp_socket_lock;
+       struct uip_eth_addr guest_mac;
+       struct uip_eth_addr host_mac;
+       pthread_cond_t buf_free_cond;
+       pthread_cond_t buf_used_cond;
+       struct list_head buf_head;
+       pthread_mutex_t buf_lock;
+       pthread_t udp_thread;
+       int udp_epollfd;
+       int buf_free_nr;
+       int buf_used_nr;
+       u32 guest_ip;
+       u32 guest_netmask;
+       u32 host_ip;
+       u32 dns_ip[UIP_DHCP_MAX_DNS_SERVER_NR];
+       char *domain_name;
+       u32 buf_nr;
+};
+
+struct uip_buf {
+       struct list_head list;
+       struct uip_info *info;
+       int vnet_len;
+       int eth_len;
+       int status;
+       char *vnet;
+       char *eth;
+       int id;
+};
+
+struct uip_udp_socket {
+       struct sockaddr_in addr;
+       struct list_head list;
+       pthread_mutex_t *lock;
+       u32 dport, sport;
+       u32 dip, sip;
+       int fd;
+};
+
+struct uip_tcp_socket {
+       struct sockaddr_in addr;
+       struct list_head list;
+       struct uip_info *info;
+       pthread_cond_t  cond;
+       pthread_mutex_t *lock;
+       pthread_t thread;
+       u32 dport, sport;
+       u32 guest_acked;
+       u16 window_size;
+       /*
+        * Initial Sequence Number
+        */
+       u32 isn_server;
+       u32 isn_guest;
+       u32 ack_server;
+       u32 seq_server;
+       int write_done;
+       int read_done;
+       u32 dip, sip;
+       u8 *payload;
+       int fd;
+};
+
+struct uip_tx_arg {
+       struct virtio_net_hdr *vnet;
+       struct uip_info *info;
+       struct uip_eth *eth;
+       int vnet_len;
+       int eth_len;
+};
+
+static inline u16 uip_ip_hdrlen(struct uip_ip *ip)
+{
+       return (ip->vhl & 0x0f) * 4;
+}
+
+static inline u16 uip_ip_len(struct uip_ip *ip)
+{
+       return ntohs(ip->len); /* total length, network to host byte order */
+}
+
+static inline u16 uip_udp_hdrlen(struct uip_udp *udp)
+{
+       return 8; /* the UDP header is always 8 bytes */
+}
+
+static inline u16 uip_udp_len(struct uip_udp *udp)
+{
+       return ntohs(udp->len);
+}
+
+static inline u16 uip_tcp_hdrlen(struct uip_tcp *tcp)
+{
+       return (tcp->off >> 4) * 4;
+}
+
+static inline u16 uip_tcp_len(struct uip_tcp *tcp)
+{
+       struct uip_ip *ip;
+
+       ip = &tcp->ip;
+
+       return uip_ip_len(ip) - uip_ip_hdrlen(ip);
+}
+
+static inline u16 uip_tcp_payloadlen(struct uip_tcp *tcp)
+{
+       return uip_tcp_len(tcp) - uip_tcp_hdrlen(tcp);
+}
+
+static inline u8 *uip_tcp_payload(struct uip_tcp *tcp)
+{
+       return (u8 *)&tcp->sport + uip_tcp_hdrlen(tcp);
+}
+
+static inline bool uip_tcp_is_syn(struct uip_tcp *tcp)
+{
+       return (tcp->flg & UIP_TCP_FLAG_SYN) != 0;
+}
+
+static inline bool uip_tcp_is_fin(struct uip_tcp *tcp)
+{
+       return (tcp->flg & UIP_TCP_FLAG_FIN) != 0;
+}
+
+static inline u32 uip_tcp_isn(struct uip_tcp *tcp)
+{
+       return ntohl(tcp->seq);
+}
+
+static inline u32 uip_tcp_isn_alloc(void)
+{
+       /*
+        * FIXME: should increase every 4ms
+        */
+       return 10000000;
+}
+
+static inline u16 uip_eth_hdrlen(struct uip_eth *eth)
+{
+       return sizeof(*eth);
+}
+
+int uip_tx(struct iovec *iov, u16 out, struct uip_info *info);
+int uip_rx(struct iovec *iov, u16 in, struct uip_info *info);
+int uip_init(struct uip_info *info);
+
+int uip_tx_do_ipv4_udp_dhcp(struct uip_tx_arg *arg);
+int uip_tx_do_ipv4_icmp(struct uip_tx_arg *arg);
+int uip_tx_do_ipv4_tcp(struct uip_tx_arg *arg);
+int uip_tx_do_ipv4_udp(struct uip_tx_arg *arg);
+int uip_tx_do_ipv4(struct uip_tx_arg *arg);
+int uip_tx_do_arp(struct uip_tx_arg *arg);
+
+u16 uip_csum_icmp(struct uip_icmp *icmp);
+u16 uip_csum_udp(struct uip_udp *udp);
+u16 uip_csum_tcp(struct uip_tcp *tcp);
+u16 uip_csum_ip(struct uip_ip *ip);
+
+struct uip_buf *uip_buf_set_used(struct uip_info *info, struct uip_buf *buf);
+struct uip_buf *uip_buf_set_free(struct uip_info *info, struct uip_buf *buf);
+struct uip_buf *uip_buf_get_used(struct uip_info *info);
+struct uip_buf *uip_buf_get_free(struct uip_info *info);
+struct uip_buf *uip_buf_clone(struct uip_tx_arg *arg);
+
+int uip_udp_make_pkg(struct uip_info *info, struct uip_udp_socket *sk, struct uip_buf *buf, u8 *payload, int payload_len);
+bool uip_udp_is_dhcp(struct uip_udp *udp);
+
+int uip_dhcp_get_dns(struct uip_info *info);
+#endif /* KVM__UIP_H */
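A sketch of how the inline helpers above combine to walk a received IPv4/TCP frame, presumably mirroring what uip_tx_do_ipv4_tcp() does internally. The handler shape is hypothetical; arg->eth comes from struct uip_tx_arg above, and the frame is assumed already validated as ethernet+IPv4+TCP.

static void handle_tcp_sketch(struct uip_tx_arg *arg)
{
	struct uip_tcp *tcp = (struct uip_tcp *)arg->eth;

	u16 iphdr   = uip_ip_hdrlen(&tcp->ip);	/* 20, since IP options are unsupported */
	u16 seglen  = uip_tcp_len(tcp);		/* IP total length minus IP header */
	u16 datalen = uip_tcp_payloadlen(tcp);	/* minus the TCP header as well */
	u8 *data    = uip_tcp_payload(tcp);

	if (uip_tcp_is_syn(tcp)) {
		u32 isn = uip_tcp_isn(tcp);	/* guest's initial sequence number */
		/* ... handshake handling ... */
	}
}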
diff --git a/tools/kvm/include/kvm/util.h b/tools/kvm/include/kvm/util.h
new file mode 100644 (file)
index 0000000..0df9f0d
--- /dev/null
@@ -0,0 +1,97 @@
+#include <linux/stringify.h>
+
+#ifndef KVM__UTIL_H
+#define KVM__UTIL_H
+
+#define ARRAY_SIZE(x) (sizeof(x) / sizeof((x)[0]))
+
+/*
+ * Some bits are stolen from the perf tool :)
+ */
+
+#include <unistd.h>
+#include <stdio.h>
+#include <stddef.h>
+#include <stdlib.h>
+#include <stdarg.h>
+#include <string.h>
+#include <stdbool.h>
+#include <signal.h>
+#include <errno.h>
+#include <limits.h>
+#include <sys/param.h>
+#include <sys/types.h>
+#include <linux/types.h>
+
+#ifdef __GNUC__
+#define NORETURN __attribute__((__noreturn__))
+#else
+#define NORETURN
+#ifndef __attribute__
+#define __attribute__(x)
+#endif
+#endif
+
+extern bool do_debug_print;
+
+#define PROT_RW (PROT_READ|PROT_WRITE)
+#define MAP_ANON_NORESERVE (MAP_PRIVATE | MAP_ANONYMOUS | MAP_NORESERVE)
+
+extern void die(const char *err, ...) NORETURN __attribute__((format (printf, 1, 2)));
+extern void die_perror(const char *s) NORETURN;
+extern int pr_err(const char *err, ...) __attribute__((format (printf, 1, 2)));
+extern void pr_warning(const char *err, ...) __attribute__((format (printf, 1, 2)));
+extern void pr_info(const char *err, ...) __attribute__((format (printf, 1, 2)));
+extern void set_die_routine(void (*routine)(const char *err, va_list params) NORETURN);
+
+#define pr_debug(fmt, ...)                                             \
+       do {                                                            \
+               if (do_debug_print)                                     \
+                       pr_info("(%s) %s:%d: " fmt, __FILE__,           \
+                               __func__, __LINE__, ##__VA_ARGS__);     \
+       } while (0)
+
+
+#define BUILD_BUG_ON(condition)        ((void)sizeof(char[1 - 2*!!(condition)]))
+
+#ifndef BUG_ON_HANDLER
+# define BUG_ON_HANDLER(condition)                                     \
+       do {                                                            \
+               if ((condition)) {                                      \
+                       pr_err("BUG at %s:%d", __FILE__, __LINE__);     \
+                       raise(SIGABRT);                                 \
+               }                                                       \
+       } while (0)
+#endif
+
+#define BUG_ON(condition)      BUG_ON_HANDLER((condition))
+
+#define DIE_IF(cnd)                                            \
+do {                                                           \
+       if (cnd)                                                \
+       die(" at (" __FILE__ ":" __stringify(__LINE__) "): "    \
+               __stringify(cnd) "\n");                         \
+} while (0)
+
+#define WARN_ON(condition) ({                                  \
+       int __ret_warn_on = !!(condition);                      \
+       if (__ret_warn_on)                                      \
+               pr_warning("(%s) %s:%d: failed condition: %s",  \
+                               __FILE__, __func__, __LINE__,   \
+                               __stringify(condition));        \
+       __ret_warn_on;                                          \
+})
+
+#define MSECS_TO_USECS(s) ((s) * 1000)
+
+/* Millisecond sleep */
+static inline void msleep(unsigned int msecs)
+{
+       usleep(MSECS_TO_USECS(msecs));
+}
+
+struct kvm;
+void *mmap_hugetlbfs(struct kvm *kvm, const char *htlbfs_path, u64 size);
+void *mmap_anon_or_hugetlbfs(struct kvm *kvm, const char *hugetlbfs_path, u64 size);
+
+#endif /* KVM__UTIL_H */
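WARN_ON() evaluates to the condition it tested, so it can gate a bail-out kernel-style; a small usage sketch with a hypothetical function:

static int setup_ram(u64 size)
{
	if (WARN_ON(size == 0))
		return -EINVAL;		/* pr_warning() already fired */

	BUG_ON(size & 0xfff);		/* unaligned size: report and SIGABRT */
	return 0;
}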
diff --git a/tools/kvm/include/kvm/vesa.h b/tools/kvm/include/kvm/vesa.h
new file mode 100644 (file)
index 0000000..ac041d9
--- /dev/null
@@ -0,0 +1,18 @@
+#ifndef KVM__VESA_H
+#define KVM__VESA_H
+
+#include <linux/types.h>
+
+#define VESA_WIDTH     640
+#define VESA_HEIGHT    480
+
+#define VESA_MEM_ADDR  0xd0000000
+#define VESA_MEM_SIZE  (4*VESA_WIDTH*VESA_HEIGHT)
+#define VESA_BPP       32
+
+struct kvm;
+struct biosregs;
+
+struct framebuffer *vesa__init(struct kvm *self);
+
+#endif
diff --git a/tools/kvm/include/kvm/virtio-9p.h b/tools/kvm/include/kvm/virtio-9p.h
new file mode 100644 (file)
index 0000000..cb590d1
--- /dev/null
@@ -0,0 +1,73 @@
+#ifndef KVM__VIRTIO_9P_H
+#define KVM__VIRTIO_9P_H
+#include "kvm/virtio.h"
+#include "kvm/pci.h"
+#include "kvm/threadpool.h"
+
+#include <sys/types.h>
+#include <dirent.h>
+#include <linux/list.h>
+#include <linux/rbtree.h>
+
+#define NUM_VIRT_QUEUES                1
+#define VIRTQUEUE_NUM          128
+#define        VIRTIO_9P_DEFAULT_TAG   "kvm_9p"
+#define VIRTIO_9P_HDR_LEN      (sizeof(u32)+sizeof(u8)+sizeof(u16))
+#define VIRTIO_9P_VERSION_DOTL "9P2000.L"
+#define MAX_TAG_LEN            32
+
+struct p9_msg {
+       u32                     size;
+       u8                      cmd;
+       u16                     tag;
+       u8                      msg[0];
+} __attribute__((packed));
+
+struct p9_fid {
+       u32                     fid;
+       u32                     uid;
+       char                    abs_path[PATH_MAX];
+       char                    *path;
+       DIR                     *dir;
+       int                     fd;
+       struct rb_node          node;
+};
+
+struct p9_dev_job {
+       struct virt_queue       *vq;
+       struct p9_dev           *p9dev;
+       struct thread_pool__job job_id;
+};
+
+struct p9_dev {
+       struct list_head        list;
+       struct virtio_device    vdev;
+       struct rb_root          fids;
+
+       struct virtio_9p_config *config;
+       u32                     features;
+
+       /* virtio queue */
+       struct virt_queue       vqs[NUM_VIRT_QUEUES];
+       struct p9_dev_job       jobs[NUM_VIRT_QUEUES];
+       char                    root_dir[PATH_MAX];
+};
+
+struct p9_pdu {
+       u32                     queue_head;
+       size_t                  read_offset;
+       size_t                  write_offset;
+       u16                     out_iov_cnt;
+       u16                     in_iov_cnt;
+       struct iovec            in_iov[VIRTQUEUE_NUM];
+       struct iovec            out_iov[VIRTQUEUE_NUM];
+};
+
+struct kvm;
+
+int virtio_9p__register(struct kvm *kvm, const char *root, const char *tag_name);
+int virtio_9p__init(struct kvm *kvm);
+int virtio_p9_pdu_readf(struct p9_pdu *pdu, const char *fmt, ...);
+int virtio_p9_pdu_writef(struct p9_pdu *pdu, const char *fmt, ...);
+
+#endif
diff --git a/tools/kvm/include/kvm/virtio-balloon.h b/tools/kvm/include/kvm/virtio-balloon.h
new file mode 100644 (file)
index 0000000..eb49fd4
--- /dev/null
@@ -0,0 +1,8 @@
+#ifndef KVM__BLN_VIRTIO_H
+#define KVM__BLN_VIRTIO_H
+
+struct kvm;
+
+void virtio_bln__init(struct kvm *kvm);
+
+#endif /* KVM__BLN_VIRTIO_H */
diff --git a/tools/kvm/include/kvm/virtio-blk.h b/tools/kvm/include/kvm/virtio-blk.h
new file mode 100644 (file)
index 0000000..12e59b6
--- /dev/null
@@ -0,0 +1,12 @@
+#ifndef KVM__BLK_VIRTIO_H
+#define KVM__BLK_VIRTIO_H
+
+#include "kvm/disk-image.h"
+
+struct kvm;
+
+int virtio_blk__init(struct kvm *kvm);
+int virtio_blk__exit(struct kvm *kvm);
+void virtio_blk_complete(void *param, long len);
+
+#endif /* KVM__BLK_VIRTIO_H */
diff --git a/tools/kvm/include/kvm/virtio-console.h b/tools/kvm/include/kvm/virtio-console.h
new file mode 100644 (file)
index 0000000..50d8653
--- /dev/null
@@ -0,0 +1,9 @@
+#ifndef KVM__CONSOLE_VIRTIO_H
+#define KVM__CONSOLE_VIRTIO_H
+
+struct kvm;
+
+void virtio_console__init(struct kvm *kvm);
+void virtio_console__inject_interrupt(struct kvm *kvm);
+
+#endif /* KVM__CONSOLE_VIRTIO_H */
diff --git a/tools/kvm/include/kvm/virtio-mmio.h b/tools/kvm/include/kvm/virtio-mmio.h
new file mode 100644 (file)
index 0000000..e0ede3c
--- /dev/null
@@ -0,0 +1,58 @@
+#ifndef KVM__VIRTIO_MMIO_H
+#define KVM__VIRTIO_MMIO_H
+
+#include <linux/types.h>
+#include <linux/virtio_mmio.h>
+
+#define VIRTIO_MMIO_MAX_VQ     3
+#define VIRTIO_MMIO_MAX_CONFIG 1
+#define VIRTIO_MMIO_IO_SIZE    0x200
+
+struct kvm;
+
+struct virtio_mmio_ioevent_param {
+       struct virtio_device    *vdev;
+       u32                     vq;
+};
+
+struct virtio_mmio_hdr {
+       char    magic[4];
+       u32     version;
+       u32     device_id;
+       u32     vendor_id;
+       u32     host_features;
+       u32     host_features_sel;
+       u32     reserved_1[2];
+       u32     guest_features;
+       u32     guest_features_sel;
+       u32     guest_page_size;
+       u32     reserved_2;
+       u32     queue_sel;
+       u32     queue_num_max;
+       u32     queue_num;
+       u32     queue_align;
+       u32     queue_pfn;
+       u32     reserved_3[3];
+       u32     queue_notify;
+       u32     reserved_4[3];
+       u32     interrupt_state;
+       u32     interrupt_ack;
+       u32     reserved_5[2];
+       u32     status;
+} __attribute__((packed));
+
+struct virtio_mmio {
+       u32                     addr;
+       void                    *dev;
+       struct kvm              *kvm;
+       u8                      irq;
+       struct virtio_mmio_hdr  hdr;
+       struct virtio_mmio_ioevent_param ioeventfds[VIRTIO_MMIO_MAX_VQ];
+};
+
+int virtio_mmio_signal_vq(struct kvm *kvm, struct virtio_device *vdev, u32 vq);
+int virtio_mmio_signal_config(struct kvm *kvm, struct virtio_device *vdev);
+int virtio_mmio_exit(struct kvm *kvm, struct virtio_device *vdev);
+int virtio_mmio_init(struct kvm *kvm, void *dev, struct virtio_device *vdev,
+                     int device_id, int subsys_id, int class);
+#endif
diff --git a/tools/kvm/include/kvm/virtio-net.h b/tools/kvm/include/kvm/virtio-net.h
new file mode 100644 (file)
index 0000000..737676e
--- /dev/null
@@ -0,0 +1,26 @@
+#ifndef KVM__VIRTIO_NET_H
+#define KVM__VIRTIO_NET_H
+
+struct kvm;
+
+struct virtio_net_params {
+       const char *guest_ip;
+       const char *host_ip;
+       const char *script;
+       const char *trans;
+       char guest_mac[6];
+       char host_mac[6];
+       struct kvm *kvm;
+       int mode;
+       int vhost;
+       int fd;
+};
+
+void virtio_net__init(const struct virtio_net_params *params);
+
+enum {
+       NET_MODE_USER,
+       NET_MODE_TAP
+};
+
+#endif /* KVM__VIRTIO_NET_H */
diff --git a/tools/kvm/include/kvm/virtio-pci-dev.h b/tools/kvm/include/kvm/virtio-pci-dev.h
new file mode 100644 (file)
index 0000000..48ae018
--- /dev/null
@@ -0,0 +1,38 @@
+#ifndef VIRTIO_PCI_DEV_H_
+#define VIRTIO_PCI_DEV_H_
+
+#include <linux/virtio_ids.h>
+
+/*
+ * Virtio PCI device constants and resources
+ * they do use (such as irqs and pins).
+ */
+
+#define PCI_DEVICE_ID_VIRTIO_NET               0x1000
+#define PCI_DEVICE_ID_VIRTIO_BLK               0x1001
+#define PCI_DEVICE_ID_VIRTIO_CONSOLE           0x1003
+#define PCI_DEVICE_ID_VIRTIO_RNG               0x1004
+#define PCI_DEVICE_ID_VIRTIO_BLN               0x1005
+#define PCI_DEVICE_ID_VIRTIO_SCSI              0x1008
+#define PCI_DEVICE_ID_VIRTIO_9P                        0x1009
+#define PCI_DEVICE_ID_VESA                     0x2000
+#define PCI_DEVICE_ID_PCI_SHMEM                        0x0001
+
+#define PCI_VENDOR_ID_REDHAT_QUMRANET          0x1af4
+#define PCI_VENDOR_ID_PCI_SHMEM                        0x0001
+#define PCI_SUBSYSTEM_VENDOR_ID_REDHAT_QUMRANET        0x1af4
+
+#define PCI_SUBSYSTEM_ID_VESA                  0x0004
+#define PCI_SUBSYSTEM_ID_PCI_SHMEM             0x0001
+
+#define PCI_CLASS_BLK                          0x018000
+#define PCI_CLASS_NET                          0x020000
+#define PCI_CLASS_CONSOLE                      0x078000
+/*
+ * Class 0xff: the device does not fit into any defined class
+ */
+#define PCI_CLASS_RNG                          0xff0000
+#define PCI_CLASS_BLN                          0xff0000
+#define PCI_CLASS_9P                           0xff0000
+
+#endif /* VIRTIO_PCI_DEV_H_ */
diff --git a/tools/kvm/include/kvm/virtio-pci.h b/tools/kvm/include/kvm/virtio-pci.h
new file mode 100644 (file)
index 0000000..44130e0
--- /dev/null
@@ -0,0 +1,49 @@
+#ifndef KVM__VIRTIO_PCI_H
+#define KVM__VIRTIO_PCI_H
+
+#include "kvm/pci.h"
+
+#include <linux/types.h>
+
+#define VIRTIO_PCI_MAX_VQ      3
+#define VIRTIO_PCI_MAX_CONFIG  1
+
+struct kvm;
+
+struct virtio_pci_ioevent_param {
+       struct virtio_device    *vdev;
+       u32                     vq;
+};
+
+#define VIRTIO_PCI_F_SIGNAL_MSI (1 << 0)
+
+struct virtio_pci {
+       struct pci_device_header pci_hdr;
+       void                    *dev;
+
+       u16                     base_addr;
+       u8                      status;
+       u8                      isr;
+       u32                     features;
+
+       /* MSI-X */
+       u16                     config_vector;
+       u32                     config_gsi;
+       u32                     vq_vector[VIRTIO_PCI_MAX_VQ];
+       u32                     gsis[VIRTIO_PCI_MAX_VQ];
+       u32                     msix_io_block;
+       u64                     msix_pba;
+       struct msix_table       msix_table[VIRTIO_PCI_MAX_VQ + VIRTIO_PCI_MAX_CONFIG];
+
+       /* virtio queue */
+       u16                     queue_selector;
+       struct virtio_pci_ioevent_param ioeventfds[VIRTIO_PCI_MAX_VQ];
+};
+
+int virtio_pci__signal_vq(struct kvm *kvm, struct virtio_device *vdev, u32 vq);
+int virtio_pci__signal_config(struct kvm *kvm, struct virtio_device *vdev);
+int virtio_pci__exit(struct kvm *kvm, struct virtio_device *vdev);
+int virtio_pci__init(struct kvm *kvm, void *dev, struct virtio_device *vdev,
+                    int device_id, int subsys_id, int class);
+
+#endif
diff --git a/tools/kvm/include/kvm/virtio-rng.h b/tools/kvm/include/kvm/virtio-rng.h
new file mode 100644 (file)
index 0000000..b585b37
--- /dev/null
@@ -0,0 +1,9 @@
+#ifndef KVM__RNG_VIRTIO_H
+#define KVM__RNG_VIRTIO_H
+
+struct kvm;
+
+int virtio_rng__init(struct kvm *kvm);
+int virtio_rng__exit(struct kvm *kvm);
+
+#endif /* KVM__RNG_VIRTIO_H */
diff --git a/tools/kvm/include/kvm/virtio-scsi.h b/tools/kvm/include/kvm/virtio-scsi.h
new file mode 100644 (file)
index 0000000..a780d7e
--- /dev/null
@@ -0,0 +1,26 @@
+#ifndef KVM__SCSI_VIRTIO_H
+#define KVM__SCSI_VIRTIO_H
+
+#include "kvm/disk-image.h"
+
+struct kvm;
+
+int virtio_scsi_init(struct kvm *kvm);
+int virtio_scsi_exit(struct kvm *kvm);
+
+/*----------------------------------------------------*/
+/* TODO: Remove this when tcm_vhost goes upstream */
+#define TRANSPORT_IQN_LEN              224
+#define VHOST_SCSI_ABI_VERSION         0
+struct vhost_scsi_target {
+       int abi_version;
+       unsigned char vhost_wwpn[TRANSPORT_IQN_LEN];
+       unsigned short vhost_tpgt;
+};
+/* VHOST_SCSI specific defines */
+#define VHOST_SCSI_SET_ENDPOINT _IOW(VHOST_VIRTIO, 0x40, struct vhost_scsi_target)
+#define VHOST_SCSI_CLEAR_ENDPOINT _IOW(VHOST_VIRTIO, 0x41, struct vhost_scsi_target)
+#define VHOST_SCSI_GET_ABI_VERSION _IOW(VHOST_VIRTIO, 0x42, struct vhost_scsi_target)
+/*----------------------------------------------------*/
+
+#endif /* KVM__SCSI_VIRTIO_H */
diff --git a/tools/kvm/include/kvm/virtio.h b/tools/kvm/include/kvm/virtio.h
new file mode 100644 (file)
index 0000000..5dc2544
--- /dev/null
@@ -0,0 +1,102 @@
+#ifndef KVM__VIRTIO_H
+#define KVM__VIRTIO_H
+
+#include <linux/virtio_ring.h>
+#include <linux/virtio_pci.h>
+
+#include <linux/types.h>
+#include <sys/uio.h>
+
+#include "kvm/kvm.h"
+
+#define VIRTIO_IRQ_LOW         0
+#define VIRTIO_IRQ_HIGH                1
+
+#define VIRTIO_PCI_O_CONFIG    0
+#define VIRTIO_PCI_O_MSIX      1
+
+struct virt_queue {
+       struct vring    vring;
+       u32             pfn;
+       /* last_avail_idx indexes the ->ring array of struct vring_avail;
+          it marks where we expect the next available request index.  */
+       u16             last_avail_idx;
+       u16             last_used_signalled;
+};
+
+static inline u16 virt_queue__pop(struct virt_queue *queue)
+{
+       return queue->vring.avail->ring[queue->last_avail_idx++ % queue->vring.num];
+}
+
+static inline struct vring_desc *virt_queue__get_desc(struct virt_queue *queue, u16 desc_ndx)
+{
+       return &queue->vring.desc[desc_ndx];
+}
+
+static inline bool virt_queue__available(struct virt_queue *vq)
+{
+       if (!vq->vring.avail)
+               return 0;
+
+       vring_avail_event(&vq->vring) = vq->last_avail_idx;
+       return vq->vring.avail->idx !=  vq->last_avail_idx;
+}
+
+/*
+ * Warning: on 32-bit hosts, shifting the pfn left can truncate guest addresses
+ * above 4GB. The result then points into the wrong area of guest memory and
+ * breaks the virt queue which owns this pfn.
+ */
+static inline void *guest_pfn_to_host(struct kvm *kvm, u32 pfn)
+{
+       return guest_flat_to_host(kvm, (unsigned long)pfn << VIRTIO_PCI_QUEUE_ADDR_SHIFT);
+}
+
+
+struct vring_used_elem *virt_queue__set_used_elem(struct virt_queue *queue, u32 head, u32 len);
+
+bool virtio_queue__should_signal(struct virt_queue *vq);
+u16 virt_queue__get_iov(struct virt_queue *vq, struct iovec iov[],
+                       u16 *out, u16 *in, struct kvm *kvm);
+u16 virt_queue__get_head_iov(struct virt_queue *vq, struct iovec iov[],
+                            u16 *out, u16 *in, u16 head, struct kvm *kvm);
+u16 virt_queue__get_inout_iov(struct kvm *kvm, struct virt_queue *queue,
+                             struct iovec in_iov[], struct iovec out_iov[],
+                             u16 *in, u16 *out);
+int virtio__get_dev_specific_field(int offset, bool msix, u32 *config_off);
+
+enum virtio_trans {
+       VIRTIO_PCI,
+       VIRTIO_MMIO,
+};
+
+struct virtio_device {
+       bool                    use_vhost;
+       void                    *virtio;
+       struct virtio_ops       *ops;
+};
+
+struct virtio_ops {
+       u8 *(*get_config)(struct kvm *kvm, void *dev);
+       u32 (*get_host_features)(struct kvm *kvm, void *dev);
+       void (*set_guest_features)(struct kvm *kvm, void *dev, u32 features);
+       int (*init_vq)(struct kvm *kvm, void *dev, u32 vq, u32 pfn);
+       int (*notify_vq)(struct kvm *kvm, void *dev, u32 vq);
+       int (*get_pfn_vq)(struct kvm *kvm, void *dev, u32 vq);
+       int (*get_size_vq)(struct kvm *kvm, void *dev, u32 vq);
+       int (*set_size_vq)(struct kvm *kvm, void *dev, u32 vq, int size);
+       void (*notify_vq_gsi)(struct kvm *kvm, void *dev, u32 vq, u32 gsi);
+       void (*notify_vq_eventfd)(struct kvm *kvm, void *dev, u32 vq, u32 efd);
+       int (*signal_vq)(struct kvm *kvm, struct virtio_device *vdev, u32 queueid);
+       int (*signal_config)(struct kvm *kvm, struct virtio_device *vdev);
+       int (*init)(struct kvm *kvm, void *dev, struct virtio_device *vdev,
+                   int device_id, int subsys_id, int class);
+       int (*exit)(struct kvm *kvm, struct virtio_device *vdev);
+};
+
+int virtio_init(struct kvm *kvm, void *dev, struct virtio_device *vdev,
+               struct virtio_ops *ops, enum virtio_trans trans,
+               int device_id, int subsys_id, int class);
+int virtio_compat_add_message(const char *device, const char *config);
+#endif /* KVM__VIRTIO_H */
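Taken together, these helpers imply a standard service loop on the device side; a sketch, with the iov sizing and the zero used-length purely illustrative:

static void service_vq_sketch(struct kvm *kvm, struct virt_queue *vq)
{
	struct iovec iov[128];	/* sized for the ring; illustrative */
	u16 out, in, head;

	while (virt_queue__available(vq)) {
		head = virt_queue__get_iov(vq, iov, &out, &in, kvm);
		/* ... perform the I/O described by iov[0 .. out+in) ... */
		virt_queue__set_used_elem(vq, head, 0 /* bytes written */);
	}

	/* then, if virtio_queue__should_signal(vq) says so, inject the
	   guest interrupt via the transport's signal_vq() */
}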
diff --git a/tools/kvm/include/kvm/vnc.h b/tools/kvm/include/kvm/vnc.h
new file mode 100644 (file)
index 0000000..3278c07
--- /dev/null
@@ -0,0 +1,20 @@
+#ifndef KVM__VNC_H
+#define KVM__VNC_H
+
+struct framebuffer;
+
+#ifdef CONFIG_HAS_VNCSERVER
+int vnc__init(struct framebuffer *fb);
+int vnc__exit(struct framebuffer *fb);
+#else
+static inline int vnc__init(struct framebuffer *fb)
+{
+       return 0;
+}
+static inline int vnc__exit(struct framebuffer *fb)
+{
+       return 0;
+}
+#endif
+
+#endif /* KVM__VNC_H */
diff --git a/tools/kvm/include/linux/bitops.h b/tools/kvm/include/linux/bitops.h
new file mode 100644 (file)
index 0000000..56448b7
--- /dev/null
@@ -0,0 +1,33 @@
+#ifndef _KVM_LINUX_BITOPS_H_
+#define _KVM_LINUX_BITOPS_H_
+
+#include <linux/kernel.h>
+#include <linux/compiler.h>
+#include <asm/hweight.h>
+
+#define BITS_PER_LONG __WORDSIZE
+#define BITS_PER_BYTE           8
+#define BITS_TO_LONGS(nr)       DIV_ROUND_UP(nr, BITS_PER_BYTE * sizeof(long))
+
+static inline void set_bit(int nr, unsigned long *addr)
+{
+       addr[nr / BITS_PER_LONG] |= 1UL << (nr % BITS_PER_LONG);
+}
+
+static inline void clear_bit(int nr, unsigned long *addr)
+{
+       addr[nr / BITS_PER_LONG] &= ~(1UL << (nr % BITS_PER_LONG));
+}
+
+static __always_inline int test_bit(unsigned int nr, const unsigned long *addr)
+{
+       return ((1UL << (nr % BITS_PER_LONG)) &
+               (((unsigned long *)addr)[nr / BITS_PER_LONG])) != 0;
+}
+
+static inline unsigned long hweight_long(unsigned long w)
+{
+       return sizeof(w) == 4 ? hweight32(w) : hweight64(w);
+}
+
+#endif
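Usage sketch: a fixed-size bitmap dimensioned with BITS_TO_LONGS() and driven by the helpers above.

unsigned long map[BITS_TO_LONGS(256)] = { 0 };

set_bit(42, map);
if (test_bit(42, map))
	clear_bit(42, map);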
diff --git a/tools/kvm/include/linux/byteorder.h b/tools/kvm/include/linux/byteorder.h
new file mode 100644 (file)
index 0000000..c490de8
--- /dev/null
@@ -0,0 +1,7 @@
+#ifndef __BYTE_ORDER_H__
+#define __BYTE_ORDER_H__
+
+#include <asm/byteorder.h>
+#include <linux/byteorder/generic.h>
+
+#endif
diff --git a/tools/kvm/include/linux/compiler.h b/tools/kvm/include/linux/compiler.h
new file mode 100644 (file)
index 0000000..898420b
--- /dev/null
@@ -0,0 +1,20 @@
+#ifndef _PERF_LINUX_COMPILER_H_
+#define _PERF_LINUX_COMPILER_H_
+
+#ifndef __always_inline
+#define __always_inline        inline
+#endif
+#define __user
+
+#ifndef __attribute_const__
+#define __attribute_const__
+#endif
+
+#define __used         __attribute__((__unused__))
+#define __packed       __attribute__((packed))
+#define __iomem
+#define __force
+#define __must_check
+#define unlikely       /* expands to nothing, so unlikely(x) degrades to plain (x) */
+
+#endif
diff --git a/tools/kvm/include/linux/kernel.h b/tools/kvm/include/linux/kernel.h
new file mode 100644 (file)
index 0000000..d2ec4a3
--- /dev/null
@@ -0,0 +1,39 @@
+
+#ifndef KVM__LINUX_KERNEL_H_
+#define KVM__LINUX_KERNEL_H_
+
+#define DIV_ROUND_UP(n,d) (((n) + (d) - 1) / (d))
+
+#define ALIGN(x,a)             __ALIGN_MASK(x,(typeof(x))(a)-1)
+#define __ALIGN_MASK(x,mask)   (((x)+(mask))&~(mask))
+
+#ifndef offsetof
+#define offsetof(TYPE, MEMBER) ((size_t) &((TYPE *)0)->MEMBER)
+#endif
+
+#ifndef container_of
+/**
+ * container_of - cast a member of a structure out to the containing structure
+ * @ptr:       the pointer to the member.
+ * @type:      the type of the container struct this is embedded in.
+ * @member:    the name of the member within the struct.
+ *
+ */
+#define container_of(ptr, type, member) ({                     \
+       const typeof(((type *)0)->member) * __mptr = (ptr);     \
+       (type *)((char *)__mptr - offsetof(type, member)); })
+#endif
+
+#define min(x, y) ({                           \
+       typeof(x) _min1 = (x);                  \
+       typeof(y) _min2 = (y);                  \
+       (void) (&_min1 == &_min2);              \
+       _min1 < _min2 ? _min1 : _min2; })
+
+#define max(x, y) ({                           \
+       typeof(x) _max1 = (x);                  \
+       typeof(y) _max2 = (y);                  \
+       (void) (&_max1 == &_max2);              \
+       _max1 > _max2 ? _max1 : _max2; })
+
+#endif
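Usage sketch for container_of(): recover the enclosing struct from a pointer to an embedded member, the pattern behind every list_head user in this tree. The struct here is hypothetical.

struct disk {
	int id;
	struct list_head list;
};

static struct disk *disk_from_list(struct list_head *pos)
{
	return container_of(pos, struct disk, list);
}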
diff --git a/tools/kvm/include/linux/module.h b/tools/kvm/include/linux/module.h
new file mode 100644 (file)
index 0000000..0e4c6a3
--- /dev/null
@@ -0,0 +1,6 @@
+#ifndef KVM__LINUX_MODULE_H
+#define KVM__LINUX_MODULE_H
+
+#define EXPORT_SYMBOL(name)
+
+#endif
diff --git a/tools/kvm/include/linux/prefetch.h b/tools/kvm/include/linux/prefetch.h
new file mode 100644 (file)
index 0000000..62f6788
--- /dev/null
@@ -0,0 +1,6 @@
+#ifndef KVM__LINUX_PREFETCH_H
+#define KVM__LINUX_PREFETCH_H
+
+static inline void prefetch(void *a __attribute__((unused))) { }
+
+#endif
diff --git a/tools/kvm/include/linux/stddef.h b/tools/kvm/include/linux/stddef.h
new file mode 100644 (file)
index 0000000..60ea512
--- /dev/null
@@ -0,0 +1,16 @@
+#ifndef _LINUX_STDDEF_H
+#define _LINUX_STDDEF_H
+
+#include <linux/compiler.h>
+
+#undef NULL
+#define NULL ((void *)0)
+
+#undef offsetof
+#ifdef __compiler_offsetof
+#define offsetof(TYPE,MEMBER) __compiler_offsetof(TYPE,MEMBER)
+#else
+#define offsetof(TYPE, MEMBER) ((size_t) &((TYPE *)0)->MEMBER)
+#endif
+
+#endif
diff --git a/tools/kvm/include/linux/types.h b/tools/kvm/include/linux/types.h
new file mode 100644 (file)
index 0000000..5e20f10
--- /dev/null
@@ -0,0 +1,51 @@
+#ifndef LINUX_TYPES_H
+#define LINUX_TYPES_H
+
+#include <kvm/compiler.h>
+#define __SANE_USERSPACE_TYPES__       /* For PPC64, to get LL64 types */
+#include <asm/types.h>
+
+typedef __u64 u64;
+typedef __s64 s64;
+
+typedef __u32 u32;
+typedef __s32 s32;
+
+typedef __u16 u16;
+typedef __s16 s16;
+
+typedef __u8  u8;
+typedef __s8  s8;
+
+#ifdef __CHECKER__
+#define __bitwise__ __attribute__((bitwise))
+#else
+#define __bitwise__
+#endif
+#ifdef __CHECK_ENDIAN__
+#define __bitwise __bitwise__
+#else
+#define __bitwise
+#endif
+
+
+typedef __u16 __bitwise __le16;
+typedef __u16 __bitwise __be16;
+typedef __u32 __bitwise __le32;
+typedef __u32 __bitwise __be32;
+typedef __u64 __bitwise __le64;
+typedef __u64 __bitwise __be64;
+
+struct list_head {
+       struct list_head *next, *prev;
+};
+
+struct hlist_head {
+       struct hlist_node *first;
+};
+
+struct hlist_node {
+       struct hlist_node *next, **pprev;
+};
+
+#endif /* LINUX_TYPES_H */
diff --git a/tools/kvm/ioeventfd.c b/tools/kvm/ioeventfd.c
new file mode 100644 (file)
index 0000000..742b008
--- /dev/null
@@ -0,0 +1,214 @@
+#include <sys/epoll.h>
+#include <sys/ioctl.h>
+#include <pthread.h>
+#include <unistd.h>
+#include <stdio.h>
+#include <signal.h>
+
+#include <linux/kernel.h>
+#include <linux/kvm.h>
+#include <linux/types.h>
+
+#include "kvm/ioeventfd.h"
+#include "kvm/kvm.h"
+#include "kvm/util.h"
+
+#define IOEVENTFD_MAX_EVENTS   20
+
+static struct  epoll_event events[IOEVENTFD_MAX_EVENTS];
+static int     epoll_fd, epoll_stop_fd;
+static LIST_HEAD(used_ioevents);
+static bool    ioeventfd_avail;
+
+static void *ioeventfd__thread(void *param)
+{
+       u64 tmp = 1;
+
+       for (;;) {
+               int nfds, i;
+
+               nfds = epoll_wait(epoll_fd, events, IOEVENTFD_MAX_EVENTS, -1);
+               for (i = 0; i < nfds; i++) {
+                       struct ioevent *ioevent;
+
+                       if (events[i].data.fd == epoll_stop_fd)
+                               goto done;
+
+                       ioevent = events[i].data.ptr;
+
+                       if (read(ioevent->fd, &tmp, sizeof(tmp)) < 0)
+                               die("Failed reading event");
+
+                       ioevent->fn(ioevent->fn_kvm, ioevent->fn_ptr);
+               }
+       }
+
+done:
+       tmp = write(epoll_stop_fd, &tmp, sizeof(tmp));
+
+       return NULL;
+}
+
+static int ioeventfd__start(void)
+{
+       pthread_t thread;
+
+       if (!ioeventfd_avail)
+               return -ENOSYS;
+
+       return pthread_create(&thread, NULL, ioeventfd__thread, NULL);
+}
+
+int ioeventfd__init(struct kvm *kvm)
+{
+       struct epoll_event epoll_event = {.events = EPOLLIN};
+       int r;
+
+       ioeventfd_avail = kvm__supports_extension(kvm, KVM_CAP_IOEVENTFD);
+       if (!ioeventfd_avail)
+               return 1; /* Not fatal; the caller decides whether to proceed. */
+
+       epoll_fd = epoll_create(IOEVENTFD_MAX_EVENTS);
+       if (epoll_fd < 0)
+               return -errno;
+
+       epoll_stop_fd = eventfd(0, 0);
+       epoll_event.data.fd = epoll_stop_fd;
+
+       r = epoll_ctl(epoll_fd, EPOLL_CTL_ADD, epoll_stop_fd, &epoll_event);
+       if (r < 0)
+               goto cleanup;
+
+       r = ioeventfd__start();
+       if (r < 0)
+               goto cleanup;
+
+       r = 0;
+
+       return r;
+
+cleanup:
+       close(epoll_stop_fd);
+       close(epoll_fd);
+
+       return r;
+}
+
+int ioeventfd__exit(struct kvm *kvm)
+{
+       u64 tmp = 1;
+       int r;
+
+       if (!ioeventfd_avail)
+               return 0;
+
+       r = write(epoll_stop_fd, &tmp, sizeof(tmp));
+       if (r < 0)
+               return r;
+
+       r = read(epoll_stop_fd, &tmp, sizeof(tmp));
+       if (r < 0)
+               return r;
+
+       close(epoll_fd);
+       close(epoll_stop_fd);
+
+       return 0;
+}
+
+int ioeventfd__add_event(struct ioevent *ioevent, bool is_pio, bool poll_in_userspace)
+{
+       struct kvm_ioeventfd kvm_ioevent;
+       struct epoll_event epoll_event;
+       struct ioevent *new_ioevent;
+       int event, r;
+
+       if (!ioeventfd_avail)
+               return -ENOSYS;
+
+       new_ioevent = malloc(sizeof(*new_ioevent));
+       if (new_ioevent == NULL)
+               return -ENOMEM;
+
+       *new_ioevent = *ioevent;
+       event = new_ioevent->fd;
+
+       kvm_ioevent = (struct kvm_ioeventfd) {
+               .addr           = ioevent->io_addr,
+               .len            = ioevent->io_len,
+               .datamatch      = ioevent->datamatch,
+               .fd             = event,
+               .flags          = KVM_IOEVENTFD_FLAG_DATAMATCH,
+       };
+
+       if (is_pio)
+               kvm_ioevent.flags |= KVM_IOEVENTFD_FLAG_PIO;
+
+       r = ioctl(ioevent->fn_kvm->vm_fd, KVM_IOEVENTFD, &kvm_ioevent);
+       if (r) {
+               r = -errno;
+               goto cleanup;
+       }
+
+       if (!poll_in_userspace)
+               return 0;
+
+       epoll_event = (struct epoll_event) {
+               .events         = EPOLLIN,
+               .data.ptr       = new_ioevent,
+       };
+
+       r = epoll_ctl(epoll_fd, EPOLL_CTL_ADD, event, &epoll_event);
+       if (r) {
+               r = -errno;
+               goto cleanup;
+       }
+
+       list_add_tail(&new_ioevent->list, &used_ioevents);
+
+       return 0;
+
+cleanup:
+       free(new_ioevent);
+       return r;
+}
+
+int ioeventfd__del_event(u64 addr, u64 datamatch)
+{
+       struct kvm_ioeventfd kvm_ioevent;
+       struct ioevent *ioevent;
+       u8 found = 0;
+
+       if (!ioeventfd_avail)
+               return -ENOSYS;
+
+       list_for_each_entry(ioevent, &used_ioevents, list) {
+               if (ioevent->io_addr == addr) {
+                       found = 1;
+                       break;
+               }
+       }
+
+       if (found == 0 || ioevent == NULL)
+               return -ENOENT;
+
+       kvm_ioevent = (struct kvm_ioeventfd) {
+               .addr                   = ioevent->io_addr,
+               .len                    = ioevent->io_len,
+               .datamatch              = ioevent->datamatch,
+               .flags                  = KVM_IOEVENTFD_FLAG_PIO
+                                       | KVM_IOEVENTFD_FLAG_DEASSIGN
+                                       | KVM_IOEVENTFD_FLAG_DATAMATCH,
+       };
+
+       ioctl(ioevent->fn_kvm->vm_fd, KVM_IOEVENTFD, &kvm_ioevent);
+
+       epoll_ctl(epoll_fd, EPOLL_CTL_DEL, ioevent->fd, NULL);
+
+       list_del(&ioevent->list);
+
+       close(ioevent->fd);
+       free(ioevent);
+
+       return 0;
+}
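
The ioeventfd machinery above lets a device avoid a full guest exit for doorbell writes: KVM signals the registered eventfd on a matching write and the device consumes it from its own thread (or, with poll_in_userspace, from the epoll loop set up in ioeventfd__init()). A minimal caller sketch, using only the struct ioevent fields visible above; the function name, port and datamatch values are illustrative:

	#include <sys/eventfd.h>

	static int register_doorbell(struct kvm *kvm)
	{
		struct ioevent ioevent = {
			.io_addr	= 0xc200,	/* illustrative PIO port */
			.io_len		= sizeof(u16),
			.datamatch	= 0,		/* e.g. virtqueue index 0 */
			.fn_kvm		= kvm,
			.fd		= eventfd(0, 0),
		};

		if (ioevent.fd < 0)
			return -errno;

		/* PIO event; the device reads ioevent.fd itself */
		return ioeventfd__add_event(&ioevent, true, false);
	}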
diff --git a/tools/kvm/ioport.c b/tools/kvm/ioport.c
new file mode 100644 (file)
index 0000000..662a78b
--- /dev/null
@@ -0,0 +1,195 @@
+#include "kvm/ioport.h"
+
+#include "kvm/kvm.h"
+#include "kvm/util.h"
+#include "kvm/brlock.h"
+#include "kvm/rbtree-interval.h"
+#include "kvm/mutex.h"
+
+#include <linux/kvm.h> /* for KVM_EXIT_* */
+#include <linux/types.h>
+
+#include <stdbool.h>
+#include <limits.h>
+#include <stdlib.h>
+#include <stdio.h>
+
+#define ioport_node(n) rb_entry(n, struct ioport, node)
+
+DEFINE_MUTEX(ioport_mutex);
+
+static u16                     free_io_port_idx; /* protected by ioport_mutex */
+
+static struct rb_root          ioport_tree = RB_ROOT;
+bool                           ioport_debug;
+
+static u16 ioport__find_free_port(void)
+{
+       u16 free_port;
+
+       mutex_lock(&ioport_mutex);
+       free_port = IOPORT_START + free_io_port_idx * IOPORT_SIZE;
+       free_io_port_idx++;
+       mutex_unlock(&ioport_mutex);
+
+       return free_port;
+}
+
+static struct ioport *ioport_search(struct rb_root *root, u64 addr)
+{
+       struct rb_int_node *node;
+
+       node = rb_int_search_single(root, addr);
+       if (node == NULL)
+               return NULL;
+
+       return ioport_node(node);
+}
+
+static int ioport_insert(struct rb_root *root, struct ioport *data)
+{
+       return rb_int_insert(root, &data->node);
+}
+
+static void ioport_remove(struct rb_root *root, struct ioport *data)
+{
+       rb_int_erase(root, &data->node);
+}
+
+int ioport__register(u16 port, struct ioport_operations *ops, int count, void *param)
+{
+       struct ioport *entry;
+       int r;
+
+       br_write_lock();
+       if (port == IOPORT_EMPTY)
+               port = ioport__find_free_port();
+
+       entry = ioport_search(&ioport_tree, port);
+       if (entry) {
+               pr_warning("ioport re-registered: %x", port);
+               rb_int_erase(&ioport_tree, &entry->node);
+       }
+
+       entry = malloc(sizeof(*entry));
+       if (entry == NULL) {
+               br_write_unlock();
+               return -ENOMEM;
+       }
+
+       *entry = (struct ioport) {
+               .node   = RB_INT_INIT(port, port + count),
+               .ops    = ops,
+               .priv   = param,
+       };
+
+       r = ioport_insert(&ioport_tree, entry);
+       if (r < 0) {
+               free(entry);
+               br_write_unlock();
+               return r;
+       }
+       br_write_unlock();
+
+       return port;
+}
+
+int ioport__unregister(u16 port)
+{
+       struct ioport *entry;
+       int r;
+
+       br_write_lock();
+
+       r = -ENOENT;
+       entry = ioport_search(&ioport_tree, port);
+       if (!entry)
+               goto done;
+
+       ioport_remove(&ioport_tree, entry);
+
+       free(entry);
+
+       r = 0;
+
+done:
+       br_write_unlock();
+
+       return r;
+}
+
+static void ioport__unregister_all(void)
+{
+       struct ioport *entry;
+       struct rb_node *rb;
+       struct rb_int_node *rb_node;
+
+       rb = rb_first(&ioport_tree);
+       while (rb) {
+               rb_node = rb_int(rb);
+               entry = ioport_node(rb_node);
+               ioport_remove(&ioport_tree, entry);
+               free(entry);
+               rb = rb_first(&ioport_tree);
+       }
+}
+
+static const char *to_direction(int direction)
+{
+       if (direction == KVM_EXIT_IO_IN)
+               return "IN";
+       else
+               return "OUT";
+}
+
+static void ioport_error(u16 port, void *data, int direction, int size, u32 count)
+{
+       fprintf(stderr, "IO error: %s port=%x, size=%d, count=%u\n", to_direction(direction), port, size, count);
+}
+
+bool kvm__emulate_io(struct kvm *kvm, u16 port, void *data, int direction, int size, u32 count)
+{
+       struct ioport_operations *ops;
+       bool ret = false;
+       struct ioport *entry;
+       void *ptr = data;
+
+       br_read_lock();
+       entry = ioport_search(&ioport_tree, port);
+       if (!entry)
+               goto error;
+
+       ops     = entry->ops;
+
+       while (count--) {
+               if (direction == KVM_EXIT_IO_IN && ops->io_in)
+                       ret = ops->io_in(entry, kvm, port, ptr, size);
+               else if (ops->io_out)
+                       ret = ops->io_out(entry, kvm, port, ptr, size);
+
+               ptr += size;
+       }
+
+       br_read_unlock();
+
+       if (!ret)
+               goto error;
+
+       return true;
+error:
+       br_read_unlock();
+
+       if (ioport_debug)
+               ioport_error(port, data, direction, size, count);
+
+       return !ioport_debug;
+}
+
+int ioport__init(struct kvm *kvm)
+{
+       return 0;
+}
+
+int ioport__exit(struct kvm *kvm)
+{
+       ioport__unregister_all();
+       return 0;
+}
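
For reference, a device hooks into this registry by pairing ioport__register() with an ioport_operations whose io_in/io_out callbacks kvm__emulate_io() invokes above; a minimal sketch (the dummy_* names and port count are illustrative):

	static bool dummy_io_in(struct ioport *ioport, struct kvm *kvm, u16 port,
				void *data, int size)
	{
		memset(data, 0xff, size);	/* read as a floating bus */
		return true;
	}

	static struct ioport_operations dummy_ops = {
		.io_in	= dummy_io_in,	/* no .io_out: guest writes take the error path */
	};

	static int dummy_device__init(struct kvm *kvm)
	{
		/* claim 8 consecutive ports at the next free slot */
		return ioport__register(IOPORT_EMPTY, &dummy_ops, 8, NULL);
	}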
diff --git a/tools/kvm/kvm-cmd.c b/tools/kvm/kvm-cmd.c
new file mode 100644 (file)
index 0000000..2520b08
--- /dev/null
@@ -0,0 +1,91 @@
+#include <stdio.h>
+#include <string.h>
+#include <errno.h>
+
+/* user defined header files */
+#include "kvm/builtin-debug.h"
+#include "kvm/builtin-pause.h"
+#include "kvm/builtin-resume.h"
+#include "kvm/builtin-balloon.h"
+#include "kvm/builtin-list.h"
+#include "kvm/builtin-version.h"
+#include "kvm/builtin-setup.h"
+#include "kvm/builtin-stop.h"
+#include "kvm/builtin-stat.h"
+#include "kvm/builtin-help.h"
+#include "kvm/builtin-sandbox.h"
+#include "kvm/kvm-cmd.h"
+#include "kvm/builtin-run.h"
+#include "kvm/util.h"
+
+struct cmd_struct kvm_commands[] = {
+       { "pause",      kvm_cmd_pause,          kvm_pause_help,         0 },
+       { "resume",     kvm_cmd_resume,         kvm_resume_help,        0 },
+       { "debug",      kvm_cmd_debug,          kvm_debug_help,         0 },
+       { "balloon",    kvm_cmd_balloon,        kvm_balloon_help,       0 },
+       { "list",       kvm_cmd_list,           kvm_list_help,          0 },
+       { "version",    kvm_cmd_version,        NULL,                   0 },
+       { "--version",  kvm_cmd_version,        NULL,                   0 },
+       { "stop",       kvm_cmd_stop,           kvm_stop_help,          0 },
+       { "stat",       kvm_cmd_stat,           kvm_stat_help,          0 },
+       { "help",       kvm_cmd_help,           NULL,                   0 },
+       { "setup",      kvm_cmd_setup,          kvm_setup_help,         0 },
+       { "run",        kvm_cmd_run,            kvm_run_help,           0 },
+       { "sandbox",    kvm_cmd_sandbox,        kvm_run_help,           0 },
+       { NULL,         NULL,                   NULL,                   0 },
+};
+
+/*
+ * kvm_get_command: Searches for the command in an array of commands and
+ * returns a pointer to the matching cmd_struct if one is found.
+ *
+ * Input parameters:
+ * command: Array of possible commands. The last entry in the array must be
+ *          NULL.
+ * cmd: The command string to search for in the array
+ *
+ * Return Value:
+ * NULL: If cmd does not match any command in the command array
+ * p: Pointer to the cmd_struct of the matching command
+ */
+struct cmd_struct *kvm_get_command(struct cmd_struct *command,
+               const char *cmd)
+{
+       struct cmd_struct *p = command;
+
+       while (p->cmd) {
+               if (!strcmp(p->cmd, cmd))
+                       return p;
+               p++;
+       }
+       return NULL;
+}
+
+int handle_command(struct cmd_struct *command, int argc, const char **argv)
+{
+       struct cmd_struct *p;
+       const char *prefix = NULL;
+       int ret = 0;
+
+       if (!argv || !*argv) {
+               p = kvm_get_command(command, "help");
+               BUG_ON(!p);
+               return p->fn(argc, argv, prefix);
+       }
+
+       p = kvm_get_command(command, argv[0]);
+       if (!p) {
+               p = kvm_get_command(command, "help");
+               BUG_ON(!p);
+               p->fn(0, NULL, prefix);
+               return EINVAL;
+       }
+
+       ret = p->fn(argc - 1, &argv[1], prefix);
+       if (ret < 0) {
+               if (errno == EPERM)
+                       die("Permission error - are you root?");
+       }
+
+       return ret;
+}
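
Resolving and invoking a builtin by hand mirrors what handle_command() does internally; the argv contents below are illustrative:

	const char *argv[] = { "version", NULL };
	struct cmd_struct *cmd;

	cmd = kvm_get_command(kvm_commands, argv[0]);
	if (cmd)
		cmd->fn(0, &argv[1], NULL);	/* i.e. argc - 1, &argv[1], prefix */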
diff --git a/tools/kvm/kvm-cpu.c b/tools/kvm/kvm-cpu.c
new file mode 100644 (file)
index 0000000..12791dd
--- /dev/null
@@ -0,0 +1,175 @@
+#include "kvm/kvm-cpu.h"
+
+#include "kvm/symbol.h"
+#include "kvm/util.h"
+#include "kvm/kvm.h"
+
+#include <sys/ioctl.h>
+#include <sys/mman.h>
+#include <signal.h>
+#include <stdlib.h>
+#include <string.h>
+#include <errno.h>
+#include <stdio.h>
+
+extern struct kvm_cpu **kvm_cpus;
+extern __thread struct kvm_cpu *current_kvm_cpu;
+
+void kvm_cpu__enable_singlestep(struct kvm_cpu *vcpu)
+{
+       struct kvm_guest_debug debug = {
+               .control        = KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_SINGLESTEP,
+       };
+
+       if (ioctl(vcpu->vcpu_fd, KVM_SET_GUEST_DEBUG, &debug) < 0)
+               pr_warning("KVM_SET_GUEST_DEBUG failed");
+}
+
+void kvm_cpu__run(struct kvm_cpu *vcpu)
+{
+       int err;
+
+       if (!vcpu->is_running)
+               return;
+
+       err = ioctl(vcpu->vcpu_fd, KVM_RUN, 0);
+       if (err < 0 && (errno != EINTR && errno != EAGAIN))
+               die_perror("KVM_RUN failed");
+}
+
+static void kvm_cpu_signal_handler(int signum)
+{
+       if (signum == SIGKVMEXIT) {
+               if (current_kvm_cpu && current_kvm_cpu->is_running) {
+                       current_kvm_cpu->is_running = false;
+                       kvm__continue();
+               }
+       } else if (signum == SIGKVMPAUSE) {
+               current_kvm_cpu->paused = 1;
+       }
+}
+
+static void kvm_cpu__handle_coalesced_mmio(struct kvm_cpu *cpu)
+{
+       if (cpu->ring) {
+               while (cpu->ring->first != cpu->ring->last) {
+                       struct kvm_coalesced_mmio *m;
+                       m = &cpu->ring->coalesced_mmio[cpu->ring->first];
+                       kvm_cpu__emulate_mmio(cpu->kvm,
+                                             m->phys_addr,
+                                             m->data,
+                                             m->len,
+                                             1);
+                       cpu->ring->first = (cpu->ring->first + 1) % KVM_COALESCED_MMIO_MAX;
+               }
+       }
+}
+
+void kvm_cpu__reboot(void)
+{
+       int i;
+
+       /* The kvm_cpus array contains a null pointer in the last location */
+       for (i = 0; ; i++) {
+               if (kvm_cpus[i])
+                       pthread_kill(kvm_cpus[i]->thread, SIGKVMEXIT);
+               else
+                       break;
+       }
+}
+
+int kvm_cpu__start(struct kvm_cpu *cpu)
+{
+       sigset_t sigset;
+
+       sigemptyset(&sigset);
+       sigaddset(&sigset, SIGALRM);
+
+       pthread_sigmask(SIG_BLOCK, &sigset, NULL);
+
+       signal(SIGKVMEXIT, kvm_cpu_signal_handler);
+       signal(SIGKVMPAUSE, kvm_cpu_signal_handler);
+
+       kvm_cpu__reset_vcpu(cpu);
+
+       if (cpu->kvm->single_step)
+               kvm_cpu__enable_singlestep(cpu);
+
+       while (cpu->is_running) {
+               if (cpu->paused) {
+                       kvm__notify_paused();
+                       cpu->paused = 0;
+               }
+
+               if (cpu->needs_nmi) {
+                       kvm_cpu__arch_nmi(cpu);
+                       cpu->needs_nmi = 0;
+               }
+
+               kvm_cpu__run(cpu);
+
+               switch (cpu->kvm_run->exit_reason) {
+               case KVM_EXIT_UNKNOWN:
+                       break;
+               case KVM_EXIT_DEBUG:
+                       kvm_cpu__show_registers(cpu);
+                       kvm_cpu__show_code(cpu);
+                       break;
+               case KVM_EXIT_IO: {
+                       bool ret;
+
+                       ret = kvm_cpu__emulate_io(cpu->kvm,
+                                                 cpu->kvm_run->io.port,
+                                                 (u8 *)cpu->kvm_run +
+                                                 cpu->kvm_run->io.data_offset,
+                                                 cpu->kvm_run->io.direction,
+                                                 cpu->kvm_run->io.size,
+                                                 cpu->kvm_run->io.count);
+
+                       if (!ret)
+                               goto panic_kvm;
+                       break;
+               }
+               case KVM_EXIT_MMIO: {
+                       bool ret;
+
+                       /*
+                        * If we had an MMIO exit, the coalesced ring should
+                        * be processed *before* processing the exit itself
+                        */
+                       kvm_cpu__handle_coalesced_mmio(cpu);
+
+                       ret = kvm_cpu__emulate_mmio(cpu->kvm,
+                                                   cpu->kvm_run->mmio.phys_addr,
+                                                   cpu->kvm_run->mmio.data,
+                                                   cpu->kvm_run->mmio.len,
+                                                   cpu->kvm_run->mmio.is_write);
+
+                       if (!ret)
+                               goto panic_kvm;
+                       break;
+               }
+               case KVM_EXIT_INTR:
+                       if (cpu->is_running)
+                               break;
+                       goto exit_kvm;
+               case KVM_EXIT_SHUTDOWN:
+                       goto exit_kvm;
+               default: {
+                       bool ret;
+
+                       ret = kvm_cpu__handle_exit(cpu);
+                       if (!ret)
+                               goto panic_kvm;
+                       break;
+               }
+               }
+               kvm_cpu__handle_coalesced_mmio(cpu);
+       }
+
+exit_kvm:
+       return 0;
+
+panic_kvm:
+       return 1;
+}
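
kvm_cpu__start() is written as the body of one thread per VCPU; a minimal launcher sketch (vcpu_thread is an illustrative name, and current_kvm_cpu must be set so the SIGKVMEXIT/SIGKVMPAUSE handler above can find its CPU):

	static void *vcpu_thread(void *arg)
	{
		current_kvm_cpu = arg;	/* thread-local; consumed by the signal handler */

		if (kvm_cpu__start(current_kvm_cpu))
			die("vcpu panicked");

		return NULL;
	}

	pthread_create(&kvm_cpus[i]->thread, NULL, vcpu_thread, kvm_cpus[i]);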
diff --git a/tools/kvm/kvm-ipc.c b/tools/kvm/kvm-ipc.c
new file mode 100644 (file)
index 0000000..70831b8
--- /dev/null
@@ -0,0 +1,231 @@
+#include "kvm/kvm-ipc.h"
+#include "kvm/rwsem.h"
+#include "kvm/read-write.h"
+#include "kvm/util.h"
+
+#include <sys/epoll.h>
+#include <sys/un.h>
+#include <sys/types.h>
+#include <sys/socket.h>
+#include <sys/eventfd.h>
+
+struct kvm_ipc_head {
+       u32 type;
+       u32 len;
+};
+
+#define KVM_IPC_MAX_MSGS 16
+
+static void (*msgs[KVM_IPC_MAX_MSGS])(int fd, u32 type, u32 len, u8 *msg);
+static DECLARE_RWSEM(msgs_rwlock);
+static int epoll_fd, server_fd, stop_fd;
+static pthread_t thread;
+
+int kvm_ipc__register_handler(u32 type, void (*cb)(int fd, u32 type, u32 len, u8 *msg))
+{
+       if (type >= KVM_IPC_MAX_MSGS)
+               return -ENOSPC;
+
+       down_write(&msgs_rwlock);
+       msgs[type] = cb;
+       up_write(&msgs_rwlock);
+
+       return 0;
+}
+
+int kvm_ipc__send(int fd, u32 type)
+{
+       struct kvm_ipc_head head = {.type = type, .len = 0,};
+
+       if (write_in_full(fd, &head, sizeof(head)) < 0)
+               return -1;
+
+       return 0;
+}
+
+int kvm_ipc__send_msg(int fd, u32 type, u32 len, u8 *msg)
+{
+       struct kvm_ipc_head head = {.type = type, .len = len,};
+
+       if (write_in_full(fd, &head, sizeof(head)) < 0)
+               return -1;
+
+       if (write_in_full(fd, msg, len) < 0)
+               return -1;
+
+       return 0;
+}
+
+static int kvm_ipc__handle(int fd, u32 type, u32 len, u8 *data)
+{
+       void (*cb)(int fd, u32 type, u32 len, u8 *msg);
+
+       if (type >= KVM_IPC_MAX_MSGS)
+               return -ENOSPC;
+
+       down_read(&msgs_rwlock);
+       cb = msgs[type];
+       up_read(&msgs_rwlock);
+
+       if (cb == NULL) {
+               pr_warning("No device handles type %u\n", type);
+               return -ENODEV;
+       }
+
+       cb(fd, type, len, data);
+
+       return 0;
+}
+
+static int kvm_ipc__new_conn(int fd)
+{
+       int client;
+       struct epoll_event ev;
+
+       client = accept(fd, NULL, NULL);
+       if (client < 0)
+               return -1;
+
+       ev.events = EPOLLIN | EPOLLRDHUP;
+       ev.data.fd = client;
+       if (epoll_ctl(epoll_fd, EPOLL_CTL_ADD, client, &ev) < 0) {
+               close(client);
+               return -1;
+       }
+
+       return client;
+}
+
+static void kvm_ipc__close_conn(int fd)
+{
+       epoll_ctl(epoll_fd, EPOLL_CTL_DEL, fd, NULL);
+       close(fd);
+}
+
+static int kvm_ipc__receive(int fd)
+{
+       struct kvm_ipc_head head;
+       u8 *msg = NULL;
+       u32 n;
+
+       n = read(fd, &head, sizeof(head));
+       if (n != sizeof(head))
+               goto done;
+
+       msg = malloc(head.len);
+       if (msg == NULL)
+               goto done;
+
+       n = read_in_full(fd, msg, head.len);
+       if (n != head.len)
+               goto done;
+
+       kvm_ipc__handle(fd, head.type, head.len, msg);
+
+       return 0;
+
+done:
+       free(msg);
+       return -1;
+}
+
+static void *kvm_ipc__thread(void *param)
+{
+       struct epoll_event event;
+
+       for (;;) {
+               int nfds;
+
+               nfds = epoll_wait(epoll_fd, &event, 1, -1);
+               if (nfds > 0) {
+                       int fd = event.data.fd;
+
+                       if (fd == stop_fd && event.events & EPOLLIN) {
+                               break;
+                       } else if (fd == server_fd) {
+                               int client, r;
+
+                               client = kvm_ipc__new_conn(fd);
+                               /*
+                                * Handle multiple IPC commands at a time
+                                */
+                               do {
+                                       r = kvm_ipc__receive(client);
+                               } while (r == 0);
+
+                       } else if (event.events & (EPOLLERR | EPOLLRDHUP | EPOLLHUP)) {
+                               kvm_ipc__close_conn(fd);
+                       } else {
+                               kvm_ipc__receive(fd);
+                       }
+               }
+       }
+
+       return NULL;
+}
+
+int kvm_ipc__start(int sock)
+{
+       int ret;
+       struct epoll_event ev = {0};
+
+       server_fd = sock;
+
+       epoll_fd = epoll_create(KVM_IPC_MAX_MSGS);
+       if (epoll_fd < 0) {
+               ret = epoll_fd;
+               goto err;
+       }
+
+       ev.events = EPOLLIN | EPOLLET;
+       ev.data.fd = sock;
+       if (epoll_ctl(epoll_fd, EPOLL_CTL_ADD, sock, &ev) < 0) {
+               pr_err("Failed starting IPC thread");
+               ret = -EFAULT;
+               goto err_epoll;
+       }
+
+       stop_fd = eventfd(0, 0);
+       if (stop_fd < 0) {
+               ret = stop_fd;
+               goto err_epoll;
+       }
+
+       ev.events = EPOLLIN | EPOLLET;
+       ev.data.fd = stop_fd;
+       if (epoll_ctl(epoll_fd, EPOLL_CTL_ADD, stop_fd, &ev) < 0) {
+               pr_err("Failed adding stop event to epoll");
+               ret = -EFAULT;
+               goto err_stop;
+       }
+
+       if (pthread_create(&thread, NULL, kvm_ipc__thread, NULL) != 0) {
+               pr_err("Failed starting IPC thread");
+               ret = -EFAULT;
+               goto err_stop;
+       }
+
+       return 0;
+
+err_stop:
+       close(stop_fd);
+err_epoll:
+       close(epoll_fd);
+err:
+       return ret;
+}
+
+int kvm_ipc__stop(void)
+{
+       u64 val = 1;
+       int ret;
+
+       ret = write(stop_fd, &val, sizeof(val));
+       if (ret < 0)
+               return ret;
+
+       close(server_fd);
+       close(epoll_fd);
+
+       return ret;
+}
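
On the wire, every IPC message is a struct kvm_ipc_head followed by len payload bytes, so a client needs nothing beyond plain read/write. A sketch using kvm__get_sock_by_instance() from kvm.c (further down in this patch) and the KVM_IPC_PID type it handles; the instance name is illustrative:

	int sock = kvm__get_sock_by_instance("guest-1");
	pid_t pid;

	if (sock >= 0 &&
	    kvm_ipc__send(sock, KVM_IPC_PID) == 0 &&
	    read(sock, &pid, sizeof(pid)) == sizeof(pid))
		printf("instance runs as pid %d\n", pid);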
diff --git a/tools/kvm/kvm.c b/tools/kvm/kvm.c
new file mode 100644 (file)
index 0000000..64d8b51
--- /dev/null
@@ -0,0 +1,593 @@
+#include "kvm/kvm.h"
+#include "kvm/read-write.h"
+#include "kvm/util.h"
+#include "kvm/strbuf.h"
+#include "kvm/mutex.h"
+#include "kvm/kvm-cpu.h"
+#include "kvm/kvm-ipc.h"
+
+#include <linux/kvm.h>
+#include <linux/err.h>
+
+#include <sys/un.h>
+#include <sys/stat.h>
+#include <sys/types.h>
+#include <sys/socket.h>
+#include <sys/ioctl.h>
+#include <sys/mman.h>
+#include <stdbool.h>
+#include <limits.h>
+#include <signal.h>
+#include <stdarg.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+#include <stdio.h>
+#include <fcntl.h>
+#include <time.h>
+#include <sys/eventfd.h>
+#include <asm/unistd.h>
+#include <dirent.h>
+
+#define DEFINE_KVM_EXIT_REASON(reason) [reason] = #reason
+
+const char *kvm_exit_reasons[] = {
+       DEFINE_KVM_EXIT_REASON(KVM_EXIT_UNKNOWN),
+       DEFINE_KVM_EXIT_REASON(KVM_EXIT_EXCEPTION),
+       DEFINE_KVM_EXIT_REASON(KVM_EXIT_IO),
+       DEFINE_KVM_EXIT_REASON(KVM_EXIT_HYPERCALL),
+       DEFINE_KVM_EXIT_REASON(KVM_EXIT_DEBUG),
+       DEFINE_KVM_EXIT_REASON(KVM_EXIT_HLT),
+       DEFINE_KVM_EXIT_REASON(KVM_EXIT_MMIO),
+       DEFINE_KVM_EXIT_REASON(KVM_EXIT_IRQ_WINDOW_OPEN),
+       DEFINE_KVM_EXIT_REASON(KVM_EXIT_SHUTDOWN),
+       DEFINE_KVM_EXIT_REASON(KVM_EXIT_FAIL_ENTRY),
+       DEFINE_KVM_EXIT_REASON(KVM_EXIT_INTR),
+       DEFINE_KVM_EXIT_REASON(KVM_EXIT_SET_TPR),
+       DEFINE_KVM_EXIT_REASON(KVM_EXIT_TPR_ACCESS),
+       DEFINE_KVM_EXIT_REASON(KVM_EXIT_S390_SIEIC),
+       DEFINE_KVM_EXIT_REASON(KVM_EXIT_S390_RESET),
+       DEFINE_KVM_EXIT_REASON(KVM_EXIT_DCR),
+       DEFINE_KVM_EXIT_REASON(KVM_EXIT_NMI),
+       DEFINE_KVM_EXIT_REASON(KVM_EXIT_INTERNAL_ERROR),
+#ifdef CONFIG_PPC64
+       DEFINE_KVM_EXIT_REASON(KVM_EXIT_PAPR_HCALL),
+#endif
+};
+
+extern struct kvm *kvm;
+extern struct kvm_cpu **kvm_cpus;
+static int pause_event;
+static DEFINE_MUTEX(pause_lock);
+extern struct kvm_ext kvm_req_ext[];
+
+static char kvm_dir[PATH_MAX];
+
+static int set_dir(const char *fmt, va_list args)
+{
+       char tmp[PATH_MAX];
+
+       vsnprintf(tmp, sizeof(tmp), fmt, args);
+
+       mkdir(tmp, 0777);
+
+       if (!realpath(tmp, kvm_dir))
+               return -errno;
+
+       strcat(kvm_dir, "/");
+
+       return 0;
+}
+
+void kvm__set_dir(const char *fmt, ...)
+{
+       va_list args;
+
+       va_start(args, fmt);
+       set_dir(fmt, args);
+       va_end(args);
+}
+
+const char *kvm__get_dir(void)
+{
+       return kvm_dir;
+}
+
+bool kvm__supports_extension(struct kvm *kvm, unsigned int extension)
+{
+       int ret;
+
+       ret = ioctl(kvm->sys_fd, KVM_CHECK_EXTENSION, extension);
+       if (ret < 0)
+               return false;
+
+       return ret;
+}
+
+static int kvm__check_extensions(struct kvm *kvm)
+{
+       int i;
+
+       for (i = 0; ; i++) {
+               if (!kvm_req_ext[i].name)
+                       break;
+               if (!kvm__supports_extension(kvm, kvm_req_ext[i].code)) {
+                       pr_err("Unsupported KVM extension detected: %s",
+                               kvm_req_ext[i].name);
+                       return -i;
+               }
+       }
+
+       return 0;
+}
+
+static struct kvm *kvm__new(void)
+{
+       struct kvm *kvm = calloc(1, sizeof(*kvm));
+       if (!kvm)
+               return ERR_PTR(-ENOMEM);
+
+       kvm->sys_fd = -1;
+       kvm->vm_fd = -1;
+
+       return kvm;
+}
+
+#define KVM_SOCK_SUFFIX                ".sock"
+#define KVM_SOCK_SUFFIX_LEN    ((ssize_t)sizeof(KVM_SOCK_SUFFIX) - 1)
+
+static int kvm__create_socket(struct kvm *kvm)
+{
+       char full_name[PATH_MAX];
+       int s;
+       struct sockaddr_un local;
+       int len, r;
+
+       /* This is usually 108 bytes long */
+       BUILD_BUG_ON(sizeof(local.sun_path) < 32);
+
+       if (!kvm->name)
+               return -EINVAL;
+
+       snprintf(full_name, sizeof(full_name), "%s/%s%s",
+                kvm__get_dir(), kvm->name, KVM_SOCK_SUFFIX);
+       if (access(full_name, F_OK) == 0) {
+               pr_err("Socket file %s already exists", full_name);
+               return -EEXIST;
+       }
+
+       s = socket(AF_UNIX, SOCK_STREAM, 0);
+       if (s < 0)
+               return s;
+       local.sun_family = AF_UNIX;
+       strlcpy(local.sun_path, full_name, sizeof(local.sun_path));
+       len = strlen(local.sun_path) + sizeof(local.sun_family);
+       r = bind(s, (struct sockaddr *)&local, len);
+       if (r < 0)
+               goto fail;
+
+       r = listen(s, 5);
+       if (r < 0)
+               goto fail;
+
+       return s;
+
+fail:
+       close(s);
+       return r;
+}
+
+void kvm__remove_socket(const char *name)
+{
+       char full_name[PATH_MAX];
+
+       snprintf(full_name, sizeof(full_name), "%s/%s%s",
+                kvm__get_dir(), name, KVM_SOCK_SUFFIX);
+       unlink(full_name);
+}
+
+int kvm__get_sock_by_instance(const char *name)
+{
+       int s, len, r;
+       char sock_file[PATH_MAX];
+       struct sockaddr_un local;
+
+       snprintf(sock_file, sizeof(sock_file), "%s/%s%s",
+                kvm__get_dir(), name, KVM_SOCK_SUFFIX);
+       s = socket(AF_UNIX, SOCK_STREAM, 0);
+
+       local.sun_family = AF_UNIX;
+       strlcpy(local.sun_path, sock_file, sizeof(local.sun_path));
+       len = strlen(local.sun_path) + sizeof(local.sun_family);
+
+       r = connect(s, (struct sockaddr *)&local, len);
+       if (r < 0 && errno == ECONNREFUSED) {
+               /* Tell the user to clean up the ghost socket file */
+               pr_err("\"%s\" could be a ghost socket file, please remove it",
+                               sock_file);
+               return r;
+       } else if (r < 0) {
+               return r;
+       }
+
+       return s;
+}
+
+int kvm__enumerate_instances(int (*callback)(const char *name, int fd))
+{
+       int sock;
+       DIR *dir;
+       struct dirent entry, *result;
+       int ret = 0;
+
+       dir = opendir(kvm__get_dir());
+       if (!dir)
+               return -errno;
+
+       for (;;) {
+               readdir_r(dir, &entry, &result);
+               if (result == NULL)
+                       break;
+               if (entry.d_type == DT_SOCK) {
+                       ssize_t name_len = strlen(entry.d_name);
+                       char *p;
+
+                       if (name_len <= KVM_SOCK_SUFFIX_LEN)
+                               continue;
+
+                       p = &entry.d_name[name_len - KVM_SOCK_SUFFIX_LEN];
+                       if (memcmp(KVM_SOCK_SUFFIX, p, KVM_SOCK_SUFFIX_LEN))
+                               continue;
+
+                       *p = 0;
+                       sock = kvm__get_sock_by_instance(entry.d_name);
+                       if (sock < 0)
+                               continue;
+                       ret = callback(entry.d_name, sock);
+                       close(sock);
+                       if (ret < 0)
+                               break;
+               }
+       }
+
+       closedir(dir);
+
+       return ret;
+}
+
+int kvm__exit(struct kvm *kvm)
+{
+       kvm__stop_timer(kvm);
+
+       kvm__arch_delete_ram(kvm);
+       kvm_ipc__stop();
+       kvm__remove_socket(kvm->name);
+       free(kvm->name);
+       free(kvm);
+
+       return 0;
+}
+
+/*
+ * Note: KVM_SET_USER_MEMORY_REGION assumes that we don't pass overlapping
+ * memory regions to it. Therefore, be careful if you use this function for
+ * registering memory regions for emulating hardware.
+ */
+int kvm__register_mem(struct kvm *kvm, u64 guest_phys, u64 size, void *userspace_addr)
+{
+       struct kvm_userspace_memory_region mem;
+       int ret;
+
+       mem = (struct kvm_userspace_memory_region) {
+               .slot                   = kvm->mem_slots++,
+               .guest_phys_addr        = guest_phys,
+               .memory_size            = size,
+               .userspace_addr         = (unsigned long)userspace_addr,
+       };
+
+       ret = ioctl(kvm->vm_fd, KVM_SET_USER_MEMORY_REGION, &mem);
+       if (ret < 0)
+               return -errno;
+
+       return 0;
+}
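+
+/*
+ * Typical usage (a sketch; the mmap flags are illustrative): back guest RAM
+ * with an anonymous mapping and register it as a single slot.
+ *
+ *	void *ram = mmap(NULL, size, PROT_READ | PROT_WRITE,
+ *			 MAP_PRIVATE | MAP_ANONYMOUS | MAP_NORESERVE, -1, 0);
+ *	if (ram != MAP_FAILED)
+ *		kvm__register_mem(kvm, 0, size, ram);
+ */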
+
+int kvm__recommended_cpus(struct kvm *kvm)
+{
+       int ret;
+
+       ret = ioctl(kvm->sys_fd, KVM_CHECK_EXTENSION, KVM_CAP_NR_VCPUS);
+       if (ret <= 0)
+               /*
+                * api.txt states that if KVM_CAP_NR_VCPUS does not exist,
+                * assume 4.
+                */
+               return 4;
+
+       return ret;
+}
+
+static void kvm__pid(int fd, u32 type, u32 len, u8 *msg)
+{
+       pid_t pid = getpid();
+       int r = 0;
+
+       if (type == KVM_IPC_PID)
+               r = write(fd, &pid, sizeof(pid));
+
+       if (r < 0)
+               pr_warning("Failed sending PID");
+}
+
+/*
+ * The following hack should be removed once 'x86: Raise the hard
+ * VCPU count limit' makes its way into the mainline.
+ */
+#ifndef KVM_CAP_MAX_VCPUS
+#define KVM_CAP_MAX_VCPUS 66
+#endif
+
+int kvm__max_cpus(struct kvm *kvm)
+{
+       int ret;
+
+       ret = ioctl(kvm->sys_fd, KVM_CHECK_EXTENSION, KVM_CAP_MAX_VCPUS);
+       if (ret <= 0)
+               ret = kvm__recommended_cpus(kvm);
+
+       return ret;
+}
+
+struct kvm *kvm__init(const char *kvm_dev, const char *hugetlbfs_path, u64 ram_size, const char *name)
+{
+       struct kvm *kvm;
+       int ret;
+
+       if (!kvm__arch_cpu_supports_vm()) {
+               pr_err("Your CPU does not support hardware virtualization");
+               ret = -ENOSYS;
+               goto err;
+       }
+
+       kvm = kvm__new();
+       if (IS_ERR(kvm))
+               return kvm;
+
+       kvm->sys_fd = open(kvm_dev, O_RDWR);
+       if (kvm->sys_fd < 0) {
+               if (errno == ENOENT)
+                       pr_err("'%s' not found. Please make sure your kernel has CONFIG_KVM "
+                              "enabled and that the KVM modules are loaded.", kvm_dev);
+               else if (errno == ENODEV)
+                       pr_err("'%s' KVM driver not available.\n  # (If the KVM "
+                              "module is loaded then 'dmesg' may offer further clues "
+                              "about the failure.)", kvm_dev);
+               else
+                       pr_err("Could not open %s: ", kvm_dev);
+
+               ret = -errno;
+               goto err_free;
+       }
+
+       ret = ioctl(kvm->sys_fd, KVM_GET_API_VERSION, 0);
+       if (ret != KVM_API_VERSION) {
+               pr_err("KVM_API_VERSION ioctl");
+               ret = -errno;
+               goto err_sys_fd;
+       }
+
+       kvm->vm_fd = ioctl(kvm->sys_fd, KVM_CREATE_VM, 0);
+       if (kvm->vm_fd < 0) {
+               ret = kvm->vm_fd;
+               goto err_sys_fd;
+       }
+
+       kvm->name = strdup(name);
+       if (!kvm->name) {
+               ret = -ENOMEM;
+               goto err_vm_fd;
+       }
+
+       if (kvm__check_extensions(kvm)) {
+               pr_err("A required KVM extension is not supported by the OS");
+               ret = -ENOSYS;
+               goto err_vm_fd;
+       }
+
+       kvm__arch_init(kvm, hugetlbfs_path, ram_size);
+
+       ret = kvm_ipc__start(kvm__create_socket(kvm));
+       if (ret < 0) {
+               pr_err("Starting IPC failed.");
+               goto err_vm_fd;
+       }
+
+       ret = kvm_ipc__register_handler(KVM_IPC_PID, kvm__pid);
+       if (ret < 0) {
+               pr_err("Registering the IPC handler failed.");
+               goto err_ipc;
+       }
+
+       return kvm;
+
+err_ipc:
+       kvm_ipc__stop();
+err_vm_fd:
+       close(kvm->vm_fd);
+err_sys_fd:
+       close(kvm->sys_fd);
+err_free:
+       free(kvm);
+err:
+       return ERR_PTR(ret);
+}
+
+/* RFC 1952 */
+#define GZIP_ID1               0x1f
+#define GZIP_ID2               0x8b
+#define CPIO_MAGIC             "0707"
+/* initrd may be gzipped, or a plain cpio */
+static bool initrd_check(int fd)
+{
+       unsigned char id[4];
+
+       if (read_in_full(fd, id, ARRAY_SIZE(id)) < 0)
+               return false;
+
+       if (lseek(fd, 0, SEEK_SET) < 0)
+               die_perror("lseek");
+
+       return (id[0] == GZIP_ID1 && id[1] == GZIP_ID2) ||
+               !memcmp(id, CPIO_MAGIC, 4);
+}
+
+bool kvm__load_kernel(struct kvm *kvm, const char *kernel_filename,
+               const char *initrd_filename, const char *kernel_cmdline, u16 vidmode)
+{
+       bool ret;
+       int fd_kernel = -1, fd_initrd = -1;
+
+       fd_kernel = open(kernel_filename, O_RDONLY);
+       if (fd_kernel < 0)
+               die("Unable to open kernel %s", kernel_filename);
+
+       if (initrd_filename) {
+               fd_initrd = open(initrd_filename, O_RDONLY);
+               if (fd_initrd < 0)
+                       die("Unable to open initrd %s", initrd_filename);
+
+               if (!initrd_check(fd_initrd))
+                       die("%s is not an initrd", initrd_filename);
+       }
+
+       ret = load_bzimage(kvm, fd_kernel, fd_initrd, kernel_cmdline, vidmode);
+
+       if (ret)
+               goto found_kernel;
+
+       pr_warning("%s is not a bzImage. Trying to load it as a flat binary...", kernel_filename);
+
+       ret = load_flat_binary(kvm, fd_kernel, fd_initrd, kernel_cmdline);
+
+       if (ret)
+               goto found_kernel;
+
+       if (initrd_filename)
+               close(fd_initrd);
+       close(fd_kernel);
+
+       die("%s is not a valid bzImage or flat binary", kernel_filename);
+
+found_kernel:
+       if (initrd_filename)
+               close(fd_initrd);
+       close(fd_kernel);
+
+       return ret;
+}
+
+#define TIMER_INTERVAL_NS 1000000      /* 1 msec */
+
+/*
+ * This function sets up a timer that's used to inject interrupts from the
+ * userspace hypervisor into the guest at periodic intervals. Please note
+ * that the clock interrupt, for example, is not handled here.
+ */
+void kvm__start_timer(struct kvm *kvm)
+{
+       struct itimerspec its;
+       struct sigevent sev;
+
+       memset(&sev, 0, sizeof(struct sigevent));
+       sev.sigev_value.sival_int       = 0;
+       sev.sigev_notify                = SIGEV_THREAD_ID;
+       sev.sigev_signo                 = SIGALRM;
+       sev._sigev_un._tid              = syscall(__NR_gettid);
+
+       if (timer_create(CLOCK_REALTIME, &sev, &kvm->timerid) < 0)
+               die("timer_create()");
+
+       its.it_value.tv_sec             = TIMER_INTERVAL_NS / 1000000000;
+       its.it_value.tv_nsec            = TIMER_INTERVAL_NS % 1000000000;
+       its.it_interval.tv_sec          = its.it_value.tv_sec;
+       its.it_interval.tv_nsec         = its.it_value.tv_nsec;
+
+       if (timer_settime(kvm->timerid, 0, &its, NULL) < 0)
+               die("timer_settime()");
+}
+
+void kvm__stop_timer(struct kvm *kvm)
+{
+       if (kvm->timerid)
+               if (timer_delete(kvm->timerid) < 0)
+                       die("timer_delete()");
+
+       kvm->timerid = 0;
+}
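+
+/*
+ * Usage sketch: arm the tick before entering the guest and tear it down on
+ * exit. VCPU threads block SIGALRM (see kvm_cpu__start()), so delivery stays
+ * with the thread whose tid was placed in sigev above.
+ *
+ *	kvm__start_timer(kvm);
+ *	...
+ *	kvm__stop_timer(kvm);
+ */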
+
+void kvm__dump_mem(struct kvm *kvm, unsigned long addr, unsigned long size)
+{
+       unsigned char *p;
+       unsigned long n;
+
+       size &= ~7; /* round down to a multiple of 8 */
+       if (!size)
+               return;
+
+       p = guest_flat_to_host(kvm, addr);
+
+       for (n = 0; n < size; n += 8) {
+               if (!host_ptr_in_ram(kvm, p + n))
+                       break;
+
+               printf("  0x%08lx: %02x %02x %02x %02x  %02x %02x %02x %02x\n",
+                       addr + n, p[n + 0], p[n + 1], p[n + 2], p[n + 3],
+                                 p[n + 4], p[n + 5], p[n + 6], p[n + 7]);
+       }
+}
+
+void kvm__pause(void)
+{
+       int i, paused_vcpus = 0;
+
+       /* Check if the guest is running */
+       if (!kvm_cpus[0] || kvm_cpus[0]->thread == 0)
+               return;
+
+       mutex_lock(&pause_lock);
+
+       pause_event = eventfd(0, 0);
+       if (pause_event < 0)
+               die("Failed creating pause notification event");
+       for (i = 0; i < kvm->nrcpus; i++)
+               pthread_kill(kvm_cpus[i]->thread, SIGKVMPAUSE);
+
+       while (paused_vcpus < kvm->nrcpus) {
+               u64 cur_read;
+
+               if (read(pause_event, &cur_read, sizeof(cur_read)) < 0)
+                       die("Failed reading pause event");
+               paused_vcpus += cur_read;
+       }
+       close(pause_event);
+}
+
+void kvm__continue(void)
+{
+       /* Check if the guest is running */
+       if (!kvm_cpus[0] || kvm_cpus[0]->thread == 0)
+               return;
+
+       mutex_unlock(&pause_lock);
+}
+
+void kvm__notify_paused(void)
+{
+       u64 p = 1;
+
+       if (write(pause_event, &p, sizeof(p)) < 0)
+               die("Failed notifying of paused VCPU.");
+
+       mutex_lock(&pause_lock);
+       mutex_unlock(&pause_lock);
+}
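+
+/*
+ * Note on the pause handshake: kvm__pause() takes pause_lock, creates the
+ * pause_event eventfd and sends SIGKVMPAUSE to every VCPU thread. Each VCPU
+ * acknowledges from its run loop via kvm__notify_paused(), which bumps
+ * pause_event and then blocks taking pause_lock; kvm__continue() simply
+ * drops the lock, releasing all VCPUs at once.
+ */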
diff --git a/tools/kvm/main.c b/tools/kvm/main.c
new file mode 100644 (file)
index 0000000..05bc82c
--- /dev/null
@@ -0,0 +1,19 @@
+#include "kvm/kvm.h"
+
+#include <stdlib.h>
+#include <stdio.h>
+
+/* user defined header files */
+#include <kvm/kvm-cmd.h>
+
+static int handle_kvm_command(int argc, char **argv)
+{
+       return handle_command(kvm_commands, argc, (const char **) &argv[0]);
+}
+
+int main(int argc, char *argv[])
+{
+       kvm__set_dir("%s/%s", HOME_DIR, KVM_PID_FILE_PATH);
+
+       return handle_kvm_command(argc - 1, &argv[1]);
+}
diff --git a/tools/kvm/mmio.c b/tools/kvm/mmio.c
new file mode 100644 (file)
index 0000000..dd28ef3
--- /dev/null
@@ -0,0 +1,140 @@
+#include "kvm/kvm.h"
+#include "kvm/rbtree-interval.h"
+#include "kvm/brlock.h"
+
+#include <stdio.h>
+#include <stdlib.h>
+
+#include <sys/ioctl.h>
+#include <linux/kvm.h>
+#include <linux/types.h>
+#include <linux/rbtree.h>
+#include <linux/err.h>
+#include <errno.h>
+
+#define mmio_node(n) rb_entry(n, struct mmio_mapping, node)
+
+struct mmio_mapping {
+       struct rb_int_node      node;
+       void                    (*mmio_fn)(u64 addr, u8 *data, u32 len, u8 is_write, void *ptr);
+       void                    *ptr;
+};
+
+static struct rb_root mmio_tree = RB_ROOT;
+bool mmio_debug = false;
+
+static struct mmio_mapping *mmio_search(struct rb_root *root, u64 addr, u64 len)
+{
+       struct rb_int_node *node;
+
+       node = rb_int_search_range(root, addr, addr + len);
+       if (node == NULL)
+               return NULL;
+
+       return mmio_node(node);
+}
+
+/* Find the lowest match and check for overlap */
+static struct mmio_mapping *mmio_search_single(struct rb_root *root, u64 addr)
+{
+       struct rb_int_node *node;
+
+       node = rb_int_search_single(root, addr);
+       if (node == NULL)
+               return NULL;
+
+       return mmio_node(node);
+}
+
+static int mmio_insert(struct rb_root *root, struct mmio_mapping *data)
+{
+       return rb_int_insert(root, &data->node);
+}
+
+static const char *to_direction(u8 is_write)
+{
+       if (is_write)
+               return "write";
+
+       return "read";
+}
+
+int kvm__register_mmio(struct kvm *kvm, u64 phys_addr, u64 phys_addr_len, bool coalesce,
+                       void (*mmio_fn)(u64 addr, u8 *data, u32 len, u8 is_write, void *ptr),
+                       void *ptr)
+{
+       struct mmio_mapping *mmio;
+       struct kvm_coalesced_mmio_zone zone;
+       int ret;
+
+       mmio = malloc(sizeof(*mmio));
+       if (mmio == NULL)
+               return -ENOMEM;
+
+       *mmio = (struct mmio_mapping) {
+               .node = RB_INT_INIT(phys_addr, phys_addr + phys_addr_len),
+               .mmio_fn = mmio_fn,
+               .ptr    = ptr,
+       };
+
+       if (coalesce) {
+               zone = (struct kvm_coalesced_mmio_zone) {
+                       .addr   = phys_addr,
+                       .size   = phys_addr_len,
+               };
+               ret = ioctl(kvm->vm_fd, KVM_REGISTER_COALESCED_MMIO, &zone);
+               if (ret < 0) {
+                       free(mmio);
+                       return -errno;
+               }
+       }
+       br_write_lock();
+       ret = mmio_insert(&mmio_tree, mmio);
+       br_write_unlock();
+
+       return ret;
+}
+
+bool kvm__deregister_mmio(struct kvm *kvm, u64 phys_addr)
+{
+       struct mmio_mapping *mmio;
+       struct kvm_coalesced_mmio_zone zone;
+
+       br_write_lock();
+       mmio = mmio_search_single(&mmio_tree, phys_addr);
+       if (mmio == NULL) {
+               br_write_unlock();
+               return false;
+       }
+
+       zone = (struct kvm_coalesced_mmio_zone) {
+               .addr   = phys_addr,
+               .size   = 1,
+       };
+       ioctl(kvm->vm_fd, KVM_UNREGISTER_COALESCED_MMIO, &zone);
+
+       rb_int_erase(&mmio_tree, &mmio->node);
+       br_write_unlock();
+
+       free(mmio);
+       return true;
+}
+
+bool kvm__emulate_mmio(struct kvm *kvm, u64 phys_addr, u8 *data, u32 len, u8 is_write)
+{
+       struct mmio_mapping *mmio;
+
+       br_read_lock();
+       mmio = mmio_search(&mmio_tree, phys_addr, len);
+
+       if (mmio)
+               mmio->mmio_fn(phys_addr, data, len, is_write, mmio->ptr);
+       else {
+               if (mmio_debug)
+                       fprintf(stderr, "Warning: Ignoring MMIO %s at %016llx (length %u)\n",
+                               to_direction(is_write), phys_addr, len);
+       }
+       br_read_unlock();
+
+       return true;
+}
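
Registering an MMIO window from a device model mirrors the ioport side; a sketch with an illustrative callback name and guest-physical address:

	static void dummy_mmio(u64 addr, u8 *data, u32 len, u8 is_write, void *ptr)
	{
		if (!is_write)
			memset(data, 0, len);	/* reads return zero */
	}

	/* a 4 KiB window, with MMIO coalescing off */
	kvm__register_mmio(kvm, 0xd0000000, 4096, false, dummy_mmio, NULL);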
diff --git a/tools/kvm/net/uip/arp.c b/tools/kvm/net/uip/arp.c
new file mode 100644 (file)
index 0000000..98423da
--- /dev/null
@@ -0,0 +1,30 @@
+#include "kvm/uip.h"
+
+int uip_tx_do_arp(struct uip_tx_arg *arg)
+{
+       struct uip_arp *arp, *arp2;
+       struct uip_info *info;
+       struct uip_buf *buf;
+
+       info = arg->info;
+       buf = uip_buf_clone(arg);
+
+       arp      = (struct uip_arp *)(arg->eth);
+       arp2     = (struct uip_arp *)(buf->eth);
+
+       /*
+        * ARP reply opcode: 2
+        */
+       arp2->op   = htons(0x2);
+       arp2->dmac = arp->smac;
+       arp2->dip  = arp->sip;
+
+       if (arp->dip == htonl(info->host_ip)) {
+               arp2->smac = info->host_mac;
+               arp2->sip = htonl(info->host_ip);
+
+               uip_buf_set_used(info, buf);
+       }
+
+       return 0;
+}
diff --git a/tools/kvm/net/uip/buf.c b/tools/kvm/net/uip/buf.c
new file mode 100644 (file)
index 0000000..5e564a9
--- /dev/null
@@ -0,0 +1,114 @@
+#include "kvm/uip.h"
+
+#include <linux/kernel.h>
+#include <linux/list.h>
+
+struct uip_buf *uip_buf_get_used(struct uip_info *info)
+{
+       struct uip_buf *buf;
+       bool found = false;
+
+       mutex_lock(&info->buf_lock);
+
+       while (!(info->buf_used_nr > 0))
+               pthread_cond_wait(&info->buf_used_cond, &info->buf_lock);
+
+       list_for_each_entry(buf, &info->buf_head, list) {
+               if (buf->status == UIP_BUF_STATUS_USED) {
+                       /*
+                        * Set status to INUSE immediately to prevent
+                        * someone from using this buf until we free it
+                        */
+                       buf->status = UIP_BUF_STATUS_INUSE;
+                       info->buf_used_nr--;
+                       found = true;
+                       break;
+               }
+       }
+
+       mutex_unlock(&info->buf_lock);
+
+       return found ? buf : NULL;
+}
+
+struct uip_buf *uip_buf_get_free(struct uip_info *info)
+{
+       struct uip_buf *buf;
+       bool found = false;
+
+       mutex_lock(&info->buf_lock);
+
+       while (!(info->buf_free_nr > 0))
+               pthread_cond_wait(&info->buf_free_cond, &info->buf_lock);
+
+       list_for_each_entry(buf, &info->buf_head, list) {
+               if (buf->status == UIP_BUF_STATUS_FREE) {
+                       /*
+                        * Set status to INUSE immediately to prevent
+                        * someone from using this buf until we free it
+                        */
+                       buf->status = UIP_BUF_STATUS_INUSE;
+                       info->buf_free_nr--;
+                       found = true;
+                       break;
+               }
+       }
+
+       mutex_unlock(&info->buf_lock);
+
+       return found ? buf : NULL;
+}
+
+struct uip_buf *uip_buf_set_used(struct uip_info *info, struct uip_buf *buf)
+{
+       mutex_lock(&info->buf_lock);
+
+       buf->status = UIP_BUF_STATUS_USED;
+       info->buf_used_nr++;
+       pthread_cond_signal(&info->buf_used_cond);
+
+       mutex_unlock(&info->buf_lock);
+
+       return buf;
+}
+
+struct uip_buf *uip_buf_set_free(struct uip_info *info, struct uip_buf *buf)
+{
+       mutex_lock(&info->buf_lock);
+
+       buf->status = UIP_BUF_STATUS_FREE;
+       info->buf_free_nr++;
+       pthread_cond_signal(&info->buf_free_cond);
+
+       mutex_unlock(&info->buf_lock);
+
+       return buf;
+}
+
+struct uip_buf *uip_buf_clone(struct uip_tx_arg *arg)
+{
+       struct uip_buf *buf;
+       struct uip_eth *eth2;
+       struct uip_info *info;
+
+       info = arg->info;
+
+       /*
+        * Get buffer from device to guest
+        */
+       buf = uip_buf_get_free(info);
+
+       /*
+        * Clone buffer
+        */
+       memcpy(buf->vnet, arg->vnet, arg->vnet_len);
+       memcpy(buf->eth, arg->eth, arg->eth_len);
+       buf->vnet_len   = arg->vnet_len;
+       buf->eth_len    = arg->eth_len;
+
+       eth2            = (struct uip_eth *)buf->eth;
+       eth2->src       = info->host_mac;
+       eth2->dst       = arg->eth->src;
+
+       return buf;
+}
diff --git a/tools/kvm/net/uip/core.c b/tools/kvm/net/uip/core.c
new file mode 100644 (file)
index 0000000..2e7603c
--- /dev/null
@@ -0,0 +1,190 @@
+#include "kvm/mutex.h"
+#include "kvm/uip.h"
+
+#include <linux/virtio_net.h>
+#include <linux/kernel.h>
+#include <linux/list.h>
+
+int uip_tx(struct iovec *iov, u16 out, struct uip_info *info)
+{
+       struct virtio_net_hdr *vnet;
+       struct uip_tx_arg arg;
+       int eth_len, vnet_len;
+       struct uip_eth *eth;
+       u8 *buf = NULL;
+       u16 proto;
+       int i;
+
+       /*
+        * Buffer from guest to device
+        */
+       vnet_len = iov[0].iov_len;
+       vnet     = iov[0].iov_base;
+
+       eth_len  = iov[1].iov_len;
+       eth      = iov[1].iov_base;
+
+       /*
+        * In case the ethernet frame spans more than one iov entry,
+        * copy the iov buffers into one linear buffer.
+        */
+       if (out > 2) {
+               eth_len = 0;
+               for (i = 1; i < out; i++)
+                       eth_len += iov[i].iov_len;
+
+               buf = malloc(eth_len);
+               if (!buf)
+                       return -1;
+
+               eth = (struct uip_eth *)buf;
+               for (i = 1; i < out; i++) {
+                       memcpy(buf, iov[i].iov_base, iov[i].iov_len);
+                       buf += iov[i].iov_len;
+               }
+       }
+
+       memset(&arg, 0, sizeof(arg));
+
+       arg.vnet_len = vnet_len;
+       arg.eth_len = eth_len;
+       arg.info = info;
+       arg.vnet = vnet;
+       arg.eth = eth;
+
+       /*
+        * Check the packet type
+        */
+       proto = ntohs(eth->type);
+
+       switch (proto) {
+       case UIP_ETH_P_ARP:
+               uip_tx_do_arp(&arg);
+               break;
+       case UIP_ETH_P_IP:
+               uip_tx_do_ipv4(&arg);
+               break;
+       default:
+               break;
+       }
+
+       if (out > 2 && buf)
+               free(eth);
+
+       return vnet_len + eth_len;
+}
+
+int uip_rx(struct iovec *iov, u16 in, struct uip_info *info)
+{
+       struct virtio_net_hdr *vnet;
+       struct uip_eth *eth;
+       struct uip_buf *buf;
+       int vnet_len;
+       int eth_len;
+       char *p;
+       int len;
+       int cnt;
+       int i;
+
+       /*
+        * Sleep until there is a buffer for guest
+        */
+       buf = uip_buf_get_used(info);
+
+       /*
+        * Fill the device-to-guest buffer, vnet header first
+        */
+       vnet_len = iov[0].iov_len;
+       vnet = iov[0].iov_base;
+       if (buf->vnet_len > vnet_len) {
+               len = -1;
+               goto out;
+       }
+       memcpy(vnet, buf->vnet, buf->vnet_len);
+
+       /*
+        * Then the real eth data.
+        * Note: be sure buf->eth_len is not bigger than the buffer length
+        * that the guest provides.
+        */
+       cnt = buf->eth_len;
+       p = buf->eth;
+       for (i = 1; i < in; i++) {
+               eth_len = iov[i].iov_len;
+               eth = iov[i].iov_base;
+               if (cnt > eth_len) {
+                       memcpy(eth, p, eth_len);
+                       cnt -= eth_len;
+                       p += eth_len;
+               } else {
+                       memcpy(eth, p, cnt);
+                       cnt = 0;
+                       break;
+               }
+       }
+
+       if (cnt) {
+               pr_warning("uip_rx error");
+               len = -1;
+               goto out;
+       }
+
+       len = buf->vnet_len + buf->eth_len;
+
+out:
+       uip_buf_set_free(info, buf);
+       return len;
+}
+
+int uip_init(struct uip_info *info)
+{
+       struct list_head *udp_socket_head;
+       struct list_head *tcp_socket_head;
+       struct list_head *buf_head;
+       struct uip_buf *buf;
+       int buf_nr;
+       int i;
+
+       udp_socket_head = &info->udp_socket_head;
+       tcp_socket_head = &info->tcp_socket_head;
+       buf_head        = &info->buf_head;
+       buf_nr          = info->buf_nr;
+
+       INIT_LIST_HEAD(udp_socket_head);
+       INIT_LIST_HEAD(tcp_socket_head);
+       INIT_LIST_HEAD(buf_head);
+
+       pthread_mutex_init(&info->udp_socket_lock, NULL);
+       pthread_mutex_init(&info->tcp_socket_lock, NULL);
+       pthread_mutex_init(&info->buf_lock, NULL);
+
+       pthread_cond_init(&info->buf_used_cond, NULL);
+       pthread_cond_init(&info->buf_free_cond, NULL);
+
+
+       for (i = 0; i < buf_nr; i++) {
+               buf = malloc(sizeof(*buf));
+               memset(buf, 0, sizeof(*buf));
+
+               buf->status     = UIP_BUF_STATUS_FREE;
+               buf->info       = info;
+               buf->id         = i;
+               list_add_tail(&buf->list, buf_head);
+       }
+
+       list_for_each_entry(buf, buf_head, list) {
+               buf->vnet       = malloc(sizeof(struct virtio_net_hdr));
+               buf->vnet_len   = sizeof(struct virtio_net_hdr);
+               buf->eth        = malloc(1024*64 + sizeof(struct uip_pseudo_hdr));
+               buf->eth_len    = 1024*64 + sizeof(struct uip_pseudo_hdr);
+
+               memset(buf->vnet, 0, buf->vnet_len);
+               memset(buf->eth, 0, buf->eth_len);
+       }
+
+       info->buf_free_nr = buf_nr;
+       info->buf_used_nr = 0;
+
+       uip_dhcp_get_dns(info);
+
+       return 0;
+}
diff --git a/tools/kvm/net/uip/csum.c b/tools/kvm/net/uip/csum.c
new file mode 100644 (file)
index 0000000..7ca8bad
--- /dev/null
@@ -0,0 +1,92 @@
+#include "kvm/uip.h"
+
+static u16 uip_csum(u16 csum, u8 *addr, u16 count)
+{
+       long sum = csum;
+
+       while (count > 1) {
+               sum     += *(u16 *)addr;
+               addr    += 2;
+               count   -= 2;
+       }
+
+       if (count > 0)
+               sum += *(unsigned char *)addr;
+
+       while (sum >> 16)
+               sum = (sum & 0xffff) + (sum >> 16);
+
+       return ~sum;
+}
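+
+/*
+ * Example: summing the two words 0x4500 and 0x0073 gives 0x4573, so
+ * uip_csum() returns the one's complement ~0x4573 = 0xba8c (the u16 loads
+ * above read in host byte order, which RFC 1071 permits).
+ */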
+
+u16 uip_csum_ip(struct uip_ip *ip)
+{
+       return uip_csum(0, &ip->vhl, uip_ip_hdrlen(ip));
+}
+
+u16 uip_csum_icmp(struct uip_icmp *icmp)
+{
+       struct uip_ip *ip;
+
+       ip = &icmp->ip;
+       return icmp->csum = uip_csum(0, &icmp->type, htons(ip->len) - uip_ip_hdrlen(ip) - 8); /* icmp header len = 8 */
+}
+
+u16 uip_csum_udp(struct uip_udp *udp)
+{
+       struct uip_pseudo_hdr hdr;
+       struct uip_ip *ip;
+       int udp_len;
+       u8 *pad;
+
+       ip        = &udp->ip;
+
+       hdr.sip   = ip->sip;
+       hdr.dip   = ip->dip;
+       hdr.zero  = 0;
+       hdr.proto = ip->proto;
+       hdr.len   = udp->len;
+
+       udp_len   = uip_udp_len(udp);
+
+       if (udp_len % 2) {
+               pad = (u8 *)&udp->sport + udp_len;
+               *pad = 0;
+               memcpy((u8 *)&udp->sport + udp_len + 1, &hdr, sizeof(hdr));
+               return uip_csum(0, (u8 *)&udp->sport, udp_len + 1 + sizeof(hdr));
+       } else {
+               memcpy((u8 *)&udp->sport + udp_len, &hdr, sizeof(hdr));
+               return uip_csum(0, (u8 *)&udp->sport, udp_len + sizeof(hdr));
+       }
+
+}
+
+u16 uip_csum_tcp(struct uip_tcp *tcp)
+{
+       struct uip_pseudo_hdr hdr;
+       struct uip_ip *ip;
+       u16 tcp_len;
+       u8 *pad;
+
+       ip        = &tcp->ip;
+       tcp_len   = ntohs(ip->len) - uip_ip_hdrlen(ip);
+
+       hdr.sip   = ip->sip;
+       hdr.dip   = ip->dip;
+       hdr.zero  = 0;
+       hdr.proto = ip->proto;
+       hdr.len   = htons(tcp_len);
+
+       if (tcp_len > UIP_MAX_TCP_PAYLOAD + 20)
+               pr_warning("tcp_len(%d) is too large", tcp_len);
+
+       if (tcp_len % 2) {
+               pad = (u8 *)&tcp->sport + tcp_len;
+               *pad = 0;
+               memcpy((u8 *)&tcp->sport + tcp_len + 1, &hdr, sizeof(hdr));
+               return uip_csum(0, (u8 *)&tcp->sport, tcp_len + 1 + sizeof(hdr));
+       } else {
+               memcpy((u8 *)&tcp->sport + tcp_len, &hdr, sizeof(hdr));
+               return uip_csum(0, (u8 *)&tcp->sport, tcp_len + sizeof(hdr));
+       }
+}
diff --git a/tools/kvm/net/uip/dhcp.c b/tools/kvm/net/uip/dhcp.c
new file mode 100644 (file)
index 0000000..b17d352
--- /dev/null
@@ -0,0 +1,202 @@
+#include "kvm/uip.h"
+
+#include <arpa/inet.h>
+
+#define EMPTY_ADDR "0.0.0.0"
+
+static inline bool uip_dhcp_is_discovery(struct uip_dhcp *dhcp)
+{
+       return (dhcp->option[2] == UIP_DHCP_DISCOVER &&
+               dhcp->option[1] == UIP_DHCP_TAG_MSG_TYPE_LEN &&
+               dhcp->option[0] == UIP_DHCP_TAG_MSG_TYPE);
+}
+
+static inline bool uip_dhcp_is_request(struct uip_dhcp *dhcp)
+{
+       return (dhcp->option[2] == UIP_DHCP_REQUEST &&
+               dhcp->option[1] == UIP_DHCP_TAG_MSG_TYPE_LEN &&
+               dhcp->option[0] == UIP_DHCP_TAG_MSG_TYPE);
+}
+
+bool uip_udp_is_dhcp(struct uip_udp *udp)
+{
+       struct uip_dhcp *dhcp;
+
+       if (ntohs(udp->sport) != UIP_DHCP_PORT_CLIENT ||
+           ntohs(udp->dport) != UIP_DHCP_PORT_SERVER)
+               return false;
+
+       dhcp = (struct uip_dhcp *)udp;
+
+       if (ntohl(dhcp->magic_cookie) != UIP_DHCP_MAGIC_COOKIE)
+               return false;
+
+       return true;
+}
+
+int uip_dhcp_get_dns(struct uip_info *info)
+{
+       char key[256], val[256];
+       struct in_addr addr;
+       int ret = -1;
+       int n = 0;
+       FILE *fp;
+       u32 ip;
+
+       fp = fopen("/etc/resolv.conf", "r");
+       if (!fp)
+               return ret;
+
+       while (!feof(fp)) {
+               if (fscanf(fp, "%s %s\n", key, val) != 2)
+                       continue;
+               if (strncmp("domain", key, 6) == 0)
+                       info->domain_name = strndup(val, UIP_DHCP_MAX_DOMAIN_NAME_LEN);
+               else if (strncmp("nameserver", key, 10) == 0) {
+                       if (!inet_aton(val, &addr))
+                               continue;
+                       ip = ntohl(addr.s_addr);
+                       if (n < UIP_DHCP_MAX_DNS_SERVER_NR)
+                               info->dns_ip[n++] = ip;
+                       ret = 0;
+               }
+       }
+
+       fclose(fp);
+       return ret;
+}
+
+static int uip_dhcp_fill_option_name_and_server(struct uip_info *info, u8 *opt, int i)
+{
+       u8 domain_name_len;
+       u32 *addr;
+       int n;
+
+       if (info->domain_name) {
+               domain_name_len = strlen(info->domain_name);
+               opt[i++]        = UIP_DHCP_TAG_DOMAIN_NAME;
+               opt[i++]        = domain_name_len;
+               memcpy(&opt[i], info->domain_name, domain_name_len);
+               i               += domain_name_len;
+       }
+
+       for (n = 0; n < UIP_DHCP_MAX_DNS_SERVER_NR; n++) {
+               if (info->dns_ip[n] == 0)
+                       continue;
+               opt[i++]        = UIP_DHCP_TAG_DNS_SERVER;
+               opt[i++]        = UIP_DHCP_TAG_DNS_SERVER_LEN;
+               addr            = (u32 *)&opt[i];
+               *addr           = htonl(info->dns_ip[n]);
+               i               += UIP_DHCP_TAG_DNS_SERVER_LEN;
+       }
+
+       return i;
+}
+
+static int uip_dhcp_fill_option(struct uip_info *info, struct uip_dhcp *dhcp, int reply_msg_type)
+{
+       int i = 0;
+       u32 *addr;
+       u8 *opt;
+
+       opt             = dhcp->option;
+
+       opt[i++]        = UIP_DHCP_TAG_MSG_TYPE;
+       opt[i++]        = UIP_DHCP_TAG_MSG_TYPE_LEN;
+       opt[i++]        = reply_msg_type;
+
+       opt[i++]        = UIP_DHCP_TAG_SERVER_ID;
+       opt[i++]        = UIP_DHCP_TAG_SERVER_ID_LEN;
+       addr            = (u32 *)&opt[i];
+       *addr           = htonl(info->host_ip);
+       i               += UIP_DHCP_TAG_SERVER_ID_LEN;
+
+       opt[i++]        = UIP_DHCP_TAG_LEASE_TIME;
+       opt[i++]        = UIP_DHCP_TAG_LEASE_TIME_LEN;
+       addr            = (u32 *)&opt[i];
+       *addr           = htonl(UIP_DHCP_LEASE_TIME);
+       i               += UIP_DHCP_TAG_LEASE_TIME_LEN;
+
+       opt[i++]        = UIP_DHCP_TAG_SUBMASK;
+       opt[i++]        = UIP_DHCP_TAG_SUBMASK_LEN;
+       addr            = (u32 *)&opt[i];
+       *addr           = htonl(info->guest_netmask);
+       i               += UIP_DHCP_TAG_SUBMASK_LEN;
+
+       opt[i++]        = UIP_DHCP_TAG_ROUTER;
+       opt[i++]        = UIP_DHCP_TAG_ROUTER_LEN;
+       addr            = (u32 *)&opt[i];
+       *addr           = htonl(info->host_ip);
+       i               += UIP_DHCP_TAG_ROUTER_LEN;
+
+       opt[i++]        = UIP_DHCP_TAG_ROOT;
+       opt[i++]        = strlen(EMPTY_ADDR);
+       addr            = (u32 *)&opt[i];
+       strncpy((void *) addr, EMPTY_ADDR, strlen(EMPTY_ADDR));
+       i               += strlen(EMPTY_ADDR);
+
+       i               = uip_dhcp_fill_option_name_and_server(info, opt, i);
+
+       opt[i++]        = UIP_DHCP_TAG_END;
+
+       return 0;
+}
+
+static int uip_dhcp_make_pkg(struct uip_info *info, struct uip_udp_socket *sk, struct uip_buf *buf, u8 reply_msg_type)
+{
+       struct uip_dhcp *dhcp;
+
+       dhcp            = (struct uip_dhcp *)buf->eth;
+
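+       /* BOOTP 'op' field: 2 means BOOTREPLY */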
+       dhcp->msg_type  = 2;
+       dhcp->client_ip = 0;
+       dhcp->your_ip   = htonl(info->guest_ip);
+       dhcp->server_ip = htonl(info->host_ip);
+       dhcp->agent_ip  = 0;
+
+       uip_dhcp_fill_option(info, dhcp, reply_msg_type);
+
+       sk->sip         = htonl(info->guest_ip);
+       sk->dip         = htonl(info->host_ip);
+       sk->sport       = htons(UIP_DHCP_PORT_CLIENT);
+       sk->dport       = htons(UIP_DHCP_PORT_SERVER);
+
+       return 0;
+}
+
+int uip_tx_do_ipv4_udp_dhcp(struct uip_tx_arg *arg)
+{
+       struct uip_udp_socket sk;
+       struct uip_dhcp *dhcp;
+       struct uip_info *info;
+       struct uip_buf *buf;
+       u8 reply_msg_type;
+
+       dhcp = (struct uip_dhcp *)arg->eth;
+
+       if (uip_dhcp_is_discovery(dhcp))
+               reply_msg_type = UIP_DHCP_OFFER;
+       else if (uip_dhcp_is_request(dhcp))
+               reply_msg_type = UIP_DHCP_ACK;
+       else
+               return -1;
+
+       buf = uip_buf_clone(arg);
+       info = arg->info;
+
+       /*
+        * Cook DHCP pkg
+        */
+       uip_dhcp_make_pkg(info, &sk, buf, reply_msg_type);
+
+       /*
+        * Cook UDP pkg
+        */
+       uip_udp_make_pkg(info, &sk, buf, NULL, UIP_DHCP_MAX_PAYLOAD_LEN);
+
+       /*
+        * Send data received from socket to guest
+        */
+       uip_buf_set_used(info, buf);
+
+       return 0;
+}
diff --git a/tools/kvm/net/uip/icmp.c b/tools/kvm/net/uip/icmp.c
new file mode 100644 (file)
index 0000000..233297c
--- /dev/null
@@ -0,0 +1,29 @@
+#include "kvm/uip.h"
+
+int uip_tx_do_ipv4_icmp(struct uip_tx_arg *arg)
+{
+       struct uip_ip *ip, *ip2;
+       struct uip_icmp *icmp2;
+       struct uip_buf *buf;
+
+       buf             = uip_buf_clone(arg);
+
+       icmp2           = (struct uip_icmp *)(buf->eth);
+       ip2             = (struct uip_ip *)(buf->eth);
+       ip              = (struct uip_ip *)(arg->eth);
+
+       ip2->sip        = ip->dip;
+       ip2->dip        = ip->sip;
+       ip2->csum       = 0;
+       /*
+        * ICMP echo reply is type 0
+        */
+       icmp2->type     = 0;
+       icmp2->csum     = 0;
+       ip2->csum       = uip_csum_ip(ip2);
+       icmp2->csum     = uip_csum_icmp(icmp2);
+
+       uip_buf_set_used(arg->info, buf);
+
+       return 0;
+}
diff --git a/tools/kvm/net/uip/ipv4.c b/tools/kvm/net/uip/ipv4.c
new file mode 100644 (file)
index 0000000..58373fd
--- /dev/null
@@ -0,0 +1,30 @@
+#include "kvm/uip.h"
+
+int uip_tx_do_ipv4(struct uip_tx_arg *arg)
+{
+       struct uip_ip *ip;
+
+       ip = (struct uip_ip *)(arg->eth);
+
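+       /* Only plain 20-byte IPv4 headers are supported (no IP options) */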
+       if (uip_ip_hdrlen(ip) != 20) {
+               pr_warning("IP header length is not 20 bytes");
+               return -1;
+       }
+
+       switch (ip->proto) {
+       case UIP_IP_P_ICMP:
+               uip_tx_do_ipv4_icmp(arg);
+               break;
+       case UIP_IP_P_TCP:
+               uip_tx_do_ipv4_tcp(arg);
+               break;
+       case UIP_IP_P_UDP:
+               uip_tx_do_ipv4_udp(arg);
+               break;
+       default:
+               break;
+       }
+
+       return 0;
+}
diff --git a/tools/kvm/net/uip/tcp.c b/tools/kvm/net/uip/tcp.c
new file mode 100644 (file)
index 0000000..711a716
--- /dev/null
@@ -0,0 +1,352 @@
+#include "kvm/uip.h"
+
+#include <linux/virtio_net.h>
+#include <linux/kernel.h>
+#include <linux/list.h>
+#include <arpa/inet.h>
+
+static int uip_tcp_socket_close(struct uip_tcp_socket *sk, int how)
+{
+       shutdown(sk->fd, how);
+
+       if (sk->write_done && sk->read_done) {
+               shutdown(sk->fd, SHUT_RDWR);
+               close(sk->fd);
+
+               mutex_lock(sk->lock);
+               list_del(&sk->list);
+               mutex_unlock(sk->lock);
+
+               free(sk);
+       }
+
+       return 0;
+}
+
+static struct uip_tcp_socket *uip_tcp_socket_find(struct uip_tx_arg *arg, u32 sip, u32 dip, u16 sport, u16 dport)
+{
+       struct list_head *sk_head;
+       pthread_mutex_t *sk_lock;
+       struct uip_tcp_socket *sk;
+
+       sk_head = &arg->info->tcp_socket_head;
+       sk_lock = &arg->info->tcp_socket_lock;
+
+       mutex_lock(sk_lock);
+       list_for_each_entry(sk, sk_head, list) {
+               if (sk->sip == sip && sk->dip == dip && sk->sport == sport && sk->dport == dport) {
+                       mutex_unlock(sk_lock);
+                       return sk;
+               }
+       }
+       mutex_unlock(sk_lock);
+
+       return NULL;
+}
+
+static struct uip_tcp_socket *uip_tcp_socket_alloc(struct uip_tx_arg *arg, u32 sip, u32 dip, u16 sport, u16 dport)
+{
+       struct list_head *sk_head;
+       struct uip_tcp_socket *sk;
+       pthread_mutex_t *sk_lock;
+       struct uip_tcp *tcp;
+       struct uip_ip *ip;
+       int ret;
+
+       tcp = (struct uip_tcp *)arg->eth;
+       ip = (struct uip_ip *)arg->eth;
+
+       sk_head = &arg->info->tcp_socket_head;
+       sk_lock = &arg->info->tcp_socket_lock;
+
+       sk = malloc(sizeof(*sk));
+       memset(sk, 0, sizeof(*sk));
+
+       sk->lock                        = sk_lock;
+       sk->info                        = arg->info;
+
+       sk->fd                          = socket(AF_INET, SOCK_STREAM, 0);
+       sk->addr.sin_family             = AF_INET;
+       sk->addr.sin_port               = dport;
+       sk->addr.sin_addr.s_addr        = dip;
+
+       pthread_cond_init(&sk->cond, NULL);
+
+       if (ntohl(dip) == arg->info->host_ip)
+               sk->addr.sin_addr.s_addr = inet_addr("127.0.0.1");
+
+       ret = connect(sk->fd, (struct sockaddr *)&sk->addr, sizeof(sk->addr));
+       if (ret) {
+               free(sk);
+               return NULL;
+       }
+
+       sk->sip         = ip->sip;
+       sk->dip         = ip->dip;
+       sk->sport       = tcp->sport;
+       sk->dport       = tcp->dport;
+
+       mutex_lock(sk_lock);
+       list_add_tail(&sk->list, sk_head);
+       mutex_unlock(sk_lock);
+
+       return sk;
+}
+
+static int uip_tcp_payload_send(struct uip_tcp_socket *sk, u8 flag, u16 payload_len)
+{
+       struct uip_info *info;
+       struct uip_eth *eth2;
+       struct uip_tcp *tcp2;
+       struct uip_buf *buf;
+       struct uip_ip *ip2;
+
+       info            = sk->info;
+
+       /*
+        * Get free buffer to send data to guest
+        */
+       buf             = uip_buf_get_free(info);
+
+       /*
+        * Cook an Ethernet frame
+        */
+       tcp2            = (struct uip_tcp *)buf->eth;
+       eth2            = (struct uip_eth *)buf->eth;
+       ip2             = (struct uip_ip *)buf->eth;
+
+       eth2->src       = info->host_mac;
+       eth2->dst       = info->guest_mac;
+       eth2->type      = htons(UIP_ETH_P_IP);
+
+       ip2->vhl        = UIP_IP_VER_4 | UIP_IP_HDR_LEN;
+       ip2->tos        = 0;
+       ip2->id         = 0;
+       ip2->flgfrag    = 0;
+       ip2->ttl        = UIP_IP_TTL;
+       ip2->proto      = UIP_IP_P_TCP;
+       ip2->csum       = 0;
+       ip2->sip        = sk->dip;
+       ip2->dip        = sk->sip;
+
+       tcp2->sport     = sk->dport;
+       tcp2->dport     = sk->sport;
+       tcp2->seq       = htonl(sk->seq_server);
+       tcp2->ack       = htonl(sk->ack_server);
+       /*
+        * Disable TCP options; the TCP header length is then 20 bytes
+        */
+       tcp2->off       = UIP_TCP_HDR_LEN;
+       tcp2->flg       = flag;
+       tcp2->win       = htons(UIP_TCP_WIN_SIZE);
+       tcp2->csum      = 0;
+       tcp2->urgent    = 0;
+
+       if (payload_len > 0)
+               memcpy(uip_tcp_payload(tcp2), sk->payload, payload_len);
+
+       ip2->len        = htons(uip_tcp_hdrlen(tcp2) + payload_len + uip_ip_hdrlen(ip2));
+       ip2->csum       = uip_csum_ip(ip2);
+       tcp2->csum      = uip_csum_tcp(tcp2);
+
+       /*
+        * virtio_net_hdr
+        */
+       buf->vnet_len   = sizeof(struct virtio_net_hdr);
+       memset(buf->vnet, 0, buf->vnet_len);
+
+       buf->eth_len    = ntohs(ip2->len) + uip_eth_hdrlen(&ip2->eth);
+
+       /*
+        * Increase server seq
+        */
+       sk->seq_server  += payload_len;
+
+       /*
+        * Send data received from socket to guest
+        */
+       uip_buf_set_used(info, buf);
+
+       return 0;
+}
+
+static void *uip_tcp_socket_thread(void *p)
+{
+       struct uip_tcp_socket *sk;
+       int len, left, ret;
+       u8 *payload, *pos;
+
+       sk = p;
+
+       payload = malloc(UIP_MAX_TCP_PAYLOAD);
+       if (!payload)
+               goto out;
+
+       while (1) {
+               pos = payload;
+
+               ret = read(sk->fd, payload, UIP_MAX_TCP_PAYLOAD);
+
+               if (ret <= 0 || ret > UIP_MAX_TCP_PAYLOAD)
+                       goto out;
+
+               left = ret;
+
+               while (left > 0) {
+                       mutex_lock(sk->lock);
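+                       /*
+                        * Wait until the guest's receive window has room:
+                        * seq_server - guest_acked is the data in flight, so
+                        * the advertised window minus that is what we may send.
+                        */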
+                       while ((len = sk->guest_acked + sk->window_size - sk->seq_server) <= 0)
+                               pthread_cond_wait(&sk->cond, sk->lock);
+                       mutex_unlock(sk->lock);
+
+                       sk->payload = pos;
+                       if (len > left)
+                               len = left;
+                       if (len > UIP_MAX_TCP_PAYLOAD)
+                               len = UIP_MAX_TCP_PAYLOAD;
+                       left -= len;
+                       pos += len;
+
+                       uip_tcp_payload_send(sk, UIP_TCP_FLAG_ACK, len);
+               }
+       }
+
+out:
+       /*
+        * Close server to guest TCP connection
+        */
+       uip_tcp_socket_close(sk, SHUT_RD);
+
+       uip_tcp_payload_send(sk, UIP_TCP_FLAG_FIN | UIP_TCP_FLAG_ACK, 0);
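+       /* The FIN we just sent consumes one sequence number */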
+       sk->seq_server += 1;
+
+       sk->read_done = 1;
+
+       free(payload);
+       pthread_exit(NULL);
+
+       return NULL;
+}
+
+static int uip_tcp_socket_receive(struct uip_tcp_socket *sk)
+{
+       if (sk->thread == 0)
+               return pthread_create(&sk->thread, NULL, uip_tcp_socket_thread, (void *)sk);
+
+       return 0;
+}
+
+static int uip_tcp_socket_send(struct uip_tcp_socket *sk, struct uip_tcp *tcp)
+{
+       int len;
+       int ret;
+       u8 *payload;
+
+       if (sk->write_done)
+               return 0;
+
+       payload = uip_tcp_payload(tcp);
+       len = uip_tcp_payloadlen(tcp);
+
+       ret = write(sk->fd, payload, len);
+       if (ret != len)
+               pr_warning("tcp send error");
+
+       return ret;
+}
+
+int uip_tx_do_ipv4_tcp(struct uip_tx_arg *arg)
+{
+       struct uip_tcp_socket *sk;
+       struct uip_tcp *tcp;
+       struct uip_ip *ip;
+       int ret;
+
+       tcp = (struct uip_tcp *)arg->eth;
+       ip = (struct uip_ip *)arg->eth;
+
+       /*
+        * Guest is trying to start a TCP session, let's fake SYN-ACK to guest
+        */
+       if (uip_tcp_is_syn(tcp)) {
+               sk = uip_tcp_socket_alloc(arg, ip->sip, ip->dip, tcp->sport, tcp->dport);
+               if (!sk)
+                       return -1;
+
+               sk->window_size = ntohs(tcp->win);
+
+               /*
+                * Setup ISN number
+                */
+               sk->isn_guest  = uip_tcp_isn(tcp);
+               sk->isn_server = uip_tcp_isn_alloc();
+
+               sk->seq_server = sk->isn_server;
+               sk->ack_server = sk->isn_guest + 1;
+               uip_tcp_payload_send(sk, UIP_TCP_FLAG_SYN | UIP_TCP_FLAG_ACK, 0);
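+               /* The SYN-ACK we just sent consumes one sequence number */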
+               sk->seq_server += 1;
+
+               /*
+                * Start receive thread for data from remote to guest
+                */
+               uip_tcp_socket_receive(sk);
+
+               goto out;
+       }
+
+       /*
+        * Find socket we have allocated
+        */
+       sk = uip_tcp_socket_find(arg, ip->sip, ip->dip, tcp->sport, tcp->dport);
+       if (!sk)
+               return -1;
+
+       mutex_lock(sk->lock);
+       sk->window_size = ntohs(tcp->win);
+       sk->guest_acked = ntohl(tcp->ack);
+       pthread_cond_signal(&sk->cond);
+       mutex_unlock(sk->lock);
+
+       if (uip_tcp_is_fin(tcp)) {
+               if (sk->write_done)
+                       goto out;
+
+               sk->write_done = 1;
+               sk->ack_server += 1;
+               uip_tcp_payload_send(sk, UIP_TCP_FLAG_ACK, 0);
+
+               /*
+                * Close guest to server TCP connection
+                */
+               uip_tcp_socket_close(sk, SHUT_WR);
+
+               goto out;
+       }
+
+       /*
+        * Ignore guest to server frames with zero TCP payload
+        */
+       if (uip_tcp_payloadlen(tcp) == 0)
+               goto out;
+
+       /*
+        * Send out TCP data to the remote host
+        */
+       ret = uip_tcp_socket_send(sk, tcp);
+       if (ret < 0)
+               return -1;
+       /*
+        * Send ACK to guest immediately
+        */
+       sk->ack_server += ret;
+       uip_tcp_payload_send(sk, UIP_TCP_FLAG_ACK, 0);
+
+out:
+       return 0;
+}
diff --git a/tools/kvm/net/uip/udp.c b/tools/kvm/net/uip/udp.c
new file mode 100644 (file)
index 0000000..d4518b2
--- /dev/null
@@ -0,0 +1,237 @@
+#include "kvm/uip.h"
+
+#include <linux/virtio_net.h>
+#include <linux/kernel.h>
+#include <linux/list.h>
+#include <sys/socket.h>
+#include <sys/epoll.h>
+#include <fcntl.h>
+
+#define UIP_UDP_MAX_EVENTS 1000
+
+static struct uip_udp_socket *uip_udp_socket_find(struct uip_tx_arg *arg, u32 sip, u32 dip, u16 sport, u16 dport)
+{
+       struct list_head *sk_head;
+       struct uip_udp_socket *sk;
+       pthread_mutex_t *sk_lock;
+       struct epoll_event ev;
+       int flags;
+       int ret;
+
+       sk_head = &arg->info->udp_socket_head;
+       sk_lock = &arg->info->udp_socket_lock;
+
+       /*
+        * Find existing sk
+        */
+       mutex_lock(sk_lock);
+       list_for_each_entry(sk, sk_head, list) {
+               if (sk->sip == sip && sk->dip == dip && sk->sport == sport && sk->dport == dport) {
+                       mutex_unlock(sk_lock);
+                       return sk;
+               }
+       }
+       mutex_unlock(sk_lock);
+
+       /*
+        * Allocate new one
+        */
+       sk = malloc(sizeof(*sk));
+       memset(sk, 0, sizeof(*sk));
+
+       sk->lock = sk_lock;
+
+       sk->fd = socket(AF_INET, SOCK_DGRAM, 0);
+       if (sk->fd < 0)
+               goto out;
+
+       /*
+        * Set non-blocking
+        */
+       flags = fcntl(sk->fd, F_GETFL, 0);
+       flags |= O_NONBLOCK;
+       fcntl(sk->fd, F_SETFL, flags);
+
+       /*
+        * Add sk->fd to epoll_wait
+        */
+       ev.events       = EPOLLIN;
+       ev.data.fd      = sk->fd;
+       ev.data.ptr     = sk;
+       if (arg->info->udp_epollfd <= 0)
+               arg->info->udp_epollfd = epoll_create(UIP_UDP_MAX_EVENTS);
+       ret = epoll_ctl(arg->info->udp_epollfd, EPOLL_CTL_ADD, sk->fd, &ev);
+       if (ret == -1)
+               pr_warning("epoll_ctl error");
+
+       sk->addr.sin_family      = AF_INET;
+       sk->addr.sin_addr.s_addr = dip;
+       sk->addr.sin_port        = dport;
+
+       sk->sip                  = sip;
+       sk->dip                  = dip;
+       sk->sport                = sport;
+       sk->dport                = dport;
+
+       mutex_lock(sk_lock);
+       list_add_tail(&sk->list, sk_head);
+       mutex_unlock(sk_lock);
+
+       return sk;
+
+out:
+       free(sk);
+       return NULL;
+}
+
+static int uip_udp_socket_send(struct uip_udp_socket *sk, struct uip_udp *udp)
+{
+       int len;
+       int ret;
+
+       len = ntohs(udp->len) - uip_udp_hdrlen(udp);
+
+       ret = sendto(sk->fd, udp->payload, len, 0, (struct sockaddr *)&sk->addr, sizeof(sk->addr));
+       if (ret != len)
+               return -1;
+
+       return 0;
+}
+
+int uip_udp_make_pkg(struct uip_info *info, struct uip_udp_socket *sk, struct uip_buf *buf, u8 *payload, int payload_len)
+{
+       struct uip_eth *eth2;
+       struct uip_udp *udp2;
+       struct uip_ip *ip2;
+
+       /*
+        * Cook an Ethernet frame
+        */
+       udp2            = (struct uip_udp *)(buf->eth);
+       eth2            = (struct uip_eth *)buf->eth;
+       ip2             = (struct uip_ip *)(buf->eth);
+
+       eth2->src       = info->host_mac;
+       eth2->dst       = info->guest_mac;
+       eth2->type      = htons(UIP_ETH_P_IP);
+
+       ip2->vhl        = UIP_IP_VER_4 | UIP_IP_HDR_LEN;
+       ip2->tos        = 0;
+       ip2->id         = 0;
+       ip2->flgfrag    = 0;
+       ip2->ttl        = UIP_IP_TTL;
+       ip2->proto      = UIP_IP_P_UDP;
+       ip2->csum       = 0;
+
+       ip2->sip        = sk->dip;
+       ip2->dip        = sk->sip;
+       udp2->sport     = sk->dport;
+       udp2->dport     = sk->sport;
+
+       udp2->len       = htons(payload_len + uip_udp_hdrlen(udp2));
+       udp2->csum      = 0;
+
+       if (payload)
+               memcpy(udp2->payload, payload, payload_len);
+
+       ip2->len        = udp2->len + htons(uip_ip_hdrlen(ip2));
+       ip2->csum       = uip_csum_ip(ip2);
+       udp2->csum      = uip_csum_udp(udp2);
+
+       /*
+        * virtio_net_hdr
+        */
+       buf->vnet_len   = sizeof(struct virtio_net_hdr);
+       memset(buf->vnet, 0, buf->vnet_len);
+
+       buf->eth_len    = ntohs(ip2->len) + uip_eth_hdrlen(&ip2->eth);
+
+       return 0;
+}
+
+static void *uip_udp_socket_thread(void *p)
+{
+       struct epoll_event events[UIP_UDP_MAX_EVENTS];
+       struct uip_udp_socket *sk;
+       struct uip_info *info;
+       struct uip_buf *buf;
+       int payload_len;
+       u8 *payload;
+       int nfds;
+       int i;
+
+       info = p;
+
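+       /* Busy-retry until the payload buffer can be allocated */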
+       do {
+               payload = malloc(UIP_MAX_UDP_PAYLOAD);
+       } while (!payload);
+
+       while (1) {
+               nfds = epoll_wait(info->udp_epollfd, events, UIP_UDP_MAX_EVENTS, -1);
+
+               if (nfds == -1)
+                       continue;
+
+               for (i = 0; i < nfds; i++) {
+
+                       sk = events[i].data.ptr;
+                       payload_len = recvfrom(sk->fd, payload, UIP_MAX_UDP_PAYLOAD, 0, NULL, NULL);
+                       if (payload_len < 0)
+                               continue;
+
+                       /*
+                        * Get free buffer to send data to guest
+                        */
+                       buf = uip_buf_get_free(info);
+
+                       uip_udp_make_pkg(info, sk, buf, payload, payload_len);
+
+                       /*
+                        * Send data received from socket to guest
+                        */
+                       uip_buf_set_used(info, buf);
+               }
+       }
+
+       free(payload);
+       pthread_exit(NULL);
+       return NULL;
+}
+
+int uip_tx_do_ipv4_udp(struct uip_tx_arg *arg)
+{
+       struct uip_udp_socket *sk;
+       struct uip_info *info;
+       struct uip_udp *udp;
+       struct uip_ip *ip;
+       int ret;
+
+       udp     = (struct uip_udp *)(arg->eth);
+       ip      = (struct uip_ip *)(arg->eth);
+       info    = arg->info;
+
+       if (uip_udp_is_dhcp(udp)) {
+               uip_tx_do_ipv4_udp_dhcp(arg);
+               return 0;
+       }
+
+       /*
+        * Find socket we have allocated before, otherwise allocate one
+        */
+       sk = uip_udp_socket_find(arg, ip->sip, ip->dip, udp->sport, udp->dport);
+       if (!sk)
+               return -1;
+
+       /*
+        * Send out UDP data to remote host
+        */
+       ret = uip_udp_socket_send(sk, udp);
+       if (ret)
+               return -1;
+
+       if (!info->udp_thread)
+               pthread_create(&info->udp_thread, NULL, uip_udp_socket_thread, (void *)info);
+
+       return 0;
+}
diff --git a/tools/kvm/pci.c b/tools/kvm/pci.c
new file mode 100644 (file)
index 0000000..38f4778
--- /dev/null
@@ -0,0 +1,223 @@
+#include "kvm/pci.h"
+#include "kvm/ioport.h"
+#include "kvm/util.h"
+#include "kvm/kvm.h"
+
+#include <linux/err.h>
+#include <assert.h>
+
+#define PCI_BAR_OFFSET(b)              (offsetof(struct pci_device_header, bar[b]))
+
+static struct pci_device_header                *pci_devices[PCI_MAX_DEVICES];
+
+static union pci_config_address                pci_config_address;
+
+/* This is within our PCI gap - in an unused area.
+ * Note this is a PCI *bus address*, used to assign BARs etc.!
+ * (That's why it can still be 32 bit even with 64 bit guests -- 64 bit
+ * PCI isn't currently supported.)
+ */
+static u32 io_space_blocks             = KVM_PCI_MMIO_AREA;
+
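+/*
+ * Simple bump allocator for PCI I/O space: hand out the next block and
+ * advance; blocks are never freed.
+ */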
+u32 pci_get_io_space_block(u32 size)
+{
+       u32 block = io_space_blocks;
+       io_space_blocks += size;
+
+       return block;
+}
+
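+/*
+ * Map an I/O port within the PCI_CONFIG_ADDRESS range to the matching
+ * byte of the latched config address register.
+ */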
+static void *pci_config_address_ptr(u16 port)
+{
+       unsigned long offset;
+       void *base;
+
+       offset  = port - PCI_CONFIG_ADDRESS;
+       base    = &pci_config_address;
+
+       return base + offset;
+}
+
+static bool pci_config_address_out(struct ioport *ioport, struct kvm *kvm, u16 port, void *data, int size)
+{
+       void *p = pci_config_address_ptr(port);
+
+       memcpy(p, data, size);
+
+       return true;
+}
+
+static bool pci_config_address_in(struct ioport *ioport, struct kvm *kvm, u16 port, void *data, int size)
+{
+       void *p = pci_config_address_ptr(port);
+
+       memcpy(data, p, size);
+
+       return true;
+}
+
+static struct ioport_operations pci_config_address_ops = {
+       .io_in  = pci_config_address_in,
+       .io_out = pci_config_address_out,
+};
+
+static bool pci_device_exists(u8 bus_number, u8 device_number, u8 function_number)
+{
+       struct pci_device_header *dev;
+
+       if (pci_config_address.bus_number != bus_number)
+               return false;
+
+       if (pci_config_address.function_number != function_number)
+               return false;
+
+       if (device_number >= PCI_MAX_DEVICES)
+               return false;
+
+       dev = pci_devices[device_number];
+
+       return dev != NULL;
+}
+
+static bool pci_config_data_out(struct ioport *ioport, struct kvm *kvm, u16 port, void *data, int size)
+{
+       /*
+        * Accesses to PCI configuration space offsets that are not 4-byte
+        * aligned arrive on the ports just above PCI_CONFIG_DATA; the port
+        * number encodes the offset within the doubleword.
+        */
+       pci_config_address.reg_offset = port - PCI_CONFIG_DATA;
+
+       pci__config_wr(kvm, pci_config_address, data, size);
+
+       return true;
+}
+
+static bool pci_config_data_in(struct ioport *ioport, struct kvm *kvm, u16 port, void *data, int size)
+{
+       /*
+        * Accesses to PCI configuration space offsets that are not 4-byte
+        * aligned arrive on the ports just above PCI_CONFIG_DATA; the port
+        * number encodes the offset within the doubleword.
+        */
+       pci_config_address.reg_offset = port - PCI_CONFIG_DATA;
+
+       pci__config_rd(kvm, pci_config_address, data, size);
+
+       return true;
+}
+
+static struct ioport_operations pci_config_data_ops = {
+       .io_in  = pci_config_data_in,
+       .io_out = pci_config_data_out,
+};
+
+void pci__config_wr(struct kvm *kvm, union pci_config_address addr, void *data, int size)
+{
+       u8 dev_num;
+
+       dev_num = addr.device_number;
+
+       if (pci_device_exists(0, dev_num, 0)) {
+               unsigned long offset;
+
+               offset = addr.w & 0xff;
+               if (offset < sizeof(struct pci_device_header)) {
+                       void *p = pci_devices[dev_num];
+                       u8 bar = (offset - PCI_BAR_OFFSET(0)) / (sizeof(u32));
+                       u32 sz = PCI_IO_SIZE;
+
+                       if (bar < 6 && pci_devices[dev_num]->bar_size[bar])
+                               sz = pci_devices[dev_num]->bar_size[bar];
+
+                       /*
+                        * When the kernel masks a BAR by writing all ones, it
+                        * expects to read back the BAR's size on the next
+                        * read; once it has the size, it writes the address
+                        * back.
+                        */
+                       if (*(u32 *)(p + offset)) {
+                               /* See if kernel tries to mask one of the BARs */
+                               if ((offset >= PCI_BAR_OFFSET(0)) &&
+                                   (offset <= PCI_BAR_OFFSET(6)) &&
+                                   (ioport__read32(data) == 0xFFFFFFFF))
+                                       memcpy(p + offset, &sz, sizeof(sz));
+                               else
+                                       memcpy(p + offset, data, size);
+                       }
+               }
+       }
+}
+
+void pci__config_rd(struct kvm *kvm, union pci_config_address addr, void *data, int size)
+{
+       u8 dev_num;
+
+       dev_num = addr.device_number;
+
+       if (pci_device_exists(0, dev_num, 0)) {
+               unsigned long offset;
+
+               offset = addr.w & 0xff;
+               if (offset < sizeof(struct pci_device_header)) {
+                       void *p = pci_devices[dev_num];
+
+                       memcpy(data, p + offset, size);
+               } else {
+                       memset(data, 0x00, size);
+               }
+       } else {
+               memset(data, 0xff, size);
+       }
+}
+
+int pci__register(struct pci_device_header *dev, u8 dev_num)
+{
+       if (dev_num >= PCI_MAX_DEVICES)
+               return -ENOSPC;
+
+       pci_devices[dev_num] = dev;
+
+       return 0;
+}
+
+struct pci_device_header *pci__find_dev(u8 dev_num)
+{
+       if (dev_num >= PCI_MAX_DEVICES)
+               return ERR_PTR(-EOVERFLOW);
+
+       return pci_devices[dev_num];
+}
+
+int pci__init(struct kvm *kvm)
+{
+       int r;
+
+       r = ioport__register(PCI_CONFIG_DATA + 0, &pci_config_data_ops, 4, NULL);
+       if (r < 0)
+               return r;
+
+       r = ioport__register(PCI_CONFIG_ADDRESS + 0, &pci_config_address_ops, 4, NULL);
+       if (r < 0) {
+               ioport__unregister(PCI_CONFIG_DATA);
+               return r;
+       }
+
+       return 0;
+}
+
+int pci__exit(struct kvm *kvm)
+{
+       ioport__unregister(PCI_CONFIG_DATA);
+       ioport__unregister(PCI_CONFIG_ADDRESS);
+
+       return 0;
+}
diff --git a/tools/kvm/powerpc/boot.c b/tools/kvm/powerpc/boot.c
new file mode 100644 (file)
index 0000000..2557fc0
--- /dev/null
@@ -0,0 +1,8 @@
+#include "kvm/kvm.h"
+
+#include <stdbool.h>
+
+bool kvm__load_firmware(struct kvm *kvm, const char *firmware_filename)
+{
+       return false;
+}
diff --git a/tools/kvm/powerpc/cpu_info.c b/tools/kvm/powerpc/cpu_info.c
new file mode 100644 (file)
index 0000000..1f440a5
--- /dev/null
@@ -0,0 +1,194 @@
+/*
+ * PPC CPU identification
+ *
+ * This is a very simple "host CPU info" struct to get us going.
+ * For the little host information we need, I don't want to grub about
+ * parsing stuff in /proc/device-tree so just match host PVR to differentiate
+ * PPC970 and POWER7 (which is all that's currently supported).
+ *
+ * Qemu does something similar but this is MUCH simpler!
+ *
+ * Copyright 2012 Matt Evans <matt@ozlabs.org>, IBM Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as published
+ * by the Free Software Foundation.
+ */
+
+#include <kvm/kvm.h>
+#include <sys/ioctl.h>
+
+#include "cpu_info.h"
+#include "kvm/util.h"
+
+/* POWER7 */
+
+static struct cpu_info cpu_power7_info = {
+       .name = "POWER7",
+       .tb_freq = 512000000,
+       .d_bsize = 128,
+       .i_bsize = 128,
+       .flags = CPUINFO_FLAG_DFP | CPUINFO_FLAG_VSX | CPUINFO_FLAG_VMX,
+       .mmu_info = {
+               .flags = KVM_PPC_PAGE_SIZES_REAL | KVM_PPC_1T_SEGMENTS,
+               .slb_size = 32,
+       },
+};
+
+/* PPC970/G5 */
+
+static struct cpu_info cpu_970_info = {
+       .name = "G5",
+       .tb_freq = 33333333,
+       .d_bsize = 128,
+       .i_bsize = 128,
+       .flags = CPUINFO_FLAG_VMX,
+};
+
+/* This is a default catchall for 'no match' on PVR: */
+static struct cpu_info cpu_dummy_info = { .name = "unknown" };
+
+static struct pvr_info host_pvr_info[] = {
+       { 0xffffffff, 0x0f000003, &cpu_power7_info },
+       { 0xffff0000, 0x003f0000, &cpu_power7_info },
+       { 0xffff0000, 0x004a0000, &cpu_power7_info },
+       { 0xffff0000, 0x00390000, &cpu_970_info },
+       { 0xffff0000, 0x003c0000, &cpu_970_info },
+       { 0xffff0000, 0x00440000, &cpu_970_info },
+       { 0xffff0000, 0x00450000, &cpu_970_info },
+};
+
+/* If we can't query the kernel for supported page sizes assume 4K and 16M */
+static struct kvm_ppc_one_seg_page_size fallback_sps[] = {
+       [0] = {
+               .page_shift = 12,
+               .slb_enc    = 0,
+               .enc =  {
+                       [0] = {
+                               .page_shift = 12,
+                               .pte_enc    = 0,
+                       },
+               },
+       },
+       [1] = {
+               .page_shift = 24,
+               .slb_enc    = 0x100,
+               .enc =  {
+                       [0] = {
+                               .page_shift = 24,
+                               .pte_enc    = 0,
+                       },
+               },
+       },
+};
+
+static void setup_mmu_info(struct kvm *kvm, struct cpu_info *cpu_info)
+{
+       static struct kvm_ppc_smmu_info *mmu_info;
+       struct kvm_ppc_one_seg_page_size *sps;
+       int i, j, k, valid;
+
+       if (!kvm__supports_extension(kvm, KVM_CAP_PPC_GET_SMMU_INFO)) {
+               memcpy(&cpu_info->mmu_info.sps, fallback_sps, sizeof(fallback_sps));
+       } else if (ioctl(kvm->vm_fd, KVM_PPC_GET_SMMU_INFO, &cpu_info->mmu_info) < 0) {
+               die_perror("KVM_PPC_GET_SMMU_INFO failed");
+       }
+
+       mmu_info = &cpu_info->mmu_info;
+
+       if (!(mmu_info->flags & KVM_PPC_PAGE_SIZES_REAL))
+               /* Guest pages are not restricted by the backing page size */
+               return;
+
+       /* Filter based on backing page size */
+
+       for (i = 0; i < KVM_PPC_PAGE_SIZES_MAX_SZ; i++) {
+               sps = &mmu_info->sps[i];
+
+               if (!sps->page_shift)
+                       break;
+
+               if (kvm->ram_pagesize < (1ul << sps->page_shift)) {
+                       /* Mark the whole segment size invalid */
+                       sps->page_shift = 0;
+                       continue;
+               }
+
+               /* Check each page size for the segment */
+               for (j = 0, valid = 0; j < KVM_PPC_PAGE_SIZES_MAX_SZ; j++) {
+                       if (!sps->enc[j].page_shift)
+                               break;
+
+                       if (kvm->ram_pagesize < (1ul << sps->enc[j].page_shift))
+                               sps->enc[j].page_shift = 0;
+                       else
+                               valid++;
+               }
+
+               if (!valid) {
+                       /* Mark the whole segment size invalid */
+                       sps->page_shift = 0;
+                       continue;
+               }
+
+               /* Mark any trailing entries invalid if we broke out early */
+               for (k = j; k < KVM_PPC_PAGE_SIZES_MAX_SZ; k++)
+                       sps->enc[k].page_shift = 0;
+
+               /* Collapse holes */
+               for (j = 0; j < KVM_PPC_PAGE_SIZES_MAX_SZ; j++) {
+                       if (sps->enc[j].page_shift)
+                               continue;
+
+                       for (k = j + 1; k < KVM_PPC_PAGE_SIZES_MAX_SZ; k++) {
+                               if (sps->enc[k].page_shift) {
+                                       sps->enc[j] = sps->enc[k];
+                                       sps->enc[k].page_shift = 0;
+                                       break;
+                               }
+                       }
+               }
+       }
+
+       /* Mark any trailing entries invalid if we broke out early */
+       for (j = i; j < KVM_PPC_PAGE_SIZES_MAX_SZ; j++)
+               mmu_info->sps[j].page_shift = 0;
+
+       /* Collapse holes */
+       for (i = 0; i < KVM_PPC_PAGE_SIZES_MAX_SZ; i++) {
+               if (mmu_info->sps[i].page_shift)
+                       continue;
+
+               for (j = i + 1; j < KVM_PPC_PAGE_SIZES_MAX_SZ; j++) {
+                       if (mmu_info->sps[j].page_shift) {
+                               mmu_info->sps[i] = mmu_info->sps[j];
+                               mmu_info->sps[j].page_shift = 0;
+                               break;
+                       }
+               }
+       }
+}
+
+struct cpu_info *find_cpu_info(struct kvm *kvm)
+{
+       struct cpu_info *info;
+       unsigned int i;
+       u32 pvr = kvm->pvr;
+
+       for (info = NULL, i = 0; i < ARRAY_SIZE(host_pvr_info); i++) {
+               if ((pvr & host_pvr_info[i].pvr_mask) == host_pvr_info[i].pvr) {
+                       info = host_pvr_info[i].cpu_info;
+                       break;
+               }
+       }
+
+       /* Didn't find anything? Rut-ro. */
+       if (!info) {
+               pr_warning("Host CPU unsupported by kvmtool\n");
+               info = &cpu_dummy_info;
+       }
+
+       setup_mmu_info(kvm, info);
+
+       return info;
+}
diff --git a/tools/kvm/powerpc/cpu_info.h b/tools/kvm/powerpc/cpu_info.h
new file mode 100644 (file)
index 0000000..f61707a
--- /dev/null
@@ -0,0 +1,42 @@
+/*
+ * PPC CPU identification
+ *
+ * Copyright 2012 Matt Evans <matt@ozlabs.org>, IBM Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as published
+ * by the Free Software Foundation.
+ */
+
+#ifndef CPU_INFO_H
+#define CPU_INFO_H
+
+#include <kvm/kvm.h>
+
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/kvm.h>
+
+struct cpu_info {
+       const char      *name;
+       u32             tb_freq; /* timebase frequency */
+       u32             d_bsize; /* d-cache block size */
+       u32             i_bsize; /* i-cache block size */
+       u32             flags;
+       struct kvm_ppc_smmu_info mmu_info;
+};
+
+struct pvr_info {
+       u32             pvr_mask;
+       u32             pvr;
+       struct cpu_info *cpu_info;
+};
+
+/* Misc capabilities/CPU properties */
+#define CPUINFO_FLAG_DFP       0x00000001
+#define CPUINFO_FLAG_VMX       0x00000002
+#define CPUINFO_FLAG_VSX       0x00000004
+
+struct cpu_info *find_cpu_info(struct kvm *kvm);
+
+#endif
diff --git a/tools/kvm/powerpc/include/kvm/barrier.h b/tools/kvm/powerpc/include/kvm/barrier.h
new file mode 100644 (file)
index 0000000..dd5115a
--- /dev/null
@@ -0,0 +1,6 @@
+#ifndef _KVM_BARRIER_H_
+#define _KVM_BARRIER_H_
+
+#include <asm/barrier.h>
+
+#endif /* _KVM_BARRIER_H_ */
diff --git a/tools/kvm/powerpc/include/kvm/kvm-arch.h b/tools/kvm/powerpc/include/kvm/kvm-arch.h
new file mode 100644 (file)
index 0000000..316fe79
--- /dev/null
@@ -0,0 +1,90 @@
+/*
+ * PPC64 architecture-specific definitions
+ *
+ * Copyright 2011 Matt Evans <matt@ozlabs.org>, IBM Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as published
+ * by the Free Software Foundation.
+ */
+
+#ifndef KVM__KVM_ARCH_H
+#define KVM__KVM_ARCH_H
+
+#include <stdbool.h>
+#include <linux/types.h>
+#include <time.h>
+
+/*
+ * MMIO lives after RAM, but it'd be nice if it didn't constantly move.
+ * Choose a suitably high address, e.g. 63T...  This limits RAM size.
+ */
+#define PPC_MMIO_START                 0x3F0000000000UL
+#define PPC_MMIO_SIZE                  0x010000000000UL
+
+#define KERNEL_LOAD_ADDR               0x0000000000000000
+#define KERNEL_START_ADDR              0x0000000000000000
+#define KERNEL_SECONDARY_START_ADDR     0x0000000000000060
+#define INITRD_LOAD_ADDR               0x0000000002800000
+
+#define FDT_MAX_SIZE                   0x10000
+#define RTAS_MAX_SIZE                  0x10000
+
+#define TIMEBASE_FREQ                  512000000ULL
+
+#define KVM_MMIO_START                 PPC_MMIO_START
+
+/*
+ * This is the address that pci_get_io_space_block() starts allocating
+ * from.  Note that this is a PCI bus address.
+ */
+#define KVM_PCI_MMIO_AREA              0x1000000
+#define KVM_VIRTIO_MMIO_AREA           0x2000000
+
+struct spapr_phb;
+
+struct kvm {
+       int                     sys_fd;         /* For system ioctls(), i.e. /dev/kvm */
+       int                     vm_fd;          /* For VM ioctls() */
+       timer_t                 timerid;        /* Posix timer for interrupts */
+
+       int                     nrcpus;         /* Number of cpus to run */
+
+       u32                     mem_slots;      /* for KVM_SET_USER_MEMORY_REGION */
+
+       u64                     ram_size;
+       void                    *ram_start;
+       u64                     ram_pagesize;
+
+       u64                     sdr1;
+       u32                     pvr;
+
+       bool                    nmi_disabled;
+
+       bool                    single_step;
+
+       const char              *vmlinux;
+       struct disk_image       **disks;
+       int                     nr_disks;
+       unsigned long           rtas_gra;
+       unsigned long           rtas_size;
+       unsigned long           fdt_gra;
+       unsigned long           initrd_gra;
+       unsigned long           initrd_size;
+       char                    *name;
+       int                     vm_state;
+       struct icp_state        *icp;
+       struct spapr_phb        *phb;
+};
+
+/* Helper for the various bits of code that generate FDT nodes */
+#define _FDT(exp)                                                      \
+       do {                                                            \
+               int ret = (exp);                                        \
+               if (ret < 0) {                                          \
+                       die("Error creating device tree: %s: %s\n",     \
+                           #exp, fdt_strerror(ret));                   \
+               }                                                       \
+       } while (0)
+
+#endif /* KVM__KVM_ARCH_H */
diff --git a/tools/kvm/powerpc/include/kvm/kvm-cpu-arch.h b/tools/kvm/powerpc/include/kvm/kvm-cpu-arch.h
new file mode 100644 (file)
index 0000000..7520c04
--- /dev/null
@@ -0,0 +1,76 @@
+/*
+ * PPC64 cpu-specific definitions
+ *
+ * Copyright 2011 Matt Evans <matt@ozlabs.org>, IBM Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as published
+ * by the Free Software Foundation.
+ */
+
+#ifndef KVM__KVM_CPU_ARCH_H
+#define KVM__KVM_CPU_ARCH_H
+
+/* Architecture-specific kvm_cpu definitions. */
+
+#include <linux/kvm.h> /* for struct kvm_regs */
+#include <stdbool.h>
+#include <pthread.h>
+
+#define MSR_SF         (1UL<<63)
+#define MSR_HV         (1UL<<60)
+#define MSR_VEC                (1UL<<25)
+#define MSR_VSX                (1UL<<23)
+#define MSR_POW                (1UL<<18)
+#define MSR_EE         (1UL<<15)
+#define MSR_PR         (1UL<<14)
+#define MSR_FP         (1UL<<13)
+#define MSR_ME         (1UL<<12)
+#define MSR_FE0                (1UL<<11)
+#define MSR_SE         (1UL<<10)
+#define MSR_BE         (1UL<<9)
+#define MSR_FE1                (1UL<<8)
+#define MSR_IR         (1UL<<5)
+#define MSR_DR         (1UL<<4)
+#define MSR_PMM                (1UL<<2)
+#define MSR_RI         (1UL<<1)
+#define MSR_LE         (1UL<<0)
+
+#define POWER7_EXT_IRQ 0
+
+struct kvm;
+
+struct kvm_cpu {
+       pthread_t               thread;         /* VCPU thread */
+
+       unsigned long           cpu_id;
+
+       struct kvm              *kvm;           /* parent KVM */
+       int                     vcpu_fd;        /* For VCPU ioctls() */
+       struct kvm_run          *kvm_run;
+
+       struct kvm_regs         regs;
+       struct kvm_sregs        sregs;
+       struct kvm_fpu          fpu;
+
+       u8                      is_running;
+       u8                      paused;
+       u8                      needs_nmi;
+       /*
+        * Although PPC KVM doesn't yet support coalesced MMIO, generic code
+        * needs this in our kvm_cpu:
+        */
+       struct kvm_coalesced_mmio_ring  *ring;
+};
+
+void kvm_cpu__irq(struct kvm_cpu *vcpu, int pin, int level);
+
+/* This is never actually called on PPC. */
+static inline bool kvm_cpu__emulate_io(struct kvm *kvm, u16 port, void *data, int direction, int size, u32 count)
+{
+       return false;
+}
+
+bool kvm_cpu__emulate_mmio(struct kvm *kvm, u64 phys_addr, u8 *data, u32 len, u8 is_write);
+
+#endif /* KVM__KVM_CPU_ARCH_H */
diff --git a/tools/kvm/powerpc/ioport.c b/tools/kvm/powerpc/ioport.c
new file mode 100644 (file)
index 0000000..a8e4dc3
--- /dev/null
@@ -0,0 +1,18 @@
+/*
+ * PPC64 ioport platform setup.  There isn't any! :-)
+ *
+ * Copyright 2011 Matt Evans <matt@ozlabs.org>, IBM Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as published
+ * by the Free Software Foundation.
+ */
+
+#include "kvm/ioport.h"
+
+#include <stdlib.h>
+
+void ioport__setup_arch(void)
+{
+       /* PPC has no legacy ioports to set up */
+}
diff --git a/tools/kvm/powerpc/irq.c b/tools/kvm/powerpc/irq.c
new file mode 100644 (file)
index 0000000..7da4012
--- /dev/null
@@ -0,0 +1,75 @@
+/*
+ * PPC64 IRQ routines
+ *
+ * Copyright 2011 Matt Evans <matt@ozlabs.org>, IBM Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as published
+ * by the Free Software Foundation.
+ */
+
+#include "kvm/irq.h"
+#include "kvm/kvm.h"
+#include "kvm/util.h"
+
+#include <linux/types.h>
+#include <linux/rbtree.h>
+#include <linux/list.h>
+#include <linux/kvm.h>
+#include <sys/ioctl.h>
+
+#include <stddef.h>
+#include <stdlib.h>
+
+#include "kvm/pci.h"
+
+#include "xics.h"
+#include "spapr_pci.h"
+
+#define XICS_IRQS               1024
+
+/*
+ * FIXME: The code in this file assumes an SPAPR guest, using XICS.  Make
+ * generic & cope with multiple PPC platform types.
+ */
+
+static int pci_devs = 0;
+
+int irq__register_device(u32 dev, u8 *num, u8 *pin, u8 *line)
+{
+       if (pci_devs >= PCI_MAX_DEVICES)
+               die("Hit PCI device limit!\n");
+
+       *num = pci_devs++;
+
+       *pin = 1;
+       /*
+        * Have I said how nasty I find this?  Line should be dontcare... PHB
+        * should determine which CPU/XICS IRQ to fire.
+        */
+       *line = xics_alloc_irqnum();
+       return 0;
+}
+
+int irq__init(struct kvm *kvm)
+{
+       /*
+        * kvm->nr_cpus is now valid; for /now/, pass
+        * this to xics_system_init(), which assumes servers
+        * are numbered 0..nrcpus.  This may not really be true,
+        * but it is OK currently.
+        */
+       kvm->icp = xics_system_init(XICS_IRQS, kvm->nrcpus);
+       return 0;
+}
+
+int irq__exit(struct kvm *kvm)
+{
+       return 0;
+}
+
+int irq__add_msix_route(struct kvm *kvm, struct msi_msg *msg)
+{
+       die(__FUNCTION__);
+       return 0;
+}
diff --git a/tools/kvm/powerpc/kvm-cpu.c b/tools/kvm/powerpc/kvm-cpu.c
new file mode 100644 (file)
index 0000000..97fc759
--- /dev/null
@@ -0,0 +1,294 @@
+/*
+ * PPC64 processor support
+ *
+ * Copyright 2011 Matt Evans <matt@ozlabs.org>, IBM Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as published
+ * by the Free Software Foundation.
+ */
+
+#include "kvm/kvm-cpu.h"
+
+#include "kvm/symbol.h"
+#include "kvm/util.h"
+#include "kvm/kvm.h"
+
+#include "spapr.h"
+#include "spapr_pci.h"
+#include "xics.h"
+
+#include <sys/ioctl.h>
+#include <sys/mman.h>
+#include <signal.h>
+#include <stdlib.h>
+#include <string.h>
+#include <errno.h>
+#include <stdio.h>
+#include <assert.h>
+
+static int debug_fd;
+
+void kvm_cpu__set_debug_fd(int fd)
+{
+       debug_fd = fd;
+}
+
+int kvm_cpu__get_debug_fd(void)
+{
+       return debug_fd;
+}
+
+static struct kvm_cpu *kvm_cpu__new(struct kvm *kvm)
+{
+       struct kvm_cpu *vcpu;
+
+       vcpu            = calloc(1, sizeof *vcpu);
+       if (!vcpu)
+               return NULL;
+
+       vcpu->kvm       = kvm;
+
+       return vcpu;
+}
+
+void kvm_cpu__delete(struct kvm_cpu *vcpu)
+{
+       free(vcpu);
+}
+
+struct kvm_cpu *kvm_cpu__init(struct kvm *kvm, unsigned long cpu_id)
+{
+       struct kvm_cpu *vcpu;
+       int mmap_size;
+       struct kvm_enable_cap papr_cap = { .cap = KVM_CAP_PPC_PAPR };
+
+       vcpu            = kvm_cpu__new(kvm);
+       if (!vcpu)
+               return NULL;
+
+       vcpu->cpu_id    = cpu_id;
+
+       vcpu->vcpu_fd = ioctl(vcpu->kvm->vm_fd, KVM_CREATE_VCPU, cpu_id);
+       if (vcpu->vcpu_fd < 0)
+               die_perror("KVM_CREATE_VCPU ioctl");
+
+       mmap_size = ioctl(vcpu->kvm->sys_fd, KVM_GET_VCPU_MMAP_SIZE, 0);
+       if (mmap_size < 0)
+               die_perror("KVM_GET_VCPU_MMAP_SIZE ioctl");
+
+       vcpu->kvm_run = mmap(NULL, mmap_size, PROT_RW, MAP_SHARED, vcpu->vcpu_fd, 0);
+       if (vcpu->kvm_run == MAP_FAILED)
+               die("unable to mmap vcpu fd");
+
+       if (ioctl(vcpu->vcpu_fd, KVM_ENABLE_CAP, &papr_cap) < 0)
+               die("unable to enable PAPR capability");
+
+       /*
+        * We start all CPUs, directing non-primary threads into the kernel's
+        * secondary start point.  When we come to support SLOF, we will start
+        * only one and SLOF will RTAS call us to ask for others to be
+        * started.  (FIXME: make more generic & interface with whichever
+        * firmware a platform may be using.)
+        */
+       vcpu->is_running = true;
+
+       /* Register with IRQ controller (FIXME, assumes XICS) */
+       xics_cpu_register(vcpu);
+
+       return vcpu;
+}
+
+static void kvm_cpu__setup_fpu(struct kvm_cpu *vcpu)
+{
+       /* Don't have to do anything, there's no expected FPU state. */
+}
+
+static void kvm_cpu__setup_regs(struct kvm_cpu *vcpu)
+{
+       /*
+        * FIXME: This assumes PPC64 and Linux guest.  It doesn't use the
+        * OpenFirmware entry method, but instead the "embedded" entry which
+        * passes the FDT address directly.
+        */
+       struct kvm_regs *r = &vcpu->regs;
+
+       if (vcpu->cpu_id == 0) {
+               r->pc = KERNEL_START_ADDR;
+               r->gpr[3] = vcpu->kvm->fdt_gra;
+               r->gpr[5] = 0;
+       } else {
+               r->pc = KERNEL_SECONDARY_START_ADDR;
+               r->gpr[3] = vcpu->cpu_id;
+       }
+       r->msr = 0x8000000000001000UL; /* 64bit, non-HV, ME */
+
+       if (ioctl(vcpu->vcpu_fd, KVM_SET_REGS, &vcpu->regs) < 0)
+               die_perror("KVM_SET_REGS failed");
+}
+
+static void kvm_cpu__setup_sregs(struct kvm_cpu *vcpu)
+{
+       /*
+        * Some sregs setup to initialise SDR1/PVR/HIOR on PPC64 SPAPR
+        * platforms using PR KVM.  (Technically, this is all ignored on
+        * SPAPR HV KVM.)  Different setup is required for non-PV non-SPAPR
+        * platforms!  (FIXME.)
+        */
+       struct kvm_sregs sregs;
+       struct kvm_one_reg reg = {};
+       u64 value;
+
+       if (ioctl(vcpu->vcpu_fd, KVM_GET_SREGS, &sregs) < 0)
+               die("KVM_GET_SREGS failed");
+
+       sregs.u.s.sdr1 = vcpu->kvm->sdr1;
+       sregs.pvr = vcpu->kvm->pvr;
+
+       if (ioctl(vcpu->vcpu_fd, KVM_SET_SREGS, &sregs) < 0)
+               die("KVM_SET_SREGS failed");
+
+       reg.id = KVM_REG_PPC_HIOR;
+       value = 0;
+       reg.addr = (u64)&value;
+       if (ioctl(vcpu->vcpu_fd, KVM_SET_ONE_REG, &reg) < 0)
+               die("KVM_SET_ONE_REG failed");
+}
+
+/**
+ * kvm_cpu__reset_vcpu - reset virtual CPU to a known state
+ */
+void kvm_cpu__reset_vcpu(struct kvm_cpu *vcpu)
+{
+       kvm_cpu__setup_regs(vcpu);
+       kvm_cpu__setup_sregs(vcpu);
+       kvm_cpu__setup_fpu(vcpu);
+}
+
+/* kvm_cpu__irq - set KVM's IRQ flag on this vcpu */
+void kvm_cpu__irq(struct kvm_cpu *vcpu, int pin, int level)
+{
+       unsigned int virq = level ? KVM_INTERRUPT_SET_LEVEL : KVM_INTERRUPT_UNSET;
+
+       /* FIXME: POWER-specific */
+       if (pin != POWER7_EXT_IRQ)
+               return;
+       if (ioctl(vcpu->vcpu_fd, KVM_INTERRUPT, &virq) < 0)
+               pr_warning("Could not KVM_INTERRUPT.");
+}
+
+void kvm_cpu__arch_nmi(struct kvm_cpu *cpu)
+{
+}
+
+bool kvm_cpu__handle_exit(struct kvm_cpu *vcpu)
+{
+       bool ret = true;
+       struct kvm_run *run = vcpu->kvm_run;
+
+       switch (run->exit_reason) {
+       case KVM_EXIT_PAPR_HCALL:
+               run->papr_hcall.ret = spapr_hypercall(vcpu, run->papr_hcall.nr,
+                                                     (target_ulong*)run->papr_hcall.args);
+               break;
+       default:
+               ret = false;
+       }
+       return ret;
+}
+
+bool kvm_cpu__emulate_mmio(struct kvm *kvm, u64 phys_addr, u8 *data, u32 len, u8 is_write)
+{
+       /*
+        * FIXME: This function will need to be split in order to support
+        * various PowerPC platforms/PHB types, etc.  It currently assumes SPAPR
+        * PPC64 guest.
+        */
+       bool ret = false;
+
+       if ((phys_addr >= SPAPR_PCI_WIN_START) &&
+           (phys_addr < SPAPR_PCI_WIN_END)) {
+               ret = spapr_phb_mmio(kvm, phys_addr, data, len, is_write);
+       } else {
+               pr_warning("MMIO %s unknown address %llx (size %d)!\n",
+                          is_write ? "write to" : "read from",
+                          phys_addr, len);
+       }
+       return ret;
+}
+
+#define CONDSTR_BIT(m, b) (((m) & MSR_##b) ? #b" " : "")
+
+void kvm_cpu__show_registers(struct kvm_cpu *vcpu)
+{
+       struct kvm_regs regs;
+       struct kvm_sregs sregs;
+       int r;
+
+       if (ioctl(vcpu->vcpu_fd, KVM_GET_REGS, &regs) < 0)
+               die("KVM_GET_REGS failed");
+       if (ioctl(vcpu->vcpu_fd, KVM_GET_SREGS, &sregs) < 0)
+               die("KVM_GET_SREGS failed");
+
+       dprintf(debug_fd, "\n Registers:\n");
+       dprintf(debug_fd, " NIP:   %016llx  MSR:   %016llx "
+               "( %s%s%s%s%s%s%s%s%s%s%s%s)\n",
+               regs.pc, regs.msr,
+               CONDSTR_BIT(regs.msr, SF),
+               CONDSTR_BIT(regs.msr, HV), /* ! */
+               CONDSTR_BIT(regs.msr, VEC),
+               CONDSTR_BIT(regs.msr, VSX),
+               CONDSTR_BIT(regs.msr, EE),
+               CONDSTR_BIT(regs.msr, PR),
+               CONDSTR_BIT(regs.msr, FP),
+               CONDSTR_BIT(regs.msr, ME),
+               CONDSTR_BIT(regs.msr, IR),
+               CONDSTR_BIT(regs.msr, DR),
+               CONDSTR_BIT(regs.msr, RI),
+               CONDSTR_BIT(regs.msr, LE));
+       dprintf(debug_fd, " CTR:   %016llx  LR:    %016llx  CR:   %08llx\n",
+               regs.ctr, regs.lr, regs.cr);
+       dprintf(debug_fd, " SRR0:  %016llx  SRR1:  %016llx  XER:  %016llx\n",
+               regs.srr0, regs.srr1, regs.xer);
+       dprintf(debug_fd, " SPRG0: %016llx  SPRG1: %016llx\n",
+               regs.sprg0, regs.sprg1);
+       dprintf(debug_fd, " SPRG2: %016llx  SPRG3: %016llx\n",
+               regs.sprg2, regs.sprg3);
+       dprintf(debug_fd, " SPRG4: %016llx  SPRG5: %016llx\n",
+               regs.sprg4, regs.sprg5);
+       dprintf(debug_fd, " SPRG6: %016llx  SPRG7: %016llx\n",
+               regs.sprg6, regs.sprg7);
+       dprintf(debug_fd, " GPRs:\n ");
+       for (r = 0; r < 32; r++) {
+               dprintf(debug_fd, "%016llx  ", regs.gpr[r]);
+               if ((r & 3) == 3)
+                       dprintf(debug_fd, "\n ");
+       }
+       dprintf(debug_fd, "\n");
+
+       /* FIXME: Assumes SLB-based (book3s) guest */
+       for (r = 0; r < 32; r++) {
+               dprintf(debug_fd, " SLB%02d  %016llx %016llx\n", r,
+                       sregs.u.s.ppc64.slb[r].slbe,
+                       sregs.u.s.ppc64.slb[r].slbv);
+       }
+       dprintf(debug_fd, "----------\n");
+}
+
+void kvm_cpu__show_code(struct kvm_cpu *vcpu)
+{
+       if (ioctl(vcpu->vcpu_fd, KVM_GET_REGS, &vcpu->regs) < 0)
+               die("KVM_GET_REGS failed");
+
+       /* FIXME: Dump/disassemble some code...! */
+
+       dprintf(debug_fd, "\n Stack:\n");
+       dprintf(debug_fd,   " ------\n");
+       /* Only works in real mode: */
+       kvm__dump_mem(vcpu->kvm, vcpu->regs.gpr[1], 32);
+}
+
+void kvm_cpu__show_page_tables(struct kvm_cpu *vcpu)
+{
+       /* Does nothing yet */
+}
diff --git a/tools/kvm/powerpc/kvm.c b/tools/kvm/powerpc/kvm.c
new file mode 100644 (file)
index 0000000..83b8edd
--- /dev/null
@@ -0,0 +1,530 @@
+/*
+ * PPC64 (SPAPR) platform support
+ *
+ * Copyright 2011 Matt Evans <matt@ozlabs.org>, IBM Corporation.
+ *
+ * Portions of FDT setup borrowed from QEMU, copyright 2010 David Gibson, IBM
+ * Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as published
+ * by the Free Software Foundation.
+ */
+
+#include "kvm/kvm.h"
+#include "kvm/util.h"
+#include "libfdt.h"
+#include "cpu_info.h"
+
+#include "spapr.h"
+#include "spapr_hvcons.h"
+#include "spapr_pci.h"
+
+#include <linux/kvm.h>
+
+#include <sys/types.h>
+#include <sys/ioctl.h>
+#include <sys/mman.h>
+#include <stdbool.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+#include <stdio.h>
+#include <fcntl.h>
+#include <asm/unistd.h>
+#include <errno.h>
+
+#include <linux/byteorder.h>
+
+#define HPT_ORDER 24
+
+#define HUGETLBFS_PATH "/var/lib/hugetlbfs/global/pagesize-16MB/"
+
+#define PHANDLE_XICP           0x00001111
+
+static char kern_cmdline[2048];
+
+struct kvm_ext kvm_req_ext[] = {
+       { DEFINE_KVM_EXT(KVM_CAP_PPC_UNSET_IRQ) },
+       { DEFINE_KVM_EXT(KVM_CAP_PPC_IRQ_LEVEL) },
+       { 0, 0 }
+};
+
+static uint32_t mfpvr(void)
+{
+       uint32_t r;
+       asm volatile ("mfpvr %0" : "=r"(r));
+       return r;
+}
+
+bool kvm__arch_cpu_supports_vm(void)
+{
+       return true;
+}
+
+void kvm__init_ram(struct kvm *kvm)
+{
+       u64     phys_start, phys_size;
+       void    *host_mem;
+
+       phys_start = 0;
+       phys_size  = kvm->ram_size;
+       host_mem   = kvm->ram_start;
+
+       /*
+        * We put MMIO at PPC_MMIO_START, high up.  Make sure that this doesn't
+        * crash into the end of RAM -- on PPC64 at least, this is so high
+        * (63TB!) that this is unlikely.
+        */
+       if (phys_size >= PPC_MMIO_START)
+               die("Too much memory (%lld, what a nice problem): "
+                   "overlaps MMIO!\n",
+                   phys_size);
+
+       kvm__register_mem(kvm, phys_start, phys_size, host_mem);
+}
+
+void kvm__arch_set_cmdline(char *cmdline, bool video)
+{
+       /* We don't need anything unusual in here. */
+}
+
+/* Architecture-specific KVM init */
+void kvm__arch_init(struct kvm *kvm, const char *hugetlbfs_path, u64 ram_size)
+{
+       int cap_ppc_rma;
+       unsigned long hpt;
+
+       kvm->ram_size           = ram_size;
+
+       /* Map "default" hugetblfs path to the standard 16M mount point */
+       if (hugetlbfs_path && !strcmp(hugetlbfs_path, "default"))
+               hugetlbfs_path = HUGETLBFS_PATH;
+
+       kvm->ram_start = mmap_anon_or_hugetlbfs(kvm, hugetlbfs_path, kvm->ram_size);
+
+       if (kvm->ram_start == MAP_FAILED)
+               die("Couldn't map %lld bytes for RAM (%d)\n",
+                   kvm->ram_size, errno);
+
+       /* FDT goes at top of memory, RTAS just below */
+       kvm->fdt_gra = kvm->ram_size - FDT_MAX_SIZE;
+       /* FIXME: Not all PPC systems have RTAS */
+       kvm->rtas_gra = kvm->fdt_gra - RTAS_MAX_SIZE;
+       madvise(kvm->ram_start, kvm->ram_size, MADV_MERGEABLE);
+
+       /* FIXME:  SPAPR-PR specific; allocate a guest HPT. */
+       if (posix_memalign((void **)&hpt, (1<<HPT_ORDER), (1<<HPT_ORDER)))
+               die("Can't allocate %d bytes for HPT\n", (1<<HPT_ORDER));
+
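+       /*
+        * SDR1 = HTABORG | HTABSIZE (assuming the usual 64-bit layout):
+        * the HPT base rounded to a 256K boundary, with the low bits
+        * holding log2(HPT bytes) - 18, i.e. 6 for this 16MB table.
+        */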
+       kvm->sdr1 = ((hpt + 0x3ffffULL) & ~0x3ffffULL) | (HPT_ORDER-18);
+
+       kvm->pvr = mfpvr();
+
+       /* FIXME: This is book3s-specific */
+       cap_ppc_rma = ioctl(kvm->sys_fd, KVM_CHECK_EXTENSION, KVM_CAP_PPC_RMA);
+       if (cap_ppc_rma == 2)
+               die("Need contiguous RMA allocation on this hardware, "
+                   "which is not yet supported.");
+
+       /* Do these before FDT setup, IRQ setup, etc. */
+       /* FIXME: SPAPR-specific */
+       hypercall_init();
+       register_core_rtas();
+       /* Now that hypercalls are initialised, register a couple for the console: */
+       spapr_hvcons_init();
+       spapr_create_phb(kvm, "pci", SPAPR_PCI_BUID,
+                        SPAPR_PCI_MEM_WIN_ADDR,
+                        SPAPR_PCI_MEM_WIN_SIZE,
+                        SPAPR_PCI_IO_WIN_ADDR,
+                        SPAPR_PCI_IO_WIN_SIZE);
+}
+
+void kvm__arch_delete_ram(struct kvm *kvm)
+{
+       munmap(kvm->ram_start, kvm->ram_size);
+}
+
+void kvm__irq_trigger(struct kvm *kvm, int irq)
+{
+       kvm__irq_line(kvm, irq, 1);
+       kvm__irq_line(kvm, irq, 0);
+}
+
+void kvm__arch_periodic_poll(struct kvm *kvm)
+{
+       /* FIXME: Should register callbacks to platform-specific polls */
+       spapr_hvcons_poll(kvm);
+}
+
+int load_flat_binary(struct kvm *kvm, int fd_kernel, int fd_initrd, const char *kernel_cmdline)
+{
+       void *p;
+       void *k_start;
+       void *i_start;
+       int nr;
+
+       if (lseek(fd_kernel, 0, SEEK_SET) < 0)
+               die_perror("lseek");
+
+       p = k_start = guest_flat_to_host(kvm, KERNEL_LOAD_ADDR);
+
+       while ((nr = read(fd_kernel, p, 65536)) > 0)
+               p += nr;
+
+       pr_info("Loaded kernel to 0x%x (%ld bytes)", KERNEL_LOAD_ADDR, p-k_start);
+
+       if (fd_initrd != -1) {
+               if (lseek(fd_initrd, 0, SEEK_SET) < 0)
+                       die_perror("lseek");
+
+               if (p-k_start > INITRD_LOAD_ADDR)
+                       die("Kernel overlaps initrd!");
+
+               /*
+                * The initrd loads at a fixed address; the check above
+                * guards against the kernel overlapping it.
+                */
+               i_start = p = guest_flat_to_host(kvm, INITRD_LOAD_ADDR);
+
+               while (((nr = read(fd_initrd, p, 65536)) > 0) &&
+                      p < (kvm->ram_start + kvm->ram_size))
+                       p += nr;
+
+               if (p >= (kvm->ram_start + kvm->ram_size))
+                       die("initrd too big to fit in guest RAM.\n");
+
+               pr_info("Loaded initrd to 0x%x (%ld bytes)",
+                       INITRD_LOAD_ADDR, p-i_start);
+               kvm->initrd_gra = INITRD_LOAD_ADDR;
+               kvm->initrd_size = p-i_start;
+       } else {
+               kvm->initrd_size = 0;
+       }
+       strncpy(kern_cmdline, kernel_cmdline, 2048);
+       kern_cmdline[2047] = '\0';
+
+       return true;
+}
+
+bool load_bzimage(struct kvm *kvm, int fd_kernel,
+                 int fd_initrd, const char *kernel_cmdline, u16 vidmode)
+{
+       /* We don't support bzImages. */
+       return false;
+}
+
+struct fdt_prop {
+       void *value;
+       int size;
+};
+
+static void generate_segment_page_sizes(struct kvm_ppc_smmu_info *info, struct fdt_prop *prop)
+{
+       struct kvm_ppc_one_seg_page_size *sps;
+       int i, j, size;
+       u32 *p;
+
+       for (size = 0, i = 0; i < KVM_PPC_PAGE_SIZES_MAX_SZ; i++) {
+               sps = &info->sps[i];
+
+               if (sps->page_shift == 0)
+                       break;
+
+               /* page shift, slb enc & count */
+               size += 3;
+
+               for (j = 0; j < KVM_PPC_PAGE_SIZES_MAX_SZ; j++) {
+                       if (info->sps[i].enc[j].page_shift == 0)
+                               break;
+
+                       /* page shift & pte enc */
+                       size += 2;
+               }
+       }
+
+       if (!size) {
+               prop->value = NULL;
+               prop->size = 0;
+               return;
+       }
+
+       /* Convert size to bytes */
+       prop->size = size * sizeof(u32);
+
+       prop->value = malloc(prop->size);
+       if (!prop->value)
+               die_perror("malloc failed");
+
+       p = (u32 *)prop->value;
+       for (i = 0; i < KVM_PPC_PAGE_SIZES_MAX_SZ; i++) {
+               sps = &info->sps[i];
+
+               if (sps->page_shift == 0)
+                       break;
+
+               *p++ = sps->page_shift;
+               *p++ = sps->slb_enc;
+
+               for (j = 0; j < KVM_PPC_PAGE_SIZES_MAX_SZ; j++)
+                       if (!info->sps[i].enc[j].page_shift)
+                               break;
+
+               *p++ = j;       /* count of enc */
+
+               for (j = 0; j < KVM_PPC_PAGE_SIZES_MAX_SZ; j++) {
+                       if (!info->sps[i].enc[j].page_shift)
+                               break;
+
+                       *p++ = info->sps[i].enc[j].page_shift;
+                       *p++ = info->sps[i].enc[j].pte_enc;
+               }
+       }
+}
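+
+/*
+ * For illustration (hypothetical values, not read from any real CPU): a
+ * single 4K base page size whose only PTE encoding is also 4K would
+ * produce the cell stream
+ *
+ *     { 12, 0,    page shift 12 (4K), slb_enc 0
+ *       1,        one (page_shift, pte_enc) pair follows
+ *       12, 0 }   enc: page shift 12, pte_enc 0
+ */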
+
+#define SMT_THREADS 4
+
+/*
+ * Set up the FDT for the kernel: This function is currently fairly
+ * SPAPR-heavy; whilst most PPC targets will require CPU/memory nodes,
+ * SPAPR-specific pieces such as RTAS should eventually be split out.
+ */
+static void setup_fdt(struct kvm *kvm)
+{
+       uint64_t        mem_reg_property[] = { 0, cpu_to_be64(kvm->ram_size) };
+       int             smp_cpus = kvm->nrcpus;
+       uint32_t        int_server_ranges_prop[] = {0, cpu_to_be32(smp_cpus)};
+       char            hypertas_prop_kvm[] = "hcall-pft\0hcall-term\0"
+               "hcall-dabr\0hcall-interrupt\0hcall-tce\0hcall-vio\0"
+               "hcall-splpar\0hcall-bulk";
+       int             i, j;
+       char            cpu_name[30];
+       u8              staging_fdt[FDT_MAX_SIZE];
+       struct cpu_info *cpu_info = find_cpu_info(kvm);
+       struct fdt_prop segment_page_sizes;
+       u32 segment_sizes_1T[] = {0x1c, 0x28, 0xffffffff, 0xffffffff};
+
+       /* Generate an appropriate DT at kvm->fdt_gra */
+       void *fdt_dest = guest_flat_to_host(kvm, kvm->fdt_gra);
+       void *fdt = staging_fdt;
+
+       _FDT(fdt_create(fdt, FDT_MAX_SIZE));
+       _FDT(fdt_finish_reservemap(fdt));
+
+       _FDT(fdt_begin_node(fdt, ""));
+
+       _FDT(fdt_property_string(fdt, "device_type", "chrp"));
+       _FDT(fdt_property_string(fdt, "model", "IBM pSeries (kvmtool)"));
+       _FDT(fdt_property_cell(fdt, "#address-cells", 0x2));
+       _FDT(fdt_property_cell(fdt, "#size-cells", 0x2));
+
+       /* RTAS */
+       _FDT(fdt_begin_node(fdt, "rtas"));
+       /* This property is what the kernel checks to decide "we're an LPAR": */
+       _FDT(fdt_property(fdt, "ibm,hypertas-functions", hypertas_prop_kvm,
+                         sizeof(hypertas_prop_kvm)));
+       _FDT(fdt_property_cell(fdt, "linux,rtas-base", kvm->rtas_gra));
+       _FDT(fdt_property_cell(fdt, "linux,rtas-entry", kvm->rtas_gra));
+       _FDT(fdt_property_cell(fdt, "rtas-size", kvm->rtas_size));
+       /* Now add properties for all RTAS tokens: */
+       if (spapr_rtas_fdt_setup(kvm, fdt))
+               die("Couldn't create RTAS FDT properties\n");
+
+       _FDT(fdt_end_node(fdt));
+
+       /* /chosen */
+       _FDT(fdt_begin_node(fdt, "chosen"));
+       /* cmdline */
+       _FDT(fdt_property_string(fdt, "bootargs", kern_cmdline));
+       /* Initrd */
+       if (kvm->initrd_size != 0) {
+               uint32_t ird_st_prop = cpu_to_be32(kvm->initrd_gra);
+               uint32_t ird_end_prop = cpu_to_be32(kvm->initrd_gra +
+                                                   kvm->initrd_size);
+               _FDT(fdt_property(fdt, "linux,initrd-start",
+                                  &ird_st_prop, sizeof(ird_st_prop)));
+               _FDT(fdt_property(fdt, "linux,initrd-end",
+                                  &ird_end_prop, sizeof(ird_end_prop)));
+       }
+
+       /*
+        * stdout-path: This is assuming we're using the HV console.  Also, the
+        * address is hardwired until we do a VIO bus.
+        */
+       _FDT(fdt_property_string(fdt, "linux,stdout-path",
+                                "/vdevice/vty@30000000"));
+       _FDT(fdt_end_node(fdt));
+
+       /*
+        * Memory: We don't allocate a separate RMA yet.  If we ever need to
+        * (CAP_PPC_RMA == 2) then have one memory node for 0->RMAsize, and
+        * another RMAsize->endOfMem.
+        */
+       _FDT(fdt_begin_node(fdt, "memory@0"));
+       _FDT(fdt_property_string(fdt, "device_type", "memory"));
+       _FDT(fdt_property(fdt, "reg", mem_reg_property,
+                         sizeof(mem_reg_property)));
+       _FDT(fdt_end_node(fdt));
+
+       generate_segment_page_sizes(&cpu_info->mmu_info, &segment_page_sizes);
+
+       /* CPUs */
+       _FDT(fdt_begin_node(fdt, "cpus"));
+       _FDT(fdt_property_cell(fdt, "#address-cells", 0x1));
+       _FDT(fdt_property_cell(fdt, "#size-cells", 0x0));
+
+       for (i = 0; i < smp_cpus; i += SMT_THREADS) {
+               int32_t pft_size_prop[] = { 0, HPT_ORDER };
+               uint32_t servers_prop[SMT_THREADS];
+               uint32_t gservers_prop[SMT_THREADS * 2];
+               int threads = (smp_cpus - i) >= SMT_THREADS ? SMT_THREADS :
+                       smp_cpus - i;
+
+               sprintf(cpu_name, "PowerPC,%s@%d", cpu_info->name, i);
+               _FDT(fdt_begin_node(fdt, cpu_name));
+               sprintf(cpu_name, "PowerPC,%s", cpu_info->name);
+               _FDT(fdt_property_string(fdt, "name", cpu_name));
+               _FDT(fdt_property_string(fdt, "device_type", "cpu"));
+
+               _FDT(fdt_property_cell(fdt, "reg", i));
+               _FDT(fdt_property_cell(fdt, "cpu-version", kvm->pvr));
+
+               _FDT(fdt_property_cell(fdt, "dcache-block-size", cpu_info->d_bsize));
+               _FDT(fdt_property_cell(fdt, "icache-block-size", cpu_info->i_bsize));
+
+               _FDT(fdt_property_cell(fdt, "timebase-frequency", cpu_info->tb_freq));
+               /* Lies, but safeish lies! */
+               _FDT(fdt_property_cell(fdt, "clock-frequency", 0xddbab200));
+
+               if (cpu_info->mmu_info.slb_size)
+                       _FDT(fdt_property_cell(fdt, "ibm,slb-size", cpu_info->mmu_info.slb_size));
+
+               /*
+                * HPT size is hardwired; KVM currently fixes it at 16MB but the
+                * moment that changes we'll need to read it out of the kernel.
+                */
+               _FDT(fdt_property(fdt, "ibm,pft-size", pft_size_prop,
+                                 sizeof(pft_size_prop)));
+
+               _FDT(fdt_property_string(fdt, "status", "okay"));
+               _FDT(fdt_property(fdt, "64-bit", NULL, 0));
+               /* A server for each thread in this core */
+               for (j = 0; j < SMT_THREADS; j++) {
+                       servers_prop[j] = cpu_to_be32(i+j);
+                       /*
+                        * Hack borrowed from QEMU, direct the group queues back
+                        * to cpu 0:
+                        */
+                       gservers_prop[j*2] = cpu_to_be32(i+j);
+                       gservers_prop[j*2 + 1] = 0;
+               }
+               _FDT(fdt_property(fdt, "ibm,ppc-interrupt-server#s",
+                                  servers_prop, threads * sizeof(uint32_t)));
+               _FDT(fdt_property(fdt, "ibm,ppc-interrupt-gserver#s",
+                                 gservers_prop,
+                                 threads * 2 * sizeof(uint32_t)));
+
+               if (segment_page_sizes.value)
+                       _FDT(fdt_property(fdt, "ibm,segment-page-sizes",
+                                         segment_page_sizes.value,
+                                         segment_page_sizes.size));
+
+               if (cpu_info->mmu_info.flags & KVM_PPC_1T_SEGMENTS)
+                       _FDT(fdt_property(fdt, "ibm,processor-segment-sizes",
+                                         segment_sizes_1T, sizeof(segment_sizes_1T)));
+
+               /* VMX/VSX and DFP options: */
+               if (cpu_info->flags & CPUINFO_FLAG_VMX)
+                       _FDT(fdt_property_cell(fdt, "ibm,vmx",
+                                              (cpu_info->flags &
+                                               CPUINFO_FLAG_VSX) ? 2 : 1));
+               if (cpu_info->flags & CPUINFO_FLAG_DFP)
+                       _FDT(fdt_property_cell(fdt, "ibm,dfp", 0x1));
+               _FDT(fdt_end_node(fdt));
+       }
+       _FDT(fdt_end_node(fdt));
+
+       /* IRQ controller */
+       _FDT(fdt_begin_node(fdt, "interrupt-controller@0"));
+
+       _FDT(fdt_property_string(fdt, "device_type",
+                                "PowerPC-External-Interrupt-Presentation"));
+       _FDT(fdt_property_string(fdt, "compatible", "IBM,ppc-xicp"));
+       _FDT(fdt_property_cell(fdt, "reg", 0));
+       _FDT(fdt_property(fdt, "interrupt-controller", NULL, 0));
+       _FDT(fdt_property(fdt, "ibm,interrupt-server-ranges",
+                          int_server_ranges_prop,
+                          sizeof(int_server_ranges_prop)));
+       _FDT(fdt_property_cell(fdt, "#interrupt-cells", 2));
+       _FDT(fdt_property_cell(fdt, "linux,phandle", PHANDLE_XICP));
+       _FDT(fdt_property_cell(fdt, "phandle", PHANDLE_XICP));
+       _FDT(fdt_end_node(fdt));
+
+       /*
+        * VIO: See comment in linux,stdout-path; we don't yet represent a VIO
+        * bus/address allocation so addresses are hardwired here.
+        */
+       _FDT(fdt_begin_node(fdt, "vdevice"));
+       _FDT(fdt_property_cell(fdt, "#address-cells", 0x1));
+       _FDT(fdt_property_cell(fdt, "#size-cells", 0x0));
+       _FDT(fdt_property_string(fdt, "device_type", "vdevice"));
+       _FDT(fdt_property_string(fdt, "compatible", "IBM,vdevice"));
+       _FDT(fdt_begin_node(fdt, "vty@30000000"));
+       _FDT(fdt_property_string(fdt, "name", "vty"));
+       _FDT(fdt_property_string(fdt, "device_type", "serial"));
+       _FDT(fdt_property_string(fdt, "compatible", "hvterm1"));
+       _FDT(fdt_property_cell(fdt, "reg", 0x30000000));
+       _FDT(fdt_end_node(fdt));
+       _FDT(fdt_end_node(fdt));
+
+       /* Finalise: */
+       _FDT(fdt_end_node(fdt)); /* Root node */
+       _FDT(fdt_finish(fdt));
+
+       _FDT(fdt_open_into(fdt, fdt_dest, FDT_MAX_SIZE));
+
+       /* PCI */
+       if (spapr_populate_pci_devices(kvm, PHANDLE_XICP, fdt_dest))
+               die("Fail populating PCI device nodes");
+
+       _FDT(fdt_add_mem_rsv(fdt_dest, kvm->rtas_gra, kvm->rtas_size));
+       _FDT(fdt_pack(fdt_dest));
+
+       free(segment_page_sizes.value);
+}
+
+/**
+ * kvm__arch_setup_firmware - set up the RTAS stub and build the guest FDT
+ */
+int kvm__arch_setup_firmware(struct kvm *kvm)
+{
+       /*
+        * Set up the RTAS stub.  It is just a single hypercall:
+        *  0:   7c 64 1b 78     mr      r4,r3
+        *  4:   3c 60 00 00     lis     r3,0
+        *  8:   60 63 f0 00     ori     r3,r3,61440
+        *  c:   44 00 00 22     sc      1
+        * 10:   4e 80 00 20     blr
+        */
+       uint32_t *rtas = guest_flat_to_host(kvm, kvm->rtas_gra);
+
+       rtas[0] = 0x7c641b78;
+       rtas[1] = 0x3c600000;
+       rtas[2] = 0x6063f000;
+       rtas[3] = 0x44000022;
+       rtas[4] = 0x4e800020;
+       kvm->rtas_size = 20;
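+
+       /*
+        * Calling convention (for reference): the guest enters the stub
+        * with r3 pointing at its RTAS argument block; the stub moves that
+        * to r4, loads KVMPPC_H_RTAS (0xf000) into r3 and traps to the
+        * hypervisor with "sc 1", where h_rtas() picks the block up from
+        * args[0].
+        */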
+
+       pr_info("Set up %ld bytes of RTAS at 0x%lx\n",
+               kvm->rtas_size, kvm->rtas_gra);
+
+       /* FIXME: Load SLOF (no firmware image is loaded yet) */
+
+       /* Init FDT */
+       setup_fdt(kvm);
+
+       return 0;
+}
+
+int kvm__arch_free_firmware(struct kvm *kvm)
+{
+       return 0;
+}
diff --git a/tools/kvm/powerpc/spapr.h b/tools/kvm/powerpc/spapr.h
new file mode 100644 (file)
index 0000000..0537f88
--- /dev/null
@@ -0,0 +1,93 @@
+/*
+ * SPAPR definitions and declarations
+ *
+ * Borrowed heavily from QEMU's spapr.h,
+ * Copyright (c) 2010 David Gibson, IBM Corporation.
+ *
+ * Modifications by Matt Evans <matt@ozlabs.org>, IBM Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as published
+ * by the Free Software Foundation.
+ */
+
+#if !defined(__HW_SPAPR_H__)
+#define __HW_SPAPR_H__
+
+#include <inttypes.h>
+
+/* We need some of the H_ hcall defs, but they're __KERNEL__ only. */
+#define __KERNEL__
+#include <asm/hvcall.h>
+#undef __KERNEL__
+
+#include "kvm/kvm.h"
+#include "kvm/kvm-cpu.h"
+
+typedef unsigned long target_ulong;
+typedef uintptr_t target_phys_addr_t;
+
+/*
+ * The H_ hcalls included above (from asm/hvcall.h) are standardized in
+ * PAPR and implemented by pHyp as well.
+ *
+ * We also need some hcalls which are specific to qemu / KVM-on-POWER.
+ * So far we just need one for H_RTAS, but in future we'll need more
+ * for extensions like virtio.  We put those into the 0xf000-0xfffc
+ * range which is reserved by PAPR for "platform-specific" hcalls.
+ */
+#define KVMPPC_HCALL_BASE       0xf000
+#define KVMPPC_H_RTAS           (KVMPPC_HCALL_BASE + 0x0)
+#define KVMPPC_HCALL_MAX        KVMPPC_H_RTAS
+
+#define DEBUG_SPAPR_HCALLS
+
+#ifdef DEBUG_SPAPR_HCALLS
+#define hcall_dprintf(fmt, ...) \
+    do { fprintf(stderr, fmt, ## __VA_ARGS__); } while (0)
+#else
+#define hcall_dprintf(fmt, ...) \
+    do { } while (0)
+#endif
+
+typedef target_ulong (*spapr_hcall_fn)(struct kvm_cpu *vcpu,
+                                      target_ulong opcode,
+                                       target_ulong *args);
+
+void hypercall_init(void);
+void register_core_rtas(void);
+
+void spapr_register_hypercall(target_ulong opcode, spapr_hcall_fn fn);
+target_ulong spapr_hypercall(struct kvm_cpu *vcpu, target_ulong opcode,
+                             target_ulong *args);
+
+int spapr_rtas_fdt_setup(struct kvm *kvm, void *fdt);
+
+static inline uint32_t rtas_ld(struct kvm *kvm, target_ulong phys, int n)
+{
+       return *((uint32_t *)guest_flat_to_host(kvm, phys + 4*n));
+}
+
+static inline void rtas_st(struct kvm *kvm, target_ulong phys, int n, uint32_t val)
+{
+       *((uint32_t *)guest_flat_to_host(kvm, phys + 4*n)) = val;
+}
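+
+/*
+ * The RTAS argument block these accessors index is (per PAPR) a packed
+ * array of 32-bit cells:
+ *
+ *     cell 0: token    cell 1: nargs    cell 2: nret
+ *     cells 3 .. 3+nargs-1:  arguments
+ *     following cells:       return values (rets[0] is the status word)
+ */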
+
+typedef void (*spapr_rtas_fn)(struct kvm_cpu *vcpu, uint32_t token,
+                              uint32_t nargs, target_ulong args,
+                              uint32_t nret, target_ulong rets);
+void spapr_rtas_register(const char *name, spapr_rtas_fn fn);
+target_ulong spapr_rtas_call(struct kvm_cpu *vcpu,
+                             uint32_t token, uint32_t nargs, target_ulong args,
+                             uint32_t nret, target_ulong rets);
+
+#define SPAPR_PCI_BUID          0x800000020000001ULL
+#define SPAPR_PCI_MEM_WIN_ADDR  (KVM_MMIO_START + 0xA0000000)
+#define SPAPR_PCI_MEM_WIN_SIZE  0x20000000
+#define SPAPR_PCI_IO_WIN_ADDR   (SPAPR_PCI_MEM_WIN_ADDR + SPAPR_PCI_MEM_WIN_SIZE)
+#define SPAPR_PCI_IO_WIN_SIZE  0x2000000
+
+#define SPAPR_PCI_WIN_START    SPAPR_PCI_MEM_WIN_ADDR
+#define SPAPR_PCI_WIN_END      (SPAPR_PCI_IO_WIN_ADDR + SPAPR_PCI_IO_WIN_SIZE)
+
+#endif /* !defined (__HW_SPAPR_H__) */
diff --git a/tools/kvm/powerpc/spapr_hcall.c b/tools/kvm/powerpc/spapr_hcall.c
new file mode 100644 (file)
index 0000000..ff1d63a
--- /dev/null
@@ -0,0 +1,134 @@
+/*
+ * SPAPR hypercalls
+ *
+ * Borrowed heavily from QEMU's spapr_hcall.c,
+ * Copyright (c) 2010 David Gibson, IBM Corporation.
+ *
+ * Copyright (c) 2011 Matt Evans <matt@ozlabs.org>, IBM Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as published
+ * by the Free Software Foundation.
+ */
+
+#include "spapr.h"
+#include "kvm/util.h"
+#include "kvm/kvm.h"
+#include "kvm/kvm-cpu.h"
+
+#include <stdio.h>
+#include <assert.h>
+
+static spapr_hcall_fn papr_hypercall_table[(MAX_HCALL_OPCODE / 4) + 1];
+static spapr_hcall_fn kvmppc_hypercall_table[KVMPPC_HCALL_MAX -
+                                            KVMPPC_HCALL_BASE + 1];
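+
+/*
+ * PAPR hcall opcodes are multiples of 4, so the table is indexed by
+ * opcode / 4; the platform-specific KVMPPC_* range (0xf000 upwards) has
+ * its own table indexed from KVMPPC_HCALL_BASE.
+ */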
+
+static target_ulong h_set_dabr(struct kvm_cpu *vcpu, target_ulong opcode, target_ulong *args)
+{
+       /* FIXME: Implement this for -PR.  (-HV handles it in the kernel.) */
+       return H_HARDWARE;
+}
+
+static target_ulong h_rtas(struct kvm_cpu *vcpu, target_ulong opcode, target_ulong *args)
+{
+       target_ulong rtas_r3 = args[0];
+       /*
+        * Pointer read from phys mem; these ptrs cannot be MMIO (!) so just
+        * reference guest RAM directly.
+        */
+       uint32_t token, nargs, nret;
+
+       token = rtas_ld(vcpu->kvm, rtas_r3, 0);
+       nargs = rtas_ld(vcpu->kvm, rtas_r3, 1);
+       nret  = rtas_ld(vcpu->kvm, rtas_r3, 2);
+
+       return spapr_rtas_call(vcpu, token, nargs, rtas_r3 + 12,
+                              nret, rtas_r3 + 12 + 4*nargs);
+}
+
+static target_ulong h_logical_load(struct kvm_cpu *vcpu, target_ulong opcode, target_ulong *args)
+{
+       /* SLOF will require these, though the kernel doesn't. */
+       die(__PRETTY_FUNCTION__);
+       return H_PARAMETER;
+}
+
+static target_ulong h_logical_store(struct kvm_cpu *vcpu, target_ulong opcode, target_ulong *args)
+{
+       /* SLOF will require these, though the kernel doesn't. */
+       die(__PRETTY_FUNCTION__);
+       return H_PARAMETER;
+}
+
+static target_ulong h_logical_icbi(struct kvm_cpu *vcpu, target_ulong opcode, target_ulong *args)
+{
+       /* KVM traps this in the kernel; die if it slips through to us. */
+       die(__PRETTY_FUNCTION__);
+       return H_SUCCESS;
+}
+
+static target_ulong h_logical_dcbf(struct kvm_cpu *vcpu, target_ulong opcode, target_ulong *args)
+{
+       /* KVM traps this in the kernel; die if it slips through to us. */
+       die(__PRETTY_FUNCTION__);
+       return H_SUCCESS;
+}
+
+void spapr_register_hypercall(target_ulong opcode, spapr_hcall_fn fn)
+{
+       spapr_hcall_fn *slot;
+
+       if (opcode <= MAX_HCALL_OPCODE) {
+               assert((opcode & 0x3) == 0);
+
+               slot = &papr_hypercall_table[opcode / 4];
+       } else {
+               assert((opcode >= KVMPPC_HCALL_BASE) &&
+                      (opcode <= KVMPPC_HCALL_MAX));
+
+               slot = &kvmppc_hypercall_table[opcode - KVMPPC_HCALL_BASE];
+       }
+
+       assert(!(*slot) || (fn == *slot));
+       *slot = fn;
+}
+
+target_ulong spapr_hypercall(struct kvm_cpu *vcpu, target_ulong opcode,
+                            target_ulong *args)
+{
+       if ((opcode <= MAX_HCALL_OPCODE)
+           && ((opcode & 0x3) == 0)) {
+               spapr_hcall_fn fn = papr_hypercall_table[opcode / 4];
+
+               if (fn) {
+                       return fn(vcpu, opcode, args);
+               }
+       } else if ((opcode >= KVMPPC_HCALL_BASE) &&
+                  (opcode <= KVMPPC_HCALL_MAX)) {
+               spapr_hcall_fn fn = kvmppc_hypercall_table[opcode -
+                                                          KVMPPC_HCALL_BASE];
+
+               if (fn) {
+                       return fn(vcpu, opcode, args);
+               }
+       }
+
+       hcall_dprintf("Unimplemented hcall 0x%lx\n", opcode);
+       return H_FUNCTION;
+}
+
+void hypercall_init(void)
+{
+       /* hcall-dabr */
+       spapr_register_hypercall(H_SET_DABR, h_set_dabr);
+
+       spapr_register_hypercall(H_LOGICAL_CI_LOAD, h_logical_load);
+       spapr_register_hypercall(H_LOGICAL_CI_STORE, h_logical_store);
+       spapr_register_hypercall(H_LOGICAL_CACHE_LOAD, h_logical_load);
+       spapr_register_hypercall(H_LOGICAL_CACHE_STORE, h_logical_store);
+       spapr_register_hypercall(H_LOGICAL_ICBI, h_logical_icbi);
+       spapr_register_hypercall(H_LOGICAL_DCBF, h_logical_dcbf);
+
+       /* KVM-PPC specific hcalls */
+       spapr_register_hypercall(KVMPPC_H_RTAS, h_rtas);
+}
diff --git a/tools/kvm/powerpc/spapr_hvcons.c b/tools/kvm/powerpc/spapr_hvcons.c
new file mode 100644 (file)
index 0000000..511dbe1
--- /dev/null
@@ -0,0 +1,102 @@
+/*
+ * SPAPR HV console
+ *
+ * Borrowed lightly from QEMU's spapr_vty.c, Copyright (c) 2010 David Gibson,
+ * IBM Corporation.
+ *
+ * Copyright (c) 2011 Matt Evans <matt@ozlabs.org>, IBM Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as published
+ * by the Free Software Foundation.
+ */
+
+#include "kvm/term.h"
+#include "kvm/kvm.h"
+#include "kvm/kvm-cpu.h"
+#include "kvm/util.h"
+#include "spapr.h"
+#include "spapr_hvcons.h"
+
+#include <stdio.h>
+#include <sys/uio.h>
+#include <errno.h>
+
+#include <linux/byteorder.h>
+
+union hv_chario {
+       struct {
+               uint64_t char0_7;
+               uint64_t char8_15;
+       } a;
+       uint8_t buf[16];
+};
+
+static unsigned long h_put_term_char(struct kvm_cpu *vcpu, unsigned long opcode, unsigned long *args)
+{
+       /* TODO: Read the terminal number from args[0], and check it. */
+       unsigned long len = args[1];
+       union hv_chario data;
+       struct iovec iov;
+
+       if (len > 16) {
+               return H_PARAMETER;
+       }
+       data.a.char0_7 = cpu_to_be64(args[2]);
+       data.a.char8_15 = cpu_to_be64(args[3]);
+
+       iov.iov_base = data.buf;
+       iov.iov_len = len;
+       do {
+               int ret;
+
+               ret = term_putc_iov(CONSOLE_HV, &iov, 1, 0);
+               if (ret < 0) {
+                       die("term_putc_iov error %d!\n", errno);
+               }
+               iov.iov_base += ret;
+               iov.iov_len -= ret;
+       } while (iov.iov_len > 0);
+
+       return H_SUCCESS;
+}
+
+
+static unsigned long h_get_term_char(struct kvm_cpu *vcpu, unsigned long opcode, unsigned long *args)
+{
+       /* TODO: Read the terminal number from args[0], and check it. */
+       unsigned long *len = args + 0;
+       unsigned long *char0_7 = args + 1;
+       unsigned long *char8_15 = args + 2;
+       union hv_chario data;
+       struct iovec iov;
+
+       if (term_readable(CONSOLE_HV, 0)) {
+               iov.iov_base = data.buf;
+               iov.iov_len = 16;
+
+               *len = term_getc_iov(CONSOLE_HV, &iov, 1, 0);
+               *char0_7 = be64_to_cpu(data.a.char0_7);
+               *char8_15 = be64_to_cpu(data.a.char8_15);
+       } else {
+               *len = 0;
+       }
+
+       return H_SUCCESS;
+}
+
+void spapr_hvcons_poll(struct kvm *kvm)
+{
+       if (term_readable(CONSOLE_HV, 0)) {
+               /*
+                * We can inject an IRQ to guest here if we want.  The guest
+                * will happily poll, though, so not required.
+                */
+       }
+}
+
+void spapr_hvcons_init(void)
+{
+       spapr_register_hypercall(H_PUT_TERM_CHAR, h_put_term_char);
+       spapr_register_hypercall(H_GET_TERM_CHAR, h_get_term_char);
+}
diff --git a/tools/kvm/powerpc/spapr_hvcons.h b/tools/kvm/powerpc/spapr_hvcons.h
new file mode 100644 (file)
index 0000000..d3e4414
--- /dev/null
@@ -0,0 +1,19 @@
+/*
+ * SPAPR HV console
+ *
+ * Copyright (c) 2011 Matt Evans <matt@ozlabs.org>, IBM Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as published
+ * by the Free Software Foundation.
+ */
+
+#ifndef SPAPR_HVCONS_H
+#define SPAPR_HVCONS_H
+
+#include "kvm/kvm.h"
+
+void spapr_hvcons_init(void);
+void spapr_hvcons_poll(struct kvm *kvm);
+
+#endif
diff --git a/tools/kvm/powerpc/spapr_pci.c b/tools/kvm/powerpc/spapr_pci.c
new file mode 100644 (file)
index 0000000..f9d29f0
--- /dev/null
@@ -0,0 +1,423 @@
+/*
+ * SPAPR PHB emulation, RTAS interface to PCI config space, device tree nodes
+ * for enumerated devices.
+ *
+ * Borrowed heavily from QEMU's spapr_pci.c,
+ * Copyright (c) 2011 Alexey Kardashevskiy, IBM Corporation.
+ * Copyright (c) 2011 David Gibson, IBM Corporation.
+ *
+ * Modifications copyright 2011 Matt Evans <matt@ozlabs.org>, IBM Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as published
+ * by the Free Software Foundation.
+ */
+
+#include "spapr.h"
+#include "spapr_pci.h"
+#include "kvm/util.h"
+#include "kvm/pci.h"
+#include "libfdt.h"
+
+#include <linux/pci_regs.h>
+#include <linux/byteorder.h>
+
+
+/* #define DEBUG_PHB yes */
+#ifdef DEBUG_PHB
+#define phb_dprintf(fmt, ...)                                  \
+       do { fprintf(stderr, fmt, ## __VA_ARGS__); } while (0)
+#else
+#define phb_dprintf(fmt, ...)                  \
+       do { } while (0)
+#endif
+
+static const uint32_t bars[] = {
+       PCI_BASE_ADDRESS_0, PCI_BASE_ADDRESS_1,
+       PCI_BASE_ADDRESS_2, PCI_BASE_ADDRESS_3,
+       PCI_BASE_ADDRESS_4, PCI_BASE_ADDRESS_5
+       /*, PCI_ROM_ADDRESS*/
+};
+
+#define PCI_NUM_REGIONS                7
+
+/* Macros to operate with address in OF binding to PCI */
+#define b_x(x, p, l)   (((x) & ((1<<(l))-1)) << (p))
+#define b_n(x)         b_x((x), 31, 1) /* 0 if relocatable */
+#define b_p(x)         b_x((x), 30, 1) /* 1 if prefetchable */
+#define b_t(x)         b_x((x), 29, 1) /* 1 if the address is aliased */
+#define b_ss(x)                b_x((x), 24, 2) /* the space code */
+#define b_bbbbbbbb(x)  b_x((x), 16, 8) /* bus number */
+#define b_ddddd(x)     b_x((x), 11, 5) /* device number */
+#define b_fff(x)       b_x((x), 8, 3)  /* function number */
+#define b_rrrrrrrr(x)  b_x((x), 0, 8)  /* register number */
+
+#define SS_M64         3
+#define SS_M32         2
+#define SS_IO          1
+#define SS_CONFIG      0
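+
+/*
+ * Example (illustrative): a 32-bit memory BAR at config register 0x10 on
+ * bus 0, device 3, function 0 encodes its "hi" cell as
+ * b_ss(SS_M32) | b_ddddd(3) | b_rrrrrrrr(0x10) = 0x02001810.
+ */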
+
+
+static struct spapr_phb phb;
+
+
+static void rtas_ibm_read_pci_config(struct kvm_cpu *vcpu,
+                                    uint32_t token, uint32_t nargs,
+                                    target_ulong args,
+                                    uint32_t nret, target_ulong rets)
+{
+       uint32_t val = 0;
+       uint64_t buid = ((uint64_t)rtas_ld(vcpu->kvm, args, 1) << 32) | rtas_ld(vcpu->kvm, args, 2);
+       union pci_config_address addr = { .w = rtas_ld(vcpu->kvm, args, 0) };
+       struct pci_device_header *dev = pci__find_dev(addr.device_number);
+       uint32_t size = rtas_ld(vcpu->kvm, args, 3);
+
+       if (buid != phb.buid || !dev || (size > 4)) {
+               phb_dprintf("- cfgRd buid 0x%lx cfg addr 0x%x size %d not found\n",
+                           buid, addr.w, size);
+
+               rtas_st(vcpu->kvm, rets, 0, -1);
+               return;
+       }
+       pci__config_rd(vcpu->kvm, addr, &val, size);
+       /* It appears this wants a byteswapped result... */
+       switch (size) {
+       case 4:
+               val = le32_to_cpu(val);
+               break;
+       case 2:
+               val = le16_to_cpu(val>>16);
+               break;
+       case 1:
+               val = val >> 24;
+               break;
+       }
+       phb_dprintf("- cfgRd buid 0x%lx addr 0x%x (/%d): b%d,d%d,f%d,r0x%x, val 0x%x\n",
+                   buid, addr.w, size, addr.bus_number, addr.device_number, addr.function_number,
+                   addr.register_number, val);
+
+       rtas_st(vcpu->kvm, rets, 0, 0);
+       rtas_st(vcpu->kvm, rets, 1, val);
+}
+
+static void rtas_read_pci_config(struct kvm_cpu *vcpu,
+                                uint32_t token, uint32_t nargs,
+                                target_ulong args,
+                                uint32_t nret, target_ulong rets)
+{
+       uint32_t val;
+       union pci_config_address addr = { .w = rtas_ld(vcpu->kvm, args, 0) };
+       struct pci_device_header *dev = pci__find_dev(addr.device_number);
+       uint32_t size = rtas_ld(vcpu->kvm, args, 1);
+
+       if (!dev || (size > 4)) {
+               rtas_st(vcpu->kvm, rets, 0, -1);
+               return;
+       }
+       pci__config_rd(vcpu->kvm, addr, &val, size);
+       switch (size) {
+       case 4:
+               val = le32_to_cpu(val);
+               break;
+       case 2:
+               val = le16_to_cpu(val>>16); /* We're yuck-endian. */
+               break;
+       case 1:
+               val = val >> 24;
+               break;
+       }
+       phb_dprintf("- cfgRd addr 0x%x size %d, val 0x%x\n", addr.w, size, val);
+       rtas_st(vcpu->kvm, rets, 0, 0);
+       rtas_st(vcpu->kvm, rets, 1, val);
+}
+
+static void rtas_ibm_write_pci_config(struct kvm_cpu *vcpu,
+                                     uint32_t token, uint32_t nargs,
+                                     target_ulong args,
+                                     uint32_t nret, target_ulong rets)
+{
+       uint64_t buid = ((uint64_t)rtas_ld(vcpu->kvm, args, 1) << 32) | rtas_ld(vcpu->kvm, args, 2);
+       union pci_config_address addr = { .w = rtas_ld(vcpu->kvm, args, 0) };
+       struct pci_device_header *dev = pci__find_dev(addr.device_number);
+       uint32_t size = rtas_ld(vcpu->kvm, args, 3);
+       uint32_t val = rtas_ld(vcpu->kvm, args, 4);
+
+       if (buid != phb.buid || !dev || (size > 4)) {
+               phb_dprintf("- cfgWr buid 0x%lx cfg addr 0x%x/%d error (val 0x%x)\n",
+                           buid, addr.w, size, val);
+
+               rtas_st(vcpu->kvm, rets, 0, -1);
+               return;
+       }
+       phb_dprintf("- cfgWr buid 0x%lx addr 0x%x (/%d): b%d,d%d,f%d,r0x%x, val 0x%x\n",
+                   buid, addr.w, size, addr.bus_number, addr.device_number, addr.function_number,
+                   addr.register_number, val);
+       switch (size) {
+       case 4:
+               val = le32_to_cpu(val);
+               break;
+       case 2:
+               val = le16_to_cpu(val) << 16;
+               break;
+       case 1:
+               val = val >> 24;
+               break;
+       }
+       pci__config_wr(vcpu->kvm, addr, &val, size);
+       rtas_st(vcpu->kvm, rets, 0, 0);
+}
+
+static void rtas_write_pci_config(struct kvm_cpu *vcpu,
+                                 uint32_t token, uint32_t nargs,
+                                 target_ulong args,
+                                 uint32_t nret, target_ulong rets)
+{
+       union pci_config_address addr = { .w = rtas_ld(vcpu->kvm, args, 0) };
+       struct pci_device_header *dev = pci__find_dev(addr.device_number);
+       uint32_t size = rtas_ld(vcpu->kvm, args, 1);
+       uint32_t val = rtas_ld(vcpu->kvm, args, 2);
+
+       if (!dev || (size > 4)) {
+               rtas_st(vcpu->kvm, rets, 0, -1);
+               return;
+       }
+
+       phb_dprintf("- cfgWr addr 0x%x (/%d): b%d,d%d,f%d,r0x%x, val 0x%x\n",
+                   addr.w, size, addr.bus_number, addr.device_number, addr.function_number,
+                   addr.register_number, val);
+       switch (size) {
+       case 4:
+               val = le32_to_cpu(val);
+               break;
+       case 2:
+               val = le16_to_cpu(val) << 16;
+               break;
+       case 1:
+               val = val >> 24;
+               break;
+       }
+       pci__config_wr(vcpu->kvm, addr, &val, size);
+       rtas_st(vcpu->kvm, rets, 0, 0);
+}
+
+void spapr_create_phb(struct kvm *kvm,
+                     const char *busname, uint64_t buid,
+                     uint64_t mem_win_addr, uint64_t mem_win_size,
+                     uint64_t io_win_addr, uint64_t io_win_size)
+{
+       /*
+        * Since kvmtool doesn't really have any concept of buses etc.,
+        * there's nothing to register here.  Just register RTAS.
+        */
+       spapr_rtas_register("read-pci-config", rtas_read_pci_config);
+       spapr_rtas_register("write-pci-config", rtas_write_pci_config);
+       spapr_rtas_register("ibm,read-pci-config", rtas_ibm_read_pci_config);
+       spapr_rtas_register("ibm,write-pci-config", rtas_ibm_write_pci_config);
+
+       phb.buid = buid;
+       phb.mem_addr = mem_win_addr;
+       phb.mem_size = mem_win_size;
+       phb.io_addr  = io_win_addr;
+       phb.io_size  = io_win_size;
+
+       kvm->phb = &phb;
+}
+
+static uint32_t bar_to_ss(unsigned long bar)
+{
+       if ((bar & PCI_BASE_ADDRESS_SPACE) ==
+           PCI_BASE_ADDRESS_SPACE_IO)
+               return SS_IO;
+       else if (bar & PCI_BASE_ADDRESS_MEM_TYPE_64)
+               return SS_M64;
+       else
+               return SS_M32;
+}
+
+static unsigned long bar_to_addr(unsigned long bar)
+{
+       if ((bar & PCI_BASE_ADDRESS_SPACE) ==
+           PCI_BASE_ADDRESS_SPACE_IO)
+               return bar & PCI_BASE_ADDRESS_IO_MASK;
+       else
+               return bar & PCI_BASE_ADDRESS_MEM_MASK;
+}
+
+int spapr_populate_pci_devices(struct kvm *kvm,
+                              uint32_t xics_phandle,
+                              void *fdt)
+{
+       int bus_off, node_off = 0, devid, fn, i, n, devices;
+       char nodename[256];
+       struct {
+               uint32_t hi;
+               uint64_t addr;
+               uint64_t size;
+       } __attribute__((packed)) reg[PCI_NUM_REGIONS + 1],
+                 assigned_addresses[PCI_NUM_REGIONS];
+       uint32_t bus_range[] = { cpu_to_be32(0), cpu_to_be32(0xff) };
+       struct {
+               uint32_t hi;
+               uint64_t child;
+               uint64_t parent;
+               uint64_t size;
+       } __attribute__((packed)) ranges[] = {
+               {
+                       cpu_to_be32(b_ss(1)), cpu_to_be64(0),
+                       cpu_to_be64(phb.io_addr),
+                       cpu_to_be64(phb.io_size),
+               },
+               {
+                       cpu_to_be32(b_ss(2)), cpu_to_be64(0),
+                       cpu_to_be64(phb.mem_addr),
+                       cpu_to_be64(phb.mem_size),
+               },
+       };
+       uint64_t bus_reg[] = { cpu_to_be64(phb.buid), 0 };
+       uint32_t interrupt_map_mask[] = {
+               cpu_to_be32(b_ddddd(-1)|b_fff(-1)), 0x0, 0x0, 0x0};
+       uint32_t interrupt_map[SPAPR_PCI_NUM_LSI][7];
+
+       /* Start populating the FDT */
+       sprintf(nodename, "pci@%" PRIx64, phb.buid);
+       bus_off = fdt_add_subnode(fdt, 0, nodename);
+       if (bus_off < 0) {
+               die("error making bus subnode, %s\n", fdt_strerror(bus_off));
+               return bus_off;
+       }
+
+       /* Write PHB properties */
+       _FDT(fdt_setprop_string(fdt, bus_off, "device_type", "pci"));
+       _FDT(fdt_setprop_string(fdt, bus_off, "compatible", "IBM,Logical_PHB"));
+       _FDT(fdt_setprop_cell(fdt, bus_off, "#address-cells", 0x3));
+       _FDT(fdt_setprop_cell(fdt, bus_off, "#size-cells", 0x2));
+       _FDT(fdt_setprop_cell(fdt, bus_off, "#interrupt-cells", 0x1));
+       _FDT(fdt_setprop(fdt, bus_off, "used-by-rtas", NULL, 0));
+       _FDT(fdt_setprop(fdt, bus_off, "bus-range", &bus_range, sizeof(bus_range)));
+       _FDT(fdt_setprop(fdt, bus_off, "ranges", &ranges, sizeof(ranges)));
+       _FDT(fdt_setprop(fdt, bus_off, "reg", &bus_reg, sizeof(bus_reg)));
+       _FDT(fdt_setprop(fdt, bus_off, "interrupt-map-mask",
+                        &interrupt_map_mask, sizeof(interrupt_map_mask)));
+
+       /* Populate PCI devices and allocate IRQs */
+       devices = 0;
+
+       for (devid = 0; devid < PCI_MAX_DEVICES; devid++) {
+               uint32_t *irqmap = interrupt_map[devices];
+               struct pci_device_header *hdr = pci__find_dev(devid);
+
+               if (!hdr)
+                       continue;
+
+               fn = 0; /* kvmtool doesn't yet do multifunction devices */
+
+               sprintf(nodename, "pci@%u,%u", devid, fn);
+
+               /* Allocate interrupt from the map */
+               if (devid >= SPAPR_PCI_NUM_LSI) {
+                       die("Unexpected behaviour in spapr_populate_pci_devices: "
+                           "wrong devid %u\n", devid);
+               }
+               irqmap[0] = cpu_to_be32(b_ddddd(devid)|b_fff(fn));
+               irqmap[1] = 0;
+               irqmap[2] = 0;
+               irqmap[3] = 0;
+               irqmap[4] = cpu_to_be32(xics_phandle);
+               /*
+                * This is nasty; the PCI devs are set up such that their own
+                * header's irq_line indicates the direct XICS IRQ number to
+                * use.  There REALLY needs to be a hierarchical system in place
+                * to 'raise' an IRQ on the bridge which indexes/looks up which
+                * XICS IRQ to fire.
+                */
+               irqmap[5] = cpu_to_be32(hdr->irq_line);
+               irqmap[6] = cpu_to_be32(0x8);
+
+               /* Add node to FDT */
+               node_off = fdt_add_subnode(fdt, bus_off, nodename);
+               if (node_off < 0) {
+                       die("error making node subnode, %s\n", fdt_strerror(node_off));
+                       return node_off;
+               }
+
+               _FDT(fdt_setprop_cell(fdt, node_off, "vendor-id",
+                                     le16_to_cpu(hdr->vendor_id)));
+               _FDT(fdt_setprop_cell(fdt, node_off, "device-id",
+                                     le16_to_cpu(hdr->device_id)));
+               _FDT(fdt_setprop_cell(fdt, node_off, "revision-id",
+                                     hdr->revision_id));
+               _FDT(fdt_setprop_cell(fdt, node_off, "class-code",
+                                     hdr->class[0] | (hdr->class[1] << 8) | (hdr->class[2] << 16)));
+               _FDT(fdt_setprop_cell(fdt, node_off, "subsystem-id",
+                                     le16_to_cpu(hdr->subsys_id)));
+               _FDT(fdt_setprop_cell(fdt, node_off, "subsystem-vendor-id",
+                                     le16_to_cpu(hdr->subsys_vendor_id)));
+
+               /* Config space region comes first */
+               reg[0].hi = cpu_to_be32(
+                       b_n(0) |
+                       b_p(0) |
+                       b_t(0) |
+                       b_ss(SS_CONFIG) |
+                       b_bbbbbbbb(0) |
+                       b_ddddd(devid) |
+                       b_fff(fn));
+               reg[0].addr = 0;
+               reg[0].size = 0;
+
+               n = 0;
+               /* Six BARs, no ROM supported, addresses are 32-bit */
+               for (i = 0; i < 6; ++i) {
+                       if (0 == hdr->bar[i]) {
+                               continue;
+                       }
+
+                       reg[n+1].hi = cpu_to_be32(
+                               b_n(0) |
+                               b_p(0) |
+                               b_t(0) |
+                               b_ss(bar_to_ss(le32_to_cpu(hdr->bar[i]))) |
+                               b_bbbbbbbb(0) |
+                               b_ddddd(devid) |
+                               b_fff(fn) |
+                               b_rrrrrrrr(bars[i]));
+                       reg[n+1].addr = 0;
+                       reg[n+1].size = cpu_to_be64(hdr->bar_size[i]);
+
+                       assigned_addresses[n].hi = cpu_to_be32(
+                               b_n(1) |
+                               b_p(0) |
+                               b_t(0) |
+                               b_ss(bar_to_ss(le32_to_cpu(hdr->bar[i]))) |
+                               b_bbbbbbbb(0) |
+                               b_ddddd(devid) |
+                               b_fff(fn) |
+                               b_rrrrrrrr(bars[i]));
+
+                       /*
+                        * Writing zeroes to assigned-addresses would make the
+                        * guest kernel reassign BARs, so write the addresses
+                        * the BARs already hold.
+                        */
+                       assigned_addresses[n].addr = cpu_to_be64(bar_to_addr(le32_to_cpu(hdr->bar[i])));
+                       assigned_addresses[n].size = reg[n+1].size;
+
+                       ++n;
+               }
+               _FDT(fdt_setprop(fdt, node_off, "reg", reg, sizeof(reg[0])*(n+1)));
+               _FDT(fdt_setprop(fdt, node_off, "assigned-addresses",
+                                assigned_addresses,
+                                sizeof(assigned_addresses[0])*(n)));
+               _FDT(fdt_setprop_cell(fdt, node_off, "interrupts",
+                                     hdr->irq_pin));
+
+               /* We don't set ibm,dma-window property as we don't have an IOMMU. */
+
+               ++devices;
+       }
+
+       /* Write interrupt map */
+       _FDT(fdt_setprop(fdt, bus_off, "interrupt-map", &interrupt_map,
+                        devices * sizeof(interrupt_map[0])));
+
+       return 0;
+}
diff --git a/tools/kvm/powerpc/spapr_pci.h b/tools/kvm/powerpc/spapr_pci.h
new file mode 100644 (file)
index 0000000..48b221c
--- /dev/null
@@ -0,0 +1,57 @@
+/*
+ * SPAPR PHB definitions
+ *
+ * Modifications by Matt Evans <matt@ozlabs.org>, IBM Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as published
+ * by the Free Software Foundation.
+ */
+
+#ifndef SPAPR_PCI_H
+#define SPAPR_PCI_H
+
+#include "kvm/kvm.h"
+#include "spapr.h"
+#include <inttypes.h>
+
+/* With XICS, we can easily accommodate 1 IRQ per PCI device. */
+
+#define SPAPR_PCI_NUM_LSI 256
+
+struct spapr_phb {
+       uint64_t buid;
+       uint64_t mem_addr;
+       uint64_t mem_size;
+       uint64_t io_addr;
+       uint64_t io_size;
+};
+
+void spapr_create_phb(struct kvm *kvm,
+                      const char *busname, uint64_t buid,
+                      uint64_t mem_win_addr, uint64_t mem_win_size,
+                      uint64_t io_win_addr, uint64_t io_win_size);
+
+int spapr_populate_pci_devices(struct kvm *kvm,
+                               uint32_t xics_phandle,
+                               void *fdt);
+
+static inline bool spapr_phb_mmio(struct kvm *kvm, u64 phys_addr, u8 *data, u32 len, u8 is_write)
+{
+       if ((phys_addr >= SPAPR_PCI_IO_WIN_ADDR) &&
+           (phys_addr < SPAPR_PCI_IO_WIN_ADDR +
+            SPAPR_PCI_IO_WIN_SIZE)) {
+               return kvm__emulate_io(kvm, phys_addr - SPAPR_PCI_IO_WIN_ADDR,
+                                      data, is_write ? KVM_EXIT_IO_OUT :
+                                      KVM_EXIT_IO_IN,
+                                      len, 1);
+       } else if ((phys_addr >= SPAPR_PCI_MEM_WIN_ADDR) &&
+                  (phys_addr < SPAPR_PCI_MEM_WIN_ADDR +
+                   SPAPR_PCI_MEM_WIN_SIZE)) {
+               return kvm__emulate_mmio(kvm, phys_addr - SPAPR_PCI_MEM_WIN_ADDR,
+                                        data, len, is_write);
+       }
+       return false;
+}
+
+#endif
diff --git a/tools/kvm/powerpc/spapr_rtas.c b/tools/kvm/powerpc/spapr_rtas.c
new file mode 100644 (file)
index 0000000..14a3462
--- /dev/null
@@ -0,0 +1,229 @@
+/*
+ * SPAPR base RTAS calls
+ *
+ * Borrowed heavily from QEMU's spapr_rtas.c
+ * Copyright (c) 2010-2011 David Gibson, IBM Corporation.
+ *
+ * Modifications copyright 2011 Matt Evans <matt@ozlabs.org>, IBM Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as published
+ * by the Free Software Foundation.
+ */
+
+#include "kvm/kvm.h"
+#include "kvm/kvm-cpu.h"
+#include "kvm/util.h"
+#include "kvm/term.h"
+#include "libfdt.h"
+
+#include "spapr.h"
+
+#include <stdio.h>
+#include <assert.h>
+
+#define TOKEN_BASE      0x2000
+#define TOKEN_MAX       0x100
+
+#define RTAS_CONSOLE
+
+static struct rtas_call {
+       const char *name;
+       spapr_rtas_fn fn;
+} rtas_table[TOKEN_MAX];
+
+struct rtas_call *rtas_next = rtas_table;
+
+
+static void rtas_display_character(struct kvm_cpu *vcpu,
+                                   uint32_t token, uint32_t nargs,
+                                   target_ulong args,
+                                   uint32_t nret, target_ulong rets)
+{
+       char c = rtas_ld(vcpu->kvm, args, 0);
+       term_putc(CONSOLE_HV, &c, 1, 0);
+       rtas_st(vcpu->kvm, rets, 0, 0);
+}
+
+#ifdef RTAS_CONSOLE
+static void rtas_put_term_char(struct kvm_cpu *vcpu,
+                              uint32_t token, uint32_t nargs,
+                              target_ulong args,
+                              uint32_t nret, target_ulong rets)
+{
+       char c = rtas_ld(vcpu->kvm, args, 0);
+       term_putc(CONSOLE_HV, &c, 1, 0);
+       rtas_st(vcpu->kvm, rets, 0, 0);
+}
+
+static void rtas_get_term_char(struct kvm_cpu *vcpu,
+                              uint32_t token, uint32_t nargs,
+                              target_ulong args,
+                              uint32_t nret, target_ulong rets)
+{
+       int c;
+       if (term_readable(CONSOLE_HV, 0) &&
+           (c = term_getc(CONSOLE_HV, 0)) >= 0) {
+               rtas_st(vcpu->kvm, rets, 0, 0);
+               rtas_st(vcpu->kvm, rets, 1, c);
+       } else {
+               rtas_st(vcpu->kvm, rets, 0, -2);
+       }
+}
+#endif
+
+static void rtas_get_time_of_day(struct kvm_cpu *vcpu,
+                                 uint32_t token, uint32_t nargs,
+                                 target_ulong args,
+                                 uint32_t nret, target_ulong rets)
+{
+       struct tm tm;
+       time_t tnow;
+
+       if (nret != 8) {
+               rtas_st(vcpu->kvm, rets, 0, -3);
+               return;
+       }
+
+       tnow = time(NULL);
+       /* Guest time is currently not offset in any way. */
+       gmtime_r(&tnow, &tm);
+
+       rtas_st(vcpu->kvm, rets, 0, 0); /* Success */
+       rtas_st(vcpu->kvm, rets, 1, tm.tm_year + 1900);
+       rtas_st(vcpu->kvm, rets, 2, tm.tm_mon + 1);
+       rtas_st(vcpu->kvm, rets, 3, tm.tm_mday);
+       rtas_st(vcpu->kvm, rets, 4, tm.tm_hour);
+       rtas_st(vcpu->kvm, rets, 5, tm.tm_min);
+       rtas_st(vcpu->kvm, rets, 6, tm.tm_sec);
+       rtas_st(vcpu->kvm, rets, 7, 0);
+}
+
+static void rtas_set_time_of_day(struct kvm_cpu *vcpu,
+                                 uint32_t token, uint32_t nargs,
+                                 target_ulong args,
+                                 uint32_t nret, target_ulong rets)
+{
+       pr_warning("%s called; TOD set ignored.\n", __FUNCTION__);
+}
+
+static void rtas_power_off(struct kvm_cpu *vcpu,
+                           uint32_t token, uint32_t nargs, target_ulong args,
+                           uint32_t nret, target_ulong rets)
+{
+       if (nargs != 2 || nret != 1) {
+               rtas_st(vcpu->kvm, rets, 0, -3);
+               return;
+       }
+       kvm_cpu__reboot();
+}
+
+static void rtas_query_cpu_stopped_state(struct kvm_cpu *vcpu,
+                                         uint32_t token, uint32_t nargs,
+                                         target_ulong args,
+                                         uint32_t nret, target_ulong rets)
+{
+       if (nargs != 1 || nret != 2) {
+               rtas_st(vcpu->kvm, rets, 0, -3);
+               return;
+       }
+
+       /*
+        * We could read the CPU id with rtas_ld(vcpu->kvm, args, 0), but we
+        * currently start all CPUs, so just report it as not stopped (2).
+        */
+       rtas_st(vcpu->kvm, rets, 0, 0);
+       rtas_st(vcpu->kvm, rets, 1, 2);
+}
+
+static void rtas_start_cpu(struct kvm_cpu *vcpu,
+                           uint32_t token, uint32_t nargs,
+                           target_ulong args,
+                           uint32_t nret, target_ulong rets)
+{
+       die(__FUNCTION__);
+}
+
+target_ulong spapr_rtas_call(struct kvm_cpu *vcpu,
+                             uint32_t token, uint32_t nargs, target_ulong args,
+                             uint32_t nret, target_ulong rets)
+{
+       if ((token >= TOKEN_BASE)
+           && ((token - TOKEN_BASE) < TOKEN_MAX)) {
+               struct rtas_call *call = rtas_table + (token - TOKEN_BASE);
+
+               if (call->fn) {
+                       call->fn(vcpu, token, nargs, args, nret, rets);
+                       return H_SUCCESS;
+               }
+       }
+
+       /*
+        * HACK: Some Linux early debug code uses RTAS display-character,
+        * but assumes the token value is 0xa (which it is on some real
+        * machines) without looking it up in the device tree.  This
+        * special case makes this work.
+        */
+       if (token == 0xa) {
+               rtas_display_character(vcpu, 0xa, nargs, args, nret, rets);
+               return H_SUCCESS;
+       }
+
+       hcall_dprintf("Unknown RTAS token 0x%x\n", token);
+       rtas_st(vcpu->kvm, rets, 0, -3);
+       return H_PARAMETER;
+}
+
+void spapr_rtas_register(const char *name, spapr_rtas_fn fn)
+{
+       assert(rtas_next < (rtas_table + TOKEN_MAX));
+
+       rtas_next->name = name;
+       rtas_next->fn = fn;
+
+       rtas_next++;
+}
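+
+/*
+ * Tokens are implicit in registration order: the Nth call registered
+ * lands in rtas_table[N] and is advertised by spapr_rtas_fdt_setup() as
+ * token TOKEN_BASE + N (e.g. "display-character" gets 0x2000 if it is
+ * registered first).
+ */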
+
+/*
+ * This is called from the context of an open /rtas node, in order to add
+ * properties for the rtas call tokens.
+ */
+int spapr_rtas_fdt_setup(struct kvm *kvm, void *fdt)
+{
+       int ret;
+       int i;
+
+       for (i = 0; i < TOKEN_MAX; i++) {
+               struct rtas_call *call = &rtas_table[i];
+
+               if (!call->fn) {
+                       continue;
+               }
+
+               ret = fdt_property_cell(fdt, call->name, i + TOKEN_BASE);
+
+               if (ret < 0) {
+                       pr_warning("Couldn't add rtas token for %s: %s\n",
+                                  call->name, fdt_strerror(ret));
+                       return ret;
+               }
+
+       }
+       return 0;
+}
+
+void register_core_rtas(void)
+{
+       spapr_rtas_register("display-character", rtas_display_character);
+       spapr_rtas_register("get-time-of-day", rtas_get_time_of_day);
+       spapr_rtas_register("set-time-of-day", rtas_set_time_of_day);
+       spapr_rtas_register("power-off", rtas_power_off);
+       spapr_rtas_register("query-cpu-stopped-state",
+                           rtas_query_cpu_stopped_state);
+       spapr_rtas_register("start-cpu", rtas_start_cpu);
+#ifdef RTAS_CONSOLE
+       /* These are normally unused: console I/O goes via hcalls, not RTAS. */
+       spapr_rtas_register("put-term-char", rtas_put_term_char);
+       spapr_rtas_register("get-term-char", rtas_get_term_char);
+#endif
+}
diff --git a/tools/kvm/powerpc/xics.c b/tools/kvm/powerpc/xics.c
new file mode 100644 (file)
index 0000000..2d70d3c
--- /dev/null
@@ -0,0 +1,514 @@
+/*
+ * PAPR Virtualized Interrupt System, aka ICS/ICP aka xics
+ *
+ * Borrowed heavily from QEMU's xics.c,
+ * Copyright (c) 2010,2011 David Gibson, IBM Corporation.
+ *
+ * Modifications copyright 2011 Matt Evans <matt@ozlabs.org>, IBM Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as published
+ * by the Free Software Foundation.
+ */
+
+#include "spapr.h"
+#include "xics.h"
+#include "kvm/util.h"
+
+#include <stdio.h>
+#include <malloc.h>
+
+
+/* #define DEBUG_XICS yes */
+#ifdef DEBUG_XICS
+#define xics_dprintf(fmt, ...)                                 \
+       do { fprintf(stderr, fmt, ## __VA_ARGS__); } while (0)
+#else
+#define xics_dprintf(fmt, ...)                 \
+       do { } while (0)
+#endif
+
+/*
+ * ICP: Presentation layer
+ */
+
+struct icp_server_state {
+       uint32_t xirr;
+       uint8_t pending_priority;
+       uint8_t mfrr;
+       struct kvm_cpu *cpu;
+};
+
+#define XICS_IRQ_OFFSET 16
+#define XISR_MASK      0x00ffffff
+#define CPPR_MASK      0xff000000
+
+#define XISR(ss)   (((ss)->xirr) & XISR_MASK)
+#define CPPR(ss)   (((ss)->xirr) >> 24)
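+
+/*
+ * xirr layout: the top 8 bits hold the CPPR (current processor
+ * priority), the low 24 bits the XISR (pending interrupt source); e.g.
+ * xirr 0x05000010 means CPPR 5 with source 0x10 pending.
+ */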
+
+struct ics_state;
+
+struct icp_state {
+       unsigned long nr_servers;
+       struct icp_server_state *ss;
+       struct ics_state *ics;
+};
+
+static void ics_reject(struct ics_state *ics, int nr);
+static void ics_resend(struct ics_state *ics);
+static void ics_eoi(struct ics_state *ics, int nr);
+
+static inline void cpu_irq_raise(struct kvm_cpu *vcpu)
+{
+       xics_dprintf("INT1[%p]\n", vcpu);
+       kvm_cpu__irq(vcpu, POWER7_EXT_IRQ, 1);
+}
+
+static inline void cpu_irq_lower(struct kvm_cpu *vcpu)
+{
+       xics_dprintf("INT0[%p]\n", vcpu);
+       kvm_cpu__irq(vcpu, POWER7_EXT_IRQ, 0);
+}
+
+static void icp_check_ipi(struct icp_state *icp, int server)
+{
+       struct icp_server_state *ss = icp->ss + server;
+
+       if (XISR(ss) && (ss->pending_priority <= ss->mfrr)) {
+               return;
+       }
+
+       if (XISR(ss)) {
+               ics_reject(icp->ics, XISR(ss));
+       }
+
+       ss->xirr = (ss->xirr & ~XISR_MASK) | XICS_IPI;
+       ss->pending_priority = ss->mfrr;
+       cpu_irq_raise(ss->cpu);
+}
+
+static void icp_resend(struct icp_state *icp, int server)
+{
+       struct icp_server_state *ss = icp->ss + server;
+
+       if (ss->mfrr < CPPR(ss)) {
+               icp_check_ipi(icp, server);
+       }
+       ics_resend(icp->ics);
+}
+
+static void icp_set_cppr(struct icp_state *icp, int server, uint8_t cppr)
+{
+       struct icp_server_state *ss = icp->ss + server;
+       uint8_t old_cppr;
+       uint32_t old_xisr;
+
+       old_cppr = CPPR(ss);
+       ss->xirr = (ss->xirr & ~CPPR_MASK) | (cppr << 24);
+
+       if (cppr < old_cppr) {
+               if (XISR(ss) && (cppr <= ss->pending_priority)) {
+                       old_xisr = XISR(ss);
+                       ss->xirr &= ~XISR_MASK; /* Clear XISR */
+                       cpu_irq_lower(ss->cpu);
+                       ics_reject(icp->ics, old_xisr);
+               }
+       } else {
+               if (!XISR(ss)) {
+                       icp_resend(icp, server);
+               }
+       }
+}
+
+static void icp_set_mfrr(struct icp_state *icp, int nr, uint8_t mfrr)
+{
+       struct icp_server_state *ss = icp->ss + nr;
+
+       ss->mfrr = mfrr;
+       if (mfrr < CPPR(ss)) {
+               icp_check_ipi(icp, nr);
+       }
+}
+
+static uint32_t icp_accept(struct icp_server_state *ss)
+{
+       uint32_t xirr;
+
+       cpu_irq_lower(ss->cpu);
+       xirr = ss->xirr;
+       ss->xirr = ss->pending_priority << 24;
+       return xirr;
+}
+
+static void icp_eoi(struct icp_state *icp, int server, uint32_t xirr)
+{
+       struct icp_server_state *ss = icp->ss + server;
+
+       ics_eoi(icp->ics, xirr & XISR_MASK);
+       /* Send EOI -> ICS */
+       ss->xirr = (ss->xirr & ~CPPR_MASK) | (xirr & CPPR_MASK);
+       if (!XISR(ss)) {
+               icp_resend(icp, server);
+       }
+}
+
+static void icp_irq(struct icp_state *icp, int server, int nr, uint8_t priority)
+{
+       struct icp_server_state *ss = icp->ss + server;
+       xics_dprintf("icp_irq(nr %d, server %d, prio 0x%x)\n", nr, server, priority);
+       if ((priority >= CPPR(ss))
+           || (XISR(ss) && (ss->pending_priority <= priority))) {
+               xics_dprintf("reject %d, CPPR 0x%x, XISR 0x%x, pprio 0x%x, prio 0x%x\n",
+                            nr, CPPR(ss), XISR(ss), ss->pending_priority, priority);
+               ics_reject(icp->ics, nr);
+       } else {
+               if (XISR(ss)) {
+                       xics_dprintf("reject %d, CPPR 0x%x, XISR 0x%x, pprio 0x%x, prio 0x%x\n",
+                                    nr, CPPR(ss), XISR(ss), ss->pending_priority, priority);
+                       ics_reject(icp->ics, XISR(ss));
+               }
+               ss->xirr = (ss->xirr & ~XISR_MASK) | (nr & XISR_MASK);
+               ss->pending_priority = priority;
+               cpu_irq_raise(ss->cpu);
+       }
+}
+
+/*
+ * ICS: Source layer
+ */
+
+struct ics_irq_state {
+       int server;
+       uint8_t priority;
+       uint8_t saved_priority;
+       int rejected:1;
+       int masked_pending:1;
+};
+
+struct ics_state {
+       unsigned int nr_irqs;
+       unsigned int offset;
+       struct ics_irq_state *irqs;
+       struct icp_state *icp;
+};
+
+static int ics_valid_irq(struct ics_state *ics, uint32_t nr)
+{
+       return (nr >= ics->offset)
+               && (nr < (ics->offset + ics->nr_irqs));
+}
+
+static void ics_set_irq_msi(struct ics_state *ics, int srcno, int val)
+{
+       struct ics_irq_state *irq = ics->irqs + srcno;
+
+       if (val) {
+               if (irq->priority == 0xff) {
+                       xics_dprintf(" irq pri ff, masked pending\n");
+                       irq->masked_pending = 1;
+               } else  {
+                       icp_irq(ics->icp, irq->server, srcno + ics->offset, irq->priority);
+               }
+       }
+}
+
+static void ics_reject_msi(struct ics_state *ics, int nr)
+{
+       struct ics_irq_state *irq = ics->irqs + nr - ics->offset;
+
+       irq->rejected = 1;
+}
+
+static void ics_resend_msi(struct ics_state *ics)
+{
+       unsigned int i;
+
+       for (i = 0; i < ics->nr_irqs; i++) {
+               struct ics_irq_state *irq = ics->irqs + i;
+
+               /* FIXME: filter by server#? */
+               if (irq->rejected) {
+                       irq->rejected = 0;
+                       if (irq->priority != 0xff) {
+                               icp_irq(ics->icp, irq->server, i + ics->offset, irq->priority);
+                       }
+               }
+       }
+}
+
+static void ics_write_xive_msi(struct ics_state *ics, int nr, int server,
+                              uint8_t priority)
+{
+       struct ics_irq_state *irq = ics->irqs + nr - ics->offset;
+
+       irq->server = server;
+       irq->priority = priority;
+       xics_dprintf("ics_write_xive_msi(nr %d, server %d, pri 0x%x)\n", nr, server, priority);
+
+       if (!irq->masked_pending || (priority == 0xff)) {
+               return;
+       }
+
+       irq->masked_pending = 0;
+       icp_irq(ics->icp, server, nr, priority);
+}
+
+static void ics_reject(struct ics_state *ics, int nr)
+{
+       ics_reject_msi(ics, nr);
+}
+
+static void ics_resend(struct ics_state *ics)
+{
+       ics_resend_msi(ics);
+}
+
+static void ics_eoi(struct ics_state *ics, int nr)
+{
+}
+
+/*
+ * Exported functions
+ */
+
+static int allocated_irqnum = XICS_IRQ_OFFSET;
+
+/*
+ * xics_alloc_irqnum(): This is hacky.  The problem boils down to the PCI device
+ * code which just calls kvm__irq_line( .. pcidev->pci_hdr.irq_line ..) at will.
+ * Each PCI device's IRQ line is allocated by irq__register_device() (which
+ * allocates both an IRQ and a PCI device number).
+ *
+ * In the future I'd like to at least mimic some kind of 'upstream IRQ
+ * controller' whereby PCI devices let their PHB know when they want to raise
+ * an IRQ, and that percolates up.
+ *
+ * For now, allocate a REAL xics irq number and (via irq__register_device) push
+ * that into the config space.  8 bits only, though!
+ */
+int xics_alloc_irqnum(void)
+{
+       int irq = allocated_irqnum++;
+
+       if (irq > 255)
+               die("Huge numbers of IRQs aren't supported with the daft kvmtool IRQ system.");
+
+       return irq;
+}
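+
+/*
+ * Sketch of the intended caller side (illustrative only; the PCI device
+ * code lives elsewhere):
+ *
+ *     line = xics_alloc_irqnum();        allocated at device setup
+ *     kvm__irq_line(kvm, line, 1);       assert the source
+ *     kvm__irq_line(kvm, line, 0);       deassert it again
+ */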
+
+static target_ulong h_cppr(struct kvm_cpu *vcpu,
+                          target_ulong opcode, target_ulong *args)
+{
+       target_ulong cppr = args[0];
+
+       xics_dprintf("h_cppr(%lx)\n", cppr);
+       icp_set_cppr(vcpu->kvm->icp, vcpu->cpu_id, cppr);
+       return H_SUCCESS;
+}
+
+static target_ulong h_ipi(struct kvm_cpu *vcpu,
+                         target_ulong opcode, target_ulong *args)
+{
+       target_ulong server = args[0];
+       target_ulong mfrr = args[1];
+
+       xics_dprintf("h_ipi(%lx, %lx)\n", server, mfrr);
+       if (server >= vcpu->kvm->icp->nr_servers) {
+               return H_PARAMETER;
+       }
+
+       icp_set_mfrr(vcpu->kvm->icp, server, mfrr);
+       return H_SUCCESS;
+}
+
+static target_ulong h_xirr(struct kvm_cpu *vcpu,
+                          target_ulong opcode, target_ulong *args)
+{
+       uint32_t xirr = icp_accept(vcpu->kvm->icp->ss + vcpu->cpu_id);
+
+       xics_dprintf("h_xirr() = %x\n", xirr);
+       args[0] = xirr;
+       return H_SUCCESS;
+}
+
+static target_ulong h_eoi(struct kvm_cpu *vcpu,
+                         target_ulong opcode, target_ulong *args)
+{
+       target_ulong xirr = args[0];
+
+       xics_dprintf("h_eoi(%lx)\n", xirr);
+       icp_eoi(vcpu->kvm->icp, vcpu->cpu_id, xirr);
+       return H_SUCCESS;
+}
+
+static void rtas_set_xive(struct kvm_cpu *vcpu, uint32_t token,
+                         uint32_t nargs, target_ulong args,
+                         uint32_t nret, target_ulong rets)
+{
+       struct ics_state *ics = vcpu->kvm->icp->ics;
+       uint32_t nr, server, priority;
+
+       if ((nargs != 3) || (nret != 1)) {
+               rtas_st(vcpu->kvm, rets, 0, -3);
+               return;
+       }
+
+       nr = rtas_ld(vcpu->kvm, args, 0);
+       server = rtas_ld(vcpu->kvm, args, 1);
+       priority = rtas_ld(vcpu->kvm, args, 2);
+
+       xics_dprintf("rtas_set_xive(%x,%x,%x)\n", nr, server, priority);
+       if (!ics_valid_irq(ics, nr) || (server >= ics->icp->nr_servers)
+           || (priority > 0xff)) {
+               rtas_st(vcpu->kvm, rets, 0, -3);
+               return;
+       }
+
+       ics_write_xive_msi(ics, nr, server, priority);
+
+       rtas_st(vcpu->kvm, rets, 0, 0); /* Success */
+}
+
+static void rtas_get_xive(struct kvm_cpu *vcpu, uint32_t token,
+                         uint32_t nargs, target_ulong args,
+                         uint32_t nret, target_ulong rets)
+{
+       struct ics_state *ics = vcpu->kvm->icp->ics;
+       uint32_t nr;
+
+       if ((nargs != 1) || (nret != 3)) {
+               rtas_st(vcpu->kvm, rets, 0, -3);
+               return;
+       }
+
+       nr = rtas_ld(vcpu->kvm, args, 0);
+
+       if (!ics_valid_irq(ics, nr)) {
+               rtas_st(vcpu->kvm, rets, 0, -3);
+               return;
+       }
+
+       rtas_st(vcpu->kvm, rets, 0, 0); /* Success */
+       rtas_st(vcpu->kvm, rets, 1, ics->irqs[nr - ics->offset].server);
+       rtas_st(vcpu->kvm, rets, 2, ics->irqs[nr - ics->offset].priority);
+}
+
+static void rtas_int_off(struct kvm_cpu *vcpu, uint32_t token,
+                        uint32_t nargs, target_ulong args,
+                        uint32_t nret, target_ulong rets)
+{
+       struct ics_state *ics = vcpu->kvm->icp->ics;
+       uint32_t nr;
+
+       if ((nargs != 1) || (nret != 1)) {
+               rtas_st(vcpu->kvm, rets, 0, -3);
+               return;
+       }
+
+       nr = rtas_ld(vcpu->kvm, args, 0);
+
+       if (!ics_valid_irq(ics, nr)) {
+               rtas_st(vcpu->kvm, rets, 0, -3);
+               return;
+       }
+
+       /* ME: QEMU wrote xive_msi here, in #if 0.  Deleted. */
+
+       rtas_st(vcpu->kvm, rets, 0, 0); /* Success */
+}
+
+static void rtas_int_on(struct kvm_cpu *vcpu, uint32_t token,
+                       uint32_t nargs, target_ulong args,
+                       uint32_t nret, target_ulong rets)
+{
+       struct ics_state *ics = vcpu->kvm->icp->ics;
+       uint32_t nr;
+
+       if ((nargs != 1) || (nret != 1)) {
+               rtas_st(vcpu->kvm, rets, 0, -3);
+               return;
+       }
+
+       nr = rtas_ld(vcpu->kvm, args, 0);
+
+       if (!ics_valid_irq(ics, nr)) {
+               rtas_st(vcpu->kvm, rets, 0, -3);
+               return;
+       }
+
+       /* ME: QEMU wrote xive_msi here, in #if 0.  Deleted. */
+
+       rtas_st(vcpu->kvm, rets, 0, 0); /* Success */
+}
+
+void xics_cpu_register(struct kvm_cpu *vcpu)
+{
+       if (vcpu->cpu_id < vcpu->kvm->icp->nr_servers)
+               vcpu->kvm->icp->ss[vcpu->cpu_id].cpu = vcpu;
+       else
+               die("Setting invalid server for cpuid %ld\n", vcpu->cpu_id);
+}
+
+struct icp_state *xics_system_init(unsigned int nr_irqs, unsigned int nr_cpus)
+{
+       int max_server_num;
+       unsigned int i;
+       struct icp_state *icp;
+       struct ics_state *ics;
+
+       max_server_num = nr_cpus;
+
+       icp = malloc(sizeof(*icp));
+       icp->nr_servers = max_server_num + 1;
+       icp->ss = malloc(icp->nr_servers*sizeof(struct icp_server_state));
+
+       for (i = 0; i < icp->nr_servers; i++) {
+               icp->ss[i].xirr = 0;
+               icp->ss[i].pending_priority = 0;
+               icp->ss[i].cpu = 0;
+               icp->ss[i].mfrr = 0xff;
+       }
+
+       /*
+        * icp->ss[cpu_id].cpu is set by each vcpu calling in to
+        * xics_cpu_register().
+        */
+
+       ics = malloc(sizeof(*ics));
+       ics->nr_irqs = nr_irqs;
+       ics->offset = XICS_IRQ_OFFSET;
+       ics->irqs = malloc(nr_irqs * sizeof(struct ics_irq_state));
+
+       icp->ics = ics;
+       ics->icp = icp;
+
+       for (i = 0; i < nr_irqs; i++) {
+               ics->irqs[i].server = 0;
+               ics->irqs[i].priority = 0xff;
+               ics->irqs[i].saved_priority = 0xff;
+               ics->irqs[i].rejected = 0;
+               ics->irqs[i].masked_pending = 0;
+       }
+
+       spapr_register_hypercall(H_CPPR, h_cppr);
+       spapr_register_hypercall(H_IPI, h_ipi);
+       spapr_register_hypercall(H_XIRR, h_xirr);
+       spapr_register_hypercall(H_EOI, h_eoi);
+
+       spapr_rtas_register("ibm,set-xive", rtas_set_xive);
+       spapr_rtas_register("ibm,get-xive", rtas_get_xive);
+       spapr_rtas_register("ibm,int-off", rtas_int_off);
+       spapr_rtas_register("ibm,int-on", rtas_int_on);
+
+       return icp;
+}
+
+void kvm__irq_line(struct kvm *kvm, int irq, int level)
+{
+       /*
+        * Route event to ICS, which routes to ICP, which eventually does a
+        * kvm_cpu__irq(vcpu, POWER7_EXT_IRQ, 1)
+        */
+       xics_dprintf("Raising IRQ %d -> %d\n", irq, level);
+       ics_set_irq_msi(kvm->icp->ics, irq - kvm->icp->ics->offset, level);
+}
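+
+/*
+ * The full delivery path, traced through the layers above: a device
+ * asserts a source via kvm__irq_line() -> ics_set_irq_msi() ->
+ * icp_irq(), which raises POWER7_EXT_IRQ on the target vcpu.  The
+ * guest fetches the source with the H_XIRR hypercall (icp_accept()),
+ * services it, and completes it with H_EOI (icp_eoi()), at which point
+ * icp_resend() redelivers any sources the ICS had to reject meanwhile.
+ */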
diff --git a/tools/kvm/powerpc/xics.h b/tools/kvm/powerpc/xics.h
new file mode 100644 (file)
index 0000000..144915b
--- /dev/null
@@ -0,0 +1,23 @@
+/*
+ * PAPR Virtualized Interrupt System, aka ICS/ICP aka xics
+ *
+ * Copyright 2011 Matt Evans <matt@ozlabs.org>, IBM Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as published
+ * by the Free Software Foundation.
+ */
+
+#ifndef XICS_H
+#define XICS_H
+
+#define XICS_IPI        0x2
+
+struct kvm_cpu;
+struct icp_state;
+
+struct icp_state *xics_system_init(unsigned int nr_irqs, unsigned int nr_cpus);
+void xics_cpu_register(struct kvm_cpu *vcpu);
+int xics_alloc_irqnum(void);
+
+#endif
diff --git a/tools/kvm/symbol.c b/tools/kvm/symbol.c
new file mode 100644 (file)
index 0000000..b76d98b
--- /dev/null
@@ -0,0 +1,131 @@
+#include "kvm/symbol.h"
+
+#include "kvm/kvm.h"
+
+#include <linux/err.h>
+#include <stdlib.h>
+#include <string.h>
+#include <stdio.h>
+#include <bfd.h>
+
+static bfd *abfd;
+
+int symbol_init(struct kvm *kvm)
+{
+       int ret = 0;
+
+       if (!kvm->vmlinux)
+               return -EINVAL;
+
+       bfd_init();
+
+       abfd = bfd_openr(kvm->vmlinux, NULL);
+       if (abfd == NULL) {
+               bfd_error_type err = bfd_get_error();
+
+               switch (err) {
+               case bfd_error_no_memory:
+                       ret = -ENOMEM;
+                       break;
+               case bfd_error_invalid_target:
+                       ret = -EINVAL;
+                       break;
+               default:
+                       ret = -EFAULT;
+                       break;
+               }
+       }
+
+       return ret;
+}
+
+static asymbol *lookup(asymbol **symbols, int nr_symbols, const char *symbol_name)
+{
+       int i, ret;
+
+       ret = -ENOENT;
+
+       for (i = 0; i < nr_symbols; i++) {
+               asymbol *symbol = symbols[i];
+
+               if (!strcmp(bfd_asymbol_name(symbol), symbol_name))
+                       return symbol;
+       }
+
+       return ERR_PTR(ret);
+}
+
+char *symbol_lookup(struct kvm *kvm, unsigned long addr, char *sym, size_t size)
+{
+       const char *filename;
+       bfd_vma sym_offset;
+       bfd_vma sym_start;
+       asection *section;
+       unsigned int line;
+       const char *func;
+       long symtab_size;
+       asymbol *symbol;
+       asymbol **syms;
+       int nr_syms, ret;
+
+       ret = -ENOENT;
+       if (!abfd)
+               goto not_found;
+
+       if (!bfd_check_format(abfd, bfd_object))
+               goto not_found;
+
+       symtab_size = bfd_get_symtab_upper_bound(abfd);
+       if (symtab_size <= 0)
+               goto not_found;
+
+       ret = -ENOMEM;
+       syms = malloc(symtab_size);
+       if (!syms)
+               goto not_found;
+
+       nr_syms = bfd_canonicalize_symtab(abfd, syms);
+
+       ret = -ENOENT;
+       section = bfd_get_section_by_name(abfd, ".debug_aranges");
+       if (!section)
+               goto not_found;
+
+       if (!bfd_find_nearest_line(abfd, section, NULL, addr, &filename, &func, &line))
+               goto not_found;
+
+       if (!func)
+               goto not_found;
+
+       symbol = lookup(syms, nr_syms, func);
+       if (IS_ERR(symbol))
+               goto not_found;
+
+       sym_start = bfd_asymbol_value(symbol);
+
+       sym_offset = addr - sym_start;
+
+       snprintf(sym, size, "%s+%llx (%s:%i)", func, (long long) sym_offset, filename, line);
+
+       sym[size - 1] = '\0';
+
+       free(syms);
+
+       return sym;
+
+not_found:
+       return ERR_PTR(ret);
+}
+
+int symbol_exit(struct kvm *kvm)
+{
+       bfd_boolean ret = TRUE;
+
+       if (abfd)
+               ret = bfd_close(abfd);
+
+       if (ret == TRUE)
+               return 0;
+
+       return -EFAULT;
+}
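+
+/*
+ * Typical call site, for illustration ("ip" is a hypothetical guest
+ * address):
+ *
+ *     char sym[128];
+ *     if (!IS_ERR(symbol_lookup(kvm, ip, sym, sizeof(sym))))
+ *             printf("0x%lx is %s\n", ip, sym);
+ */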
diff --git a/tools/kvm/term.c b/tools/kvm/term.c
new file mode 100644 (file)
index 0000000..cc0c5a5
--- /dev/null
@@ -0,0 +1,166 @@
+#include <poll.h>
+#include <stdbool.h>
+#include <termios.h>
+#include <stdio.h>
+#include <unistd.h>
+#include <sys/uio.h>
+#include <signal.h>
+#include <pty.h>
+#include <utmp.h>
+
+#include "kvm/read-write.h"
+#include "kvm/term.h"
+#include "kvm/util.h"
+#include "kvm/kvm.h"
+#include "kvm/kvm-cpu.h"
+
+#define TERM_FD_IN      0
+#define TERM_FD_OUT     1
+
+extern struct kvm *kvm;
+static struct termios  orig_term;
+
+int term_escape_char   = 0x01; /* ctrl-a is used for escape */
+bool term_got_escape   = false;
+
+int active_console;
+
+int term_fds[4][2];
+
+int term_getc(int who, int term)
+{
+       unsigned char c;
+
+       if (who != active_console)
+               return -1;
+       if (read_in_full(term_fds[term][TERM_FD_IN], &c, 1) < 0)
+               return -1;
+
+       if (term_got_escape) {
+               term_got_escape = false;
+               if (c == 'x')
+                       kvm_cpu__reboot();
+               if (c == term_escape_char)
+                       return c;
+       }
+
+       if (c == term_escape_char) {
+               term_got_escape = true;
+               return -1;
+       }
+
+       return c;
+}
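+
+/*
+ * Escape sequences by example: ctrl-a itself is swallowed (term_getc()
+ * returns -1); a following 'x' reboots the guest, a second ctrl-a sends
+ * a literal ctrl-a, and any other character is passed through unchanged.
+ */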
+
+int term_putc(int who, char *addr, int cnt, int term)
+{
+       int num_remaining = cnt;
+       int ret;
+
+       if (who != active_console)
+               return -1;
+
+       while (num_remaining--) {
+               ret = write(term_fds[term][TERM_FD_OUT], addr++, 1);
+               if (ret < 0)
+                       return cnt - num_remaining - 1;
+       }
+
+       return cnt;
+}
+
+int term_getc_iov(int who, struct iovec *iov, int iovcnt, int term)
+{
+       int c;
+
+       if (who != active_console)
+               return 0;
+
+       c = term_getc(who, term);
+
+       if (c < 0)
+               return 0;
+
+       *((char *)iov[TERM_FD_IN].iov_base)     = (char)c;
+
+       return sizeof(char);
+}
+
+int term_putc_iov(int who, struct iovec *iov, int iovcnt, int term)
+{
+       if (who != active_console)
+               return 0;
+
+       return writev(term_fds[term][TERM_FD_OUT], iov, iovcnt);
+}
+
+bool term_readable(int who, int term)
+{
+       struct pollfd pollfd = (struct pollfd) {
+               .fd     = term_fds[term][TERM_FD_IN],
+               .events = POLLIN,
+               .revents = 0,
+       };
+
+       if (who != active_console)
+               return false;
+
+       return poll(&pollfd, 1, 0) > 0;
+}
+
+static void term_cleanup(void)
+{
+       int i;
+
+       for (i = 0; i < 4; i++)
+               tcsetattr(term_fds[i][TERM_FD_IN], TCSANOW, &orig_term);
+}
+
+static void term_sig_cleanup(int sig)
+{
+       term_cleanup();
+       signal(sig, SIG_DFL);
+       raise(sig);
+}
+
+void term_set_tty(int term)
+{
+       struct termios term_attr;
+       int master, slave;
+       char new_pty[PATH_MAX];
+
+       if (tcgetattr(STDIN_FILENO, &term_attr) < 0)
+               die("unable to save initial standard input settings");
+
+       term_attr.c_lflag &= ~(ICANON | ECHO | ISIG);
+
+       if (openpty(&master, &slave, new_pty, &term_attr, NULL) < 0)
+               return;
+
+       close(slave);
+
+       pr_info("Assigned terminal %d to pty %s\n", term, new_pty);
+
+       term_fds[term][TERM_FD_IN] = term_fds[term][TERM_FD_OUT] = master;
+}
+
+void term_init(void)
+{
+       struct termios term;
+       int i;
+
+       if (tcgetattr(STDIN_FILENO, &orig_term) < 0)
+               die("unable to save initial standard input settings");
+
+       term = orig_term;
+       term.c_lflag &= ~(ICANON | ECHO | ISIG);
+       tcsetattr(STDIN_FILENO, TCSANOW, &term);
+
+       for (i = 0; i < 4; i++)
+               if (term_fds[i][TERM_FD_IN] == 0) {
+                       term_fds[i][TERM_FD_IN] = STDIN_FILENO;
+                       term_fds[i][TERM_FD_OUT] = STDOUT_FILENO;
+               }
+
+       signal(SIGTERM, term_sig_cleanup);
+       atexit(term_cleanup);
+}
diff --git a/tools/kvm/tests/Makefile b/tools/kvm/tests/Makefile
new file mode 100644 (file)
index 0000000..cad14ec
--- /dev/null
@@ -0,0 +1,19 @@
+all: kernel pit boot
+
+kernel:
+       $(MAKE) -C kernel
+.PHONY: kernel
+
+pit:
+       $(MAKE) -C pit
+.PHONY: pit
+
+boot:
+       $(MAKE) -C boot
+.PHONY: boot
+
+clean:
+       $(MAKE) -C kernel clean
+       $(MAKE) -C pit clean
+       $(MAKE) -C boot clean
+.PHONY: clean
diff --git a/tools/kvm/tests/boot/Makefile b/tools/kvm/tests/boot/Makefile
new file mode 100644 (file)
index 0000000..40cba68
--- /dev/null
@@ -0,0 +1,13 @@
+NAME   := init
+
+OBJ    := $(NAME).o
+
+all: $(OBJ)
+       rm -rf rootfs
+       mkdir rootfs
+       gcc -static init.c -o rootfs/init
+       mkisofs rootfs > boot_test.iso
+
+clean:
+       rm -rf rootfs boot_test.iso
+.PHONY: clean
diff --git a/tools/kvm/tests/boot/init.c b/tools/kvm/tests/boot/init.c
new file mode 100644 (file)
index 0000000..094f8ba
--- /dev/null
@@ -0,0 +1,11 @@
+#include <linux/reboot.h>
+#include <sys/reboot.h>
+#include <stdio.h>
+#include <unistd.h>
+
+int main(int argc, char *argv[])
+{
+       puts("hello, KVM guest!\r");
+
+       reboot(LINUX_REBOOT_CMD_RESTART);
+
+       return 0;
+}
diff --git a/tools/kvm/tests/kernel/.gitignore b/tools/kvm/tests/kernel/.gitignore
new file mode 100644 (file)
index 0000000..d0cd209
--- /dev/null
@@ -0,0 +1,2 @@
+kernel.bin
+kernel.elf
diff --git a/tools/kvm/tests/kernel/Makefile b/tools/kvm/tests/kernel/Makefile
new file mode 100644 (file)
index 0000000..c7dd8da
--- /dev/null
@@ -0,0 +1,20 @@
+NAME   := kernel
+
+BIN    := $(NAME).bin
+ELF    := $(NAME).elf
+OBJ    := $(NAME).o
+
+all: $(BIN)
+
+$(BIN): $(ELF)
+       objcopy -O binary $< $@
+
+$(ELF): $(OBJ)
+       ld -Ttext=0x00 -nostdlib -static $< -o $@
+
+%.o: %.S
+       gcc -nostdinc -c $< -o $@
+
+clean:
+       rm -f $(BIN) $(ELF) $(OBJ)
+.PHONY: clean
diff --git a/tools/kvm/tests/kernel/README b/tools/kvm/tests/kernel/README
new file mode 100644 (file)
index 0000000..2923777
--- /dev/null
@@ -0,0 +1,16 @@
+Compiling
+---------
+
+You can simply type:
+
+  $ make
+
+to build a 16-bit binary that uses the i8086 instruction set.
+
+Disassembling
+-------------
+
+Use the "-m i8086" command line option with objdump to make sure it knows we're
+dealing with i8086 instruction set:
+
+  $ objdump -d -m i8086 i8086.elf
diff --git a/tools/kvm/tests/kernel/kernel.S b/tools/kvm/tests/kernel/kernel.S
new file mode 100644 (file)
index 0000000..2824b64
--- /dev/null
@@ -0,0 +1,8 @@
+       .code16gcc
+       .text
+       .globl  _start
+       .type   _start, @function
+_start:
+       # "This is probably the largest possible kernel that is bug free." -- Avi Kivity
+       1:
+       jmp 1b
diff --git a/tools/kvm/tests/pit/.gitignore b/tools/kvm/tests/pit/.gitignore
new file mode 100644 (file)
index 0000000..43f0aa8
--- /dev/null
@@ -0,0 +1,2 @@
+*.bin
+*.elf
diff --git a/tools/kvm/tests/pit/Makefile b/tools/kvm/tests/pit/Makefile
new file mode 100644 (file)
index 0000000..2fae9b2
--- /dev/null
@@ -0,0 +1,20 @@
+NAME   := tick
+
+BIN    := $(NAME).bin
+ELF    := $(NAME).elf
+OBJ    := $(NAME).o
+
+all: $(BIN)
+
+$(BIN): $(ELF)
+       objcopy -O binary $< $@
+
+$(ELF): $(OBJ)
+       ld -Ttext=0x00 -nostdlib -static $< -o $@
+
+%.o: %.S
+       gcc -nostdinc -c $< -o $@
+
+clean:
+       rm -f $(BIN) $(ELF) $(OBJ)
+.PHONY: clean
diff --git a/tools/kvm/tests/pit/README b/tools/kvm/tests/pit/README
new file mode 100644 (file)
index 0000000..2923777
--- /dev/null
@@ -0,0 +1,16 @@
+Compiling
+---------
+
+You can simply type:
+
+  $ make
+
+to build a 16-bit binary that uses the i8086 instruction set.
+
+Disassembling
+-------------
+
+Use the "-m i8086" command line option with objdump to make sure it knows we're
+dealing with i8086 instruction set:
+
+  $ objdump -d -m i8086 i8086.elf
diff --git a/tools/kvm/tests/pit/tick.S b/tools/kvm/tests/pit/tick.S
new file mode 100644 (file)
index 0000000..b9e5a80
--- /dev/null
@@ -0,0 +1,109 @@
+#define IO_PIC         0x20
+#define IRQ_OFFSET     32
+#define IO_PIT         0x40
+#define TIMER_FREQ     1193182
+#define TIMER_DIV(x)   ((TIMER_FREQ+(x)/2)/(x))
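+
+/*
+ * Worked example: TIMER_DIV(1000) = (1193182 + 500) / 1000 = 1193
+ * (0x04a9), so set_pit below programs 0xa9 then 0x04, low byte first,
+ * for a roughly 1kHz tick.
+ */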
+
+/*
+ * hpa noted:
+ *
+ * 0xe0..0xef are "motherboard specific", but 0xe9 is
+ * used for Bochs debugging and 0xed is the Phoenix-reserved
+ * delay port
+ */
+#define DBG_PORT       0xe0
+
+#define TEST_COUNT     0x0200
+
+       .code16gcc
+       .text
+       .globl  _start
+       .type   _start, @function
+_start:
+/*
+ * fill up noop handlers
+ */
+       xorw    %ax, %ax
+       xorw    %di, %di
+       movw    %ax, %es
+       movw    $256, %cx
+fill_noop_idt:
+       movw    $noop_handler, %es:(%di)
+       movw    %cs, %es:2(%di)
+       add     $4, %di
+       loop    fill_noop_idt
+
+set_idt:
+       movw    $timer_isr, %es:(IRQ_OFFSET*4)
+       movw    %cs, %es:(IRQ_OFFSET*4+2)
+
+set_pic:
+       # ICW1
+       mov     $0x11, %al
+       mov     $(IO_PIC), %dx
+       out     %al,%dx
+       # ICW2
+       mov     $(IRQ_OFFSET), %al
+       mov     $(IO_PIC+1), %dx
+       out     %al, %dx
+       # ICW3
+       mov     $0x00, %al
+       mov     $(IO_PIC+1), %dx
+       out     %al, %dx
+       # ICW4
+       mov     $0x3, %al
+       mov     $(IO_PIC+1), %dx
+       out     %al, %dx
+
+set_pit:
+       # set 8254 mode
+       mov     $(IO_PIT+3), %dx
+       mov     $0x34, %al
+       outb    %al, %dx
+       # set 8254 freq 1KHz
+       mov     $(IO_PIT), %dx
+       movb    $(TIMER_DIV(1000) % 256), %al
+       outb    %al, %dx
+       movb    $(TIMER_DIV(1000) / 256), %al
+       outb    %al, %dx
+
+enable_irq0:
+       mov     $0xfe, %al
+       mov     $(IO_PIC+1), %dx
+       out     %al, %dx
+       sti
+loop:
+       1:
+       jmp     1b
+
+test_ok:
+       mov     $0x3f8,%dx
+       cs lea  msg2, %si
+       mov     $(msg2_end-msg2), %cx
+       cs rep/outsb
+
+       /* not a valid port to force exit */
+       outb    %al, $DBG_PORT
+
+timer_isr:
+       cli
+       pushaw
+       pushfw
+       mov     $0x3f8,%dx
+       mov     $0x2e, %al      # .
+       out     %al,%dx
+       decw    count
+       jz      test_ok
+       popfw
+       popaw
+       iretw
+
+noop_handler:
+       iretw
+
+count:
+       .word   TEST_COUNT
+
+msg2:
+       .asciz "\nTest OK\n"
+msg2_end:
diff --git a/tools/kvm/ui/sdl.c b/tools/kvm/ui/sdl.c
new file mode 100644 (file)
index 0000000..708b9a9
--- /dev/null
@@ -0,0 +1,303 @@
+#include "kvm/sdl.h"
+
+#include "kvm/framebuffer.h"
+#include "kvm/i8042.h"
+#include "kvm/util.h"
+#include "kvm/kvm.h"
+#include "kvm/kvm-cpu.h"
+
+#include <SDL/SDL.h>
+#include <pthread.h>
+#include <signal.h>
+
+#define FRAME_RATE             25
+
+#define SCANCODE_UNKNOWN      0
+#define SCANCODE_NORMAL       1
+#define SCANCODE_ESCAPED      2
+#define SCANCODE_KEY_PAUSE    3
+#define SCANCODE_KEY_PRNTSCRN 4
+
+struct set2_scancode {
+       u8 code;
+       u8 type;
+};
+
+#define DEFINE_SC(_code) {\
+       .code = _code,\
+       .type = SCANCODE_NORMAL,\
+}
+
+/* escaped scancodes */
+#define DEFINE_ESC(_code) {\
+       .code = _code,\
+       .type = SCANCODE_ESCAPED,\
+}
+
+static const struct set2_scancode keymap[256] = {
+       [9]     = DEFINE_SC(0x76),      /* <esc> */
+       [10]    = DEFINE_SC(0x16),      /* 1 */
+       [11]    = DEFINE_SC(0x1e),      /* 2 */
+       [12]    = DEFINE_SC(0x26),      /* 3 */
+       [13]    = DEFINE_SC(0x25),      /* 4 */
+       [14]    = DEFINE_SC(0x2e),      /* 5 */
+       [15]    = DEFINE_SC(0x36),      /* 6 */
+       [16]    = DEFINE_SC(0x3d),      /* 7 */
+       [17]    = DEFINE_SC(0x3e),      /* 8 */
+       [18]    = DEFINE_SC(0x46),      /* 9 */
+       [19]    = DEFINE_SC(0x45),      /* 0 */
+       [20]    = DEFINE_SC(0x4e),      /* - */
+       [21]    = DEFINE_SC(0x55),      /* + */
+       [22]    = DEFINE_SC(0x66),      /* <backspace> */
+       [23]    = DEFINE_SC(0x0d),      /* <tab> */
+       [24]    = DEFINE_SC(0x15),      /* q */
+       [25]    = DEFINE_SC(0x1d),      /* w */
+       [26]    = DEFINE_SC(0x24),      /* e */
+       [27]    = DEFINE_SC(0x2d),      /* r */
+       [28]    = DEFINE_SC(0x2c),      /* t */
+       [29]    = DEFINE_SC(0x35),      /* y */
+       [30]    = DEFINE_SC(0x3c),      /* u */
+       [31]    = DEFINE_SC(0x43),      /* i */
+       [32]    = DEFINE_SC(0x44),      /* o */
+       [33]    = DEFINE_SC(0x4d),      /* p */
+       [34]    = DEFINE_SC(0x54),      /* [ */
+       [35]    = DEFINE_SC(0x5b),      /* ] */
+       [36]    = DEFINE_SC(0x5a),      /* <enter> */
+       [37]    = DEFINE_SC(0x14),      /* <left ctrl> */
+       [38]    = DEFINE_SC(0x1c),      /* a */
+       [39]    = DEFINE_SC(0x1b),      /* s */
+       [40]    = DEFINE_SC(0x23),      /* d */
+       [41]    = DEFINE_SC(0x2b),      /* f */
+       [42]    = DEFINE_SC(0x34),      /* g */
+       [43]    = DEFINE_SC(0x33),      /* h */
+       [44]    = DEFINE_SC(0x3b),      /* j */
+       [45]    = DEFINE_SC(0x42),      /* k */
+       [46]    = DEFINE_SC(0x4b),      /* l */
+       [47]    = DEFINE_SC(0x4c),      /* ; */
+       [48]    = DEFINE_SC(0x52),      /* ' */
+       [49]    = DEFINE_SC(0x0e),      /* ` */
+       [50]    = DEFINE_SC(0x12),      /* <left shift> */
+       [51]    = DEFINE_SC(0x5d),      /* \ */
+       [52]    = DEFINE_SC(0x1a),      /* z */
+       [53]    = DEFINE_SC(0x22),      /* x */
+       [54]    = DEFINE_SC(0x21),      /* c */
+       [55]    = DEFINE_SC(0x2a),      /* v */
+       [56]    = DEFINE_SC(0x32),      /* b */
+       [57]    = DEFINE_SC(0x31),      /* n */
+       [58]    = DEFINE_SC(0x3a),      /* m */
+       [59]    = DEFINE_SC(0x41),      /* < */
+       [60]    = DEFINE_SC(0x49),      /* > */
+       [61]    = DEFINE_SC(0x4a),      /* / */
+       [62]    = DEFINE_SC(0x59),      /* <right shift> */
+       [63]    = DEFINE_SC(0x7c),      /* keypad * */
+       [64]    = DEFINE_SC(0x11),      /* <left alt> */
+       [65]    = DEFINE_SC(0x29),      /* <space> */
+
+       [67]    = DEFINE_SC(0x05),      /* <F1> */
+       [68]    = DEFINE_SC(0x06),      /* <F2> */
+       [69]    = DEFINE_SC(0x04),      /* <F3> */
+       [70]    = DEFINE_SC(0x0c),      /* <F4> */
+       [71]    = DEFINE_SC(0x03),      /* <F5> */
+       [72]    = DEFINE_SC(0x0b),      /* <F6> */
+       [73]    = DEFINE_SC(0x83),      /* <F7> */
+       [74]    = DEFINE_SC(0x0a),      /* <F8> */
+       [75]    = DEFINE_SC(0x01),      /* <F9> */
+       [76]    = DEFINE_SC(0x09),      /* <F10> */
+
+       [79]    = DEFINE_SC(0x6c),      /* keypad 7 */
+       [80]    = DEFINE_SC(0x75),      /* keypad 8 */
+       [81]    = DEFINE_SC(0x7d),      /* keypad 9 */
+       [82]    = DEFINE_SC(0x7b),      /* keypad - */
+       [83]    = DEFINE_SC(0x6b),      /* keypad 4 */
+       [84]    = DEFINE_SC(0x73),      /* keypad 5 */
+       [85]    = DEFINE_SC(0x74),      /* keypad 6 */
+       [86]    = DEFINE_SC(0x79),      /* keypad + */
+       [87]    = DEFINE_SC(0x69),      /* keypad 1 */
+       [88]    = DEFINE_SC(0x72),      /* keypad 2 */
+       [89]    = DEFINE_SC(0x7a),      /* keypad 3 */
+       [90]    = DEFINE_SC(0x70),      /* keypad 0 */
+       [91]    = DEFINE_SC(0x71),      /* keypad . */
+
+       [94]    = DEFINE_SC(0x61),      /* <INT 1> */
+       [95]    = DEFINE_SC(0x78),      /* <F11> */
+       [96]    = DEFINE_SC(0x07),      /* <F12> */
+
+       [104]   = DEFINE_ESC(0x5a),     /* keypad <enter> */
+       [105]   = DEFINE_ESC(0x14),     /* <right ctrl> */
+       [106]   = DEFINE_ESC(0x4a),     /* keypad / */
+       [108]   = DEFINE_ESC(0x11),     /* <right alt> */
+       [110]   = DEFINE_ESC(0x6c),     /* <home> */
+       [111]   = DEFINE_ESC(0x75),     /* <up> */
+       [112]   = DEFINE_ESC(0x7d),     /* <page up> */
+       [113]   = DEFINE_ESC(0x6b),     /* <left> */
+       [114]   = DEFINE_ESC(0x74),     /* <right> */
+       [115]   = DEFINE_ESC(0x69),     /* <end> */
+       [116]   = DEFINE_ESC(0x72),     /* <down> */
+       [117]   = DEFINE_ESC(0x7a),     /* <page down> */
+       [118]   = DEFINE_ESC(0x70),     /* <ins> */
+       [119]   = DEFINE_ESC(0x71),     /* <delete> */
+};
+static bool running, done;
+
+static const struct set2_scancode *to_code(u8 scancode)
+{
+       return &keymap[scancode];
+}
+
+static void key_press(const struct set2_scancode *sc)
+{
+       switch (sc->type) {
+       case SCANCODE_ESCAPED:
+               kbd_queue(0xe0);
+               /* fallthrough */
+       case SCANCODE_NORMAL:
+               kbd_queue(sc->code);
+               break;
+       case SCANCODE_KEY_PAUSE:
+               kbd_queue(0xe1);
+               kbd_queue(0x14);
+               kbd_queue(0x77);
+               kbd_queue(0xe1);
+               kbd_queue(0xf0);
+               kbd_queue(0x14);
+               kbd_queue(0x77);
+               break;
+       case SCANCODE_KEY_PRNTSCRN:
+               kbd_queue(0xe0);
+               kbd_queue(0x12);
+               kbd_queue(0xe0);
+               kbd_queue(0x7c);
+               break;
+       }
+}
+
+static void key_release(const struct set2_scancode *sc)
+{
+       switch (sc->type) {
+       case SCANCODE_ESCAPED:
+               kbd_queue(0xe0);
+               /* fallthrough */
+       case SCANCODE_NORMAL:
+               kbd_queue(0xf0);
+               kbd_queue(sc->code);
+               break;
+       case SCANCODE_KEY_PAUSE:
+               /* nothing to do */
+               break;
+       case SCANCODE_KEY_PRNTSCRN:
+               kbd_queue(0xe0);
+               kbd_queue(0xf0);
+               kbd_queue(0x7c);
+               kbd_queue(0xe0);
+               kbd_queue(0xf0);
+               kbd_queue(0x12);
+               break;
+       }
+}
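+
+/*
+ * Resulting set-2 byte streams, by example: 'a' (a normal code) queues
+ * 1c on press and f0 1c on release; right-ctrl (an escaped code) queues
+ * e0 14 on press and e0 f0 14 on release.
+ */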
+
+static void *sdl__thread(void *p)
+{
+       Uint32 rmask, gmask, bmask, amask;
+       struct framebuffer *fb = p;
+       SDL_Surface *guest_screen;
+       SDL_Surface *screen;
+       SDL_Event ev;
+       Uint32 flags;
+
+       if (SDL_Init(SDL_INIT_VIDEO) != 0)
+               die("Unable to initialize SDL");
+
+       rmask = 0x000000ff;
+       gmask = 0x0000ff00;
+       bmask = 0x00ff0000;
+       amask = 0x00000000;
+
+       guest_screen = SDL_CreateRGBSurfaceFrom(fb->mem, fb->width, fb->height, fb->depth, fb->width * fb->depth / 8, rmask, gmask, bmask, amask);
+       if (!guest_screen)
+               die("Unable to create SDL RBG surface");
+
+       flags = SDL_HWSURFACE | SDL_ASYNCBLIT | SDL_HWACCEL | SDL_DOUBLEBUF;
+
+       SDL_WM_SetCaption("KVM tool", "KVM tool");
+
+       screen = SDL_SetVideoMode(fb->width, fb->height, fb->depth, flags);
+       if (!screen)
+               die("Unable to set SDL video mode");
+
+       SDL_EnableKeyRepeat(200, 50);
+
+       while (running) {
+               SDL_BlitSurface(guest_screen, NULL, screen, NULL);
+               SDL_Flip(screen);
+
+               while (SDL_PollEvent(&ev)) {
+                       switch (ev.type) {
+                       case SDL_KEYDOWN: {
+                               const struct set2_scancode *sc = to_code(ev.key.keysym.scancode);
+                               if (sc->type == SCANCODE_UNKNOWN) {
+                                       pr_warning("key '%d' not found in keymap", ev.key.keysym.scancode);
+                                       break;
+                               }
+                               key_press(sc);
+                               break;
+                       }
+                       case SDL_KEYUP: {
+                               const struct set2_scancode *sc = to_code(ev.key.keysym.scancode);
+                               if (sc->type == SCANCODE_UNKNOWN)
+                                       break;
+                               key_release(sc);
+                               break;
+                       }
+                       case SDL_QUIT:
+                               goto exit;
+                       }
+               }
+
+               SDL_Delay(1000 / FRAME_RATE);
+       }
+
+       if (!done) {
+               done = true;
+               return NULL;
+       }
+exit:
+       kvm_cpu__reboot();
+
+       return NULL;
+}
+
+static int sdl__start(struct framebuffer *fb)
+{
+       pthread_t thread;
+
+       running = true;
+
+       if (pthread_create(&thread, NULL, sdl__thread, fb) != 0)
+               return -1;
+
+       return 0;
+}
+
+static int sdl__stop(struct framebuffer *fb)
+{
+       running = false;
+       while (done == false)
+               sleep(0);
+
+       return 0;
+}
+
+static struct fb_target_operations sdl_ops = {
+       .start  = sdl__start,
+       .stop   = sdl__stop,
+};
+
+int sdl__init(struct framebuffer *fb)
+{
+       return fb__attach(fb, &sdl_ops);
+}
+
+int sdl__exit(struct framebuffer *fb)
+{
+       return sdl__stop(fb);
+}
diff --git a/tools/kvm/ui/vnc.c b/tools/kvm/ui/vnc.c
new file mode 100644 (file)
index 0000000..91254c5
--- /dev/null
@@ -0,0 +1,230 @@
+#include "kvm/vnc.h"
+
+#include "kvm/framebuffer.h"
+#include "kvm/i8042.h"
+
+#include <linux/types.h>
+#include <rfb/keysym.h>
+#include <rfb/rfb.h>
+#include <pthread.h>
+
+#define VESA_QUEUE_SIZE                128
+#define VESA_IRQ               14
+
+/*
+ * This "6000" value is pretty much the result of experimentation
+ * It seems that around this value, things update pretty smoothly
+ */
+#define VESA_UPDATE_TIME       6000
+
+/*
+ * We can map the letters and numbers without a fuss,
+ * but the other characters not so much.
+ */
+static char letters[26] = {
+       0x1c, 0x32, 0x21, 0x23, 0x24, /* a-e */
+       0x2b, 0x34, 0x33, 0x43, 0x3b, /* f-j */
+       0x42, 0x4b, 0x3a, 0x31, 0x44, /* k-o */
+       0x4d, 0x15, 0x2d, 0x1b, 0x2c, /* p-t */
+       0x3c, 0x2a, 0x1d, 0x22, 0x35, /* u-y */
+       0x1a,
+};
+
+static rfbScreenInfoPtr server;
+static char num[10] = {
+       0x45, 0x16, 0x1e, 0x26, 0x2e, 0x23, 0x36, 0x3d, 0x3e, 0x46,
+};
+
+/*
+ * This is called when the VNC server receives a key event.
+ * The reason this function is such a beast is that we have
+ * to convert from ASCII characters (which is what VNC gets)
+ * to PC keyboard scancodes, which is what Linux expects to
+ * get from its keyboard. ASCII and the scancode set don't
+ * really seem to mesh in any good way beyond some basics with
+ * the letters and numbers.
+ */
+static void kbd_handle_key(rfbBool down, rfbKeySym key, rfbClientPtr cl)
+{
+       char tosend = 0;
+
+       if (key >= 0x41 && key <= 0x5a)
+               key += 0x20; /* convert to lowercase */
+
+       if (key >= 0x61 && key <= 0x7a) /* a-z */
+               tosend = letters[key - 0x61];
+
+       if (key >= 0x30 && key <= 0x39)
+               tosend = num[key - 0x30];
+
+       switch (key) {
+       case XK_Insert:         kbd_queue(0xe0);        tosend = 0x70;  break;
+       case XK_Delete:         kbd_queue(0xe0);        tosend = 0x71;  break;
+       case XK_Up:             kbd_queue(0xe0);        tosend = 0x75;  break;
+       case XK_Down:           kbd_queue(0xe0);        tosend = 0x72;  break;
+       case XK_Left:           kbd_queue(0xe0);        tosend = 0x6b;  break;
+       case XK_Right:          kbd_queue(0xe0);        tosend = 0x74;  break;
+       case XK_Page_Up:        kbd_queue(0xe0);        tosend = 0x7d;  break;
+       case XK_Page_Down:      kbd_queue(0xe0);        tosend = 0x7a;  break;
+       case XK_Home:           kbd_queue(0xe0);        tosend = 0x6c;  break;
+       case XK_BackSpace:      tosend = 0x66;          break;
+       case XK_Tab:            tosend = 0x0d;          break;
+       case XK_Return:         tosend = 0x5a;          break;
+       case XK_Escape:         tosend = 0x76;          break;
+       case XK_End:            tosend = 0x69;          break;
+       case XK_Shift_L:        tosend = 0x12;          break;
+       case XK_Shift_R:        tosend = 0x59;          break;
+       case XK_Control_R:      kbd_queue(0xe0);
+                               /* fall through */
+       case XK_Control_L:      tosend = 0x14;          break;
+       case XK_Alt_R:          kbd_queue(0xe0);
+                               /* fall through */
+       case XK_Alt_L:          tosend = 0x11;          break;
+       case XK_quoteleft:      tosend = 0x0e;          break;
+       case XK_minus:          tosend = 0x4e;          break;
+       case XK_equal:          tosend = 0x55;          break;
+       case XK_bracketleft:    tosend = 0x54;          break;
+       case XK_bracketright:   tosend = 0x5b;          break;
+       case XK_backslash:      tosend = 0x5d;          break;
+       case XK_Caps_Lock:      tosend = 0x58;          break;
+       case XK_semicolon:      tosend = 0x4c;          break;
+       case XK_quoteright:     tosend = 0x52;          break;
+       case XK_comma:          tosend = 0x41;          break;
+       case XK_period:         tosend = 0x49;          break;
+       case XK_slash:          tosend = 0x4a;          break;
+       case XK_space:          tosend = 0x29;          break;
+
+       /*
+        * This is where I handle the shifted characters.
+        * They don't really map nicely the way A-Z maps to a-z,
+        * so I'm doing it manually
+        */
+       case XK_exclam:         tosend = 0x16;          break;
+       case XK_quotedbl:       tosend = 0x52;          break;
+       case XK_numbersign:     tosend = 0x26;          break;
+       case XK_dollar:         tosend = 0x25;          break;
+       case XK_percent:        tosend = 0x2e;          break;
+       case XK_ampersand:      tosend = 0x3d;          break;
+       case XK_parenleft:      tosend = 0x46;          break;
+       case XK_parenright:     tosend = 0x45;          break;
+       case XK_asterisk:       tosend = 0x3e;          break;
+       case XK_plus:           tosend = 0x55;          break;
+       case XK_colon:          tosend = 0x4c;          break;
+       case XK_less:           tosend = 0x41;          break;
+       case XK_greater:        tosend = 0x49;          break;
+       case XK_question:       tosend = 0x4a;          break;
+       case XK_at:             tosend = 0x1e;          break;
+       case XK_asciicircum:    tosend = 0x36;          break;
+       case XK_underscore:     tosend = 0x4e;          break;
+       case XK_braceleft:      tosend = 0x54;          break;
+       case XK_braceright:     tosend = 0x5b;          break;
+       case XK_bar:            tosend = 0x5d;          break;
+       case XK_asciitilde:     tosend = 0x0e;          break;
+       default:                break;
+       }
+
+       /*
+        * If this is a "key up" event (the user has released the key), we
+        * need to send 0xf0 first.
+        */
+       if (!down && tosend != 0x0)
+               kbd_queue(0xf0);
+
+       if (tosend)
+               kbd_queue(tosend);
+}
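+
+/*
+ * Worked example: VNC delivers 'A' as keysym 0x41, which is folded to
+ * 'a' (0x61) and mapped through letters[] to scancode 0x1c; on release
+ * the 0xf0 break prefix goes out first, so the guest sees f0 1c.
+ */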
+
+/* The previous X and Y coordinates of the mouse */
+static int xlast = -1, ylast = -1;
+
+/*
+ * This function is called by the VNC server whenever a mouse event occurs.
+ */
+static void kbd_handle_ptr(int buttonMask, int x, int y, rfbClientPtr cl)
+{
+       int dx, dy;
+       char b1 = 0x8;
+
+       /* The VNC mask and the PS/2 button encoding are the same */
+       b1 |= buttonMask;
+
+       if (xlast >= 0 && ylast >= 0) {
+               /* The PS/2 mouse sends deltas, not absolutes */
+               dx = x - xlast;
+               dy = ylast - y;
+
+               /* Set overflow bits if needed */
+               if (dy > 255)
+                       b1 |= 0x80;
+               if (dx > 255)
+                       b1 |= 0x40;
+
+               /* Set negative bits if needed */
+               if (dy < 0)
+                       b1 |= 0x20;
+               if (dx < 0)
+                       b1 |= 0x10;
+
+               mouse_queue(b1);
+               mouse_queue(dx);
+               mouse_queue(dy);
+       }
+
+       xlast = x;
+       ylast = y;
+       rfbDefaultPtrAddEvent(buttonMask, x, y, cl);
+}
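+
+/*
+ * Example packet: a move from (100, 100) to (105, 98) with no buttons
+ * held gives dx = 5 and dy = 2 (the Y axis is flipped), so the guest
+ * receives 0x08, 0x05, 0x02.
+ */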
+
+static void *vnc__thread(void *p)
+{
+       struct framebuffer *fb = p;
+       /*
+        * Make a fake argc and argv because the getscreen function
+        * seems to want it.
+        */
+       char argv[1][1] = {{0}};
+       int argc = 1;
+
+       server = rfbGetScreen(&argc, (char **) argv, fb->width, fb->height, 8, 3, 4);
+       server->frameBuffer             = fb->mem;
+       server->alwaysShared            = TRUE;
+       server->kbdAddEvent             = kbd_handle_key;
+       server->ptrAddEvent             = kbd_handle_ptr;
+       rfbInitServer(server);
+
+       while (rfbIsActive(server)) {
+               rfbMarkRectAsModified(server, 0, 0, fb->width, fb->height);
+               rfbProcessEvents(server, server->deferUpdateTime * VESA_UPDATE_TIME);
+       }
+       return NULL;
+}
+
+static int vnc__start(struct framebuffer *fb)
+{
+       pthread_t thread;
+
+       if (pthread_create(&thread, NULL, vnc__thread, fb) != 0)
+               return -1;
+
+       return 0;
+}
+
+static int vnc__stop(struct framebuffer *fb)
+{
+       rfbShutdownServer(server, TRUE);
+
+       return 0;
+}
+
+static struct fb_target_operations vnc_ops = {
+       .start  = vnc__start,
+       .stop   = vnc__stop,
+};
+
+int vnc__init(struct framebuffer *fb)
+{
+       return fb__attach(fb, &vnc_ops);
+}
+
+int vnc__exit(struct framebuffer *fb)
+{
+       return vnc__stop(fb);
+}
\ No newline at end of file
diff --git a/tools/kvm/util/KVMTOOLS-VERSION-GEN b/tools/kvm/util/KVMTOOLS-VERSION-GEN
new file mode 100755 (executable)
index 0000000..1af9d6c
--- /dev/null
@@ -0,0 +1,40 @@
+#!/bin/sh
+
+if [ $# -eq 1 ]  ; then
+       OUTPUT=$1
+fi
+
+GVF=${OUTPUT}KVMTOOLS-VERSION-FILE
+
+LF='
+'
+
+# First check if there is a .git to get the version from git describe;
+# otherwise try to get the version from the kernel Makefile.
+if test -d ../../.git -o -f ../../.git &&
+       VN=$(git describe --abbrev=4 HEAD 2>/dev/null) &&
+       case "$VN" in
+       *$LF*) (exit 1) ;;
+       v[0-9]*)
+               git update-index -q --refresh
+               test -z "$(git diff-index --name-only HEAD --)" ||
+               VN="$VN-dirty" ;;
+       esac
+then
+       VN=$(echo "$VN" | sed -e 's/-/./g');
+else
+       VN=$(MAKEFLAGS= make -sC ../.. kernelversion)
+fi
+
+VN=$(expr "$VN" : v*'\(.*\)')
+
+if test -r $GVF
+then
+       VC=$(sed -e 's/^KVMTOOLS_VERSION = //' <$GVF)
+else
+       VC=unset
+fi
+test "$VN" = "$VC" || {
+       echo >&2 "KVMTOOLS_VERSION = $VN"
+       echo "KVMTOOLS_VERSION = $VN" >$GVF
+}
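+
+# For illustration: with a tag v3.0-rc5 and 873 commits on top of it,
+# "git describe" prints something like v3.0-rc5-873-gb73a, the sed above
+# rewrites it to v3.0.rc5.873.gb73a, and this file ends up containing
+# "KVMTOOLS_VERSION = 3.0.rc5.873.gb73a".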
diff --git a/tools/kvm/util/generate-cmdlist.sh b/tools/kvm/util/generate-cmdlist.sh
new file mode 100755 (executable)
index 0000000..c8be0bd
--- /dev/null
@@ -0,0 +1,23 @@
+#!/bin/sh
+
+echo "/* Automatically generated by $0 */
+struct cmdname_help
+{
+    char name[16];
+    char help[80];
+};
+
+static struct cmdname_help common_cmds[] = {"
+
+sed -n 's/^lkvm-\([^ \t]*\).*common/\1/p' command-list.txt |
+while read cmd
+do
+       # TODO: the following sed command should be fixed
+       sed -n '/^NAME/,/^lkvm-'"$cmd"'/ {
+               /NAME/d
+               /--/d
+               s/.*kvm-'"$cmd"' - \(.*\)/  {"'"$cmd"'", "\1"},/
+               p
+       }' "Documentation/kvm-$cmd.txt"
+done
+echo "};"
diff --git a/tools/kvm/util/kvm-ifup-vbr0 b/tools/kvm/util/kvm-ifup-vbr0
new file mode 100755 (executable)
index 0000000..a91c37f
--- /dev/null
@@ -0,0 +1,6 @@
+#!/bin/sh
+switch=vbr0
+/sbin/ifconfig $1 0.0.0.0 up
+/usr/sbin/brctl addif ${switch} $1
+/usr/sbin/brctl setfd ${switch} 0
+/usr/sbin/brctl stp ${switch} off
diff --git a/tools/kvm/util/parse-options.c b/tools/kvm/util/parse-options.c
new file mode 100644 (file)
index 0000000..9a1bbee
--- /dev/null
@@ -0,0 +1,577 @@
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+#include <unistd.h>
+
+#include <stdbool.h>
+
+/* user defined includes */
+#include <linux/types.h>
+#include <kvm/util.h>
+#include <kvm/parse-options.h>
+#include <kvm/strbuf.h>
+
+#define OPT_SHORT 1
+#define OPT_UNSET 2
+
+static int opterror(const struct option *opt, const char *reason, int flags)
+{
+       if (flags & OPT_SHORT)
+               return pr_err("switch `%c' %s", opt->short_name, reason);
+       if (flags & OPT_UNSET)
+               return pr_err("option `no-%s' %s", opt->long_name, reason);
+       return pr_err("option `%s' %s", opt->long_name, reason);
+}
+
+static int get_arg(struct parse_opt_ctx_t *p, const struct option *opt,
+               int flags, const char **arg)
+{
+       if (p->opt) {
+               *arg = p->opt;
+               p->opt = NULL;
+       } else if ((opt->flags & PARSE_OPT_LASTARG_DEFAULT) && (p->argc == 1 ||
+                               **(p->argv + 1) == '-')) {
+               *arg = (const char *)opt->defval;
+       } else if (p->argc > 1) {
+               p->argc--;
+               *arg = *++p->argv;
+       } else
+               return opterror(opt, "requires a value", flags);
+       return 0;
+}
+
+static int readnum(const struct option *opt, int flags,
+                  const char *str, char **end)
+{
+       switch (opt->type) {
+       case OPTION_INTEGER:
+               *(int *)opt->value = strtol(str, end, 0);
+               break;
+       case OPTION_UINTEGER:
+               *(unsigned int *)opt->value = strtol(str, end, 0);
+               break;
+       case OPTION_LONG:
+               *(long *)opt->value = strtol(str, end, 0);
+               break;
+       case OPTION_U64:
+               *(u64 *)opt->value = strtoull(str, end, 0);
+               break;
+       default:
+               return opterror(opt, "invalid numeric conversion", flags);
+       }
+
+       return 0;
+}
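+
+/*
+ * Note the base-0 conversions above accept decimal, octal and hex
+ * alike: for a hypothetical OPTION_U64 option --size, "--size 256",
+ * "--size 0400" and "--size 0x100" all store 256.
+ */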
+
+static int get_value(struct parse_opt_ctx_t *p,
+               const struct option *opt, int flags)
+{
+       const char *s, *arg = NULL;
+       const int unset = flags & OPT_UNSET;
+
+       if (unset && p->opt)
+               return opterror(opt, "takes no value", flags);
+       if (unset && (opt->flags & PARSE_OPT_NONEG))
+               return opterror(opt, "isn't available", flags);
+
+       if (!(flags & OPT_SHORT) && p->opt) {
+               switch (opt->type) {
+               case OPTION_CALLBACK:
+                       if (!(opt->flags & PARSE_OPT_NOARG))
+                               break;
+               /* FALLTHROUGH */
+               case OPTION_BOOLEAN:
+               case OPTION_INCR:
+               case OPTION_BIT:
+               case OPTION_SET_UINT:
+               case OPTION_SET_PTR:
+                       return opterror(opt, "takes no value", flags);
+               case OPTION_END:
+               case OPTION_ARGUMENT:
+               case OPTION_GROUP:
+               case OPTION_STRING:
+               case OPTION_INTEGER:
+               case OPTION_UINTEGER:
+               case OPTION_LONG:
+               case OPTION_U64:
+               default:
+                       break;
+               }
+       }
+
+       switch (opt->type) {
+       case OPTION_BIT:
+               if (unset)
+                       *(int *)opt->value &= ~opt->defval;
+               else
+                       *(int *)opt->value |= opt->defval;
+               return 0;
+
+       case OPTION_BOOLEAN:
+               *(bool *)opt->value = unset ? false : true;
+               return 0;
+
+       case OPTION_INCR:
+               *(int *)opt->value = unset ? 0 : *(int *)opt->value + 1;
+               return 0;
+
+       case OPTION_SET_UINT:
+               *(unsigned int *)opt->value = unset ? 0 : opt->defval;
+               return 0;
+
+       case OPTION_SET_PTR:
+               *(void **)opt->value = unset ? NULL : (void *)opt->defval;
+               return 0;
+
+       case OPTION_STRING:
+               if (unset)
+                       *(const char **)opt->value = NULL;
+               else if (opt->flags & PARSE_OPT_OPTARG && !p->opt)
+                       *(const char **)opt->value = (const char *)opt->defval;
+               else
+                       return get_arg(p, opt, flags,
+                                       (const char **)opt->value);
+               return 0;
+
+       case OPTION_CALLBACK:
+               if (unset)
+                       return (*opt->callback)(opt, NULL, 1) ? (-1) : 0;
+               if (opt->flags & PARSE_OPT_NOARG)
+                       return (*opt->callback)(opt, NULL, 0) ? (-1) : 0;
+               if (opt->flags & PARSE_OPT_OPTARG && !p->opt)
+                       return (*opt->callback)(opt, NULL, 0) ? (-1) : 0;
+               if (get_arg(p, opt, flags, &arg))
+                       return -1;
+               return (*opt->callback)(opt, arg, 0) ? (-1) : 0;
+
+       case OPTION_INTEGER:
+               if (unset) {
+                       *(int *)opt->value = 0;
+                       return 0;
+               }
+               if (opt->flags & PARSE_OPT_OPTARG && !p->opt) {
+                       *(int *)opt->value = opt->defval;
+                       return 0;
+               }
+               if (get_arg(p, opt, flags, &arg))
+                       return -1;
+               return readnum(opt, flags, arg, (char **)&s);
+
+       case OPTION_UINTEGER:
+               if (unset) {
+                       *(unsigned int *)opt->value = 0;
+                       return 0;
+               }
+               if (opt->flags & PARSE_OPT_OPTARG && !p->opt) {
+                       *(unsigned int *)opt->value = opt->defval;
+                       return 0;
+               }
+               if (get_arg(p, opt, flags, &arg))
+                       return -1;
+               return readnum(opt, flags, arg, (char **)&s);
+
+       case OPTION_LONG:
+               if (unset) {
+                       *(long *)opt->value = 0;
+                       return 0;
+               }
+               if (opt->flags & PARSE_OPT_OPTARG && !p->opt) {
+                       *(long *)opt->value = opt->defval;
+                       return 0;
+               }
+               if (get_arg(p, opt, flags, &arg))
+                       return -1;
+               return readnum(opt, flags, arg, (char **)&s);
+
+       case OPTION_U64:
+               if (unset) {
+                       *(u64 *)opt->value = 0;
+                       return 0;
+               }
+               if (opt->flags & PARSE_OPT_OPTARG && !p->opt) {
+                       *(u64 *)opt->value = opt->defval;
+                       return 0;
+               }
+               if (get_arg(p, opt, flags, &arg))
+                       return -1;
+               return readnum(opt, flags, arg, (char **)&s);
+
+       case OPTION_END:
+       case OPTION_ARGUMENT:
+       case OPTION_GROUP:
+       default:
+               die("should not happen, someone must be hit on the forehead");
+       }
+}
+
+#define USAGE_OPTS_WIDTH 24
+#define USAGE_GAP         2
+
+static int usage_with_options_internal(const char * const *usagestr,
+               const struct option *opts, int full)
+{
+       if (!usagestr)
+               return PARSE_OPT_HELP;
+
+       fprintf(stderr, "\n usage: %s\n", *usagestr++);
+       while (*usagestr && **usagestr)
+               fprintf(stderr, "    or: %s\n", *usagestr++);
+       while (*usagestr) {
+               fprintf(stderr, "%s%s\n",
+                               **usagestr ? "    " : "",
+                               *usagestr);
+               usagestr++;
+       }
+
+       if (opts->type != OPTION_GROUP)
+               fputc('\n', stderr);
+
+       for (; opts->type != OPTION_END; opts++) {
+               size_t pos;
+               int pad;
+
+               if (opts->type == OPTION_GROUP) {
+                       fputc('\n', stderr);
+                       if (*opts->help)
+                               fprintf(stderr, "%s\n", opts->help);
+                       continue;
+               }
+               if (!full && (opts->flags & PARSE_OPT_HIDDEN))
+                       continue;
+
+               pos = fprintf(stderr, "    ");
+               if (opts->short_name)
+                       pos += fprintf(stderr, "-%c", opts->short_name);
+               else
+                       pos += fprintf(stderr, "    ");
+
+               if (opts->long_name && opts->short_name)
+                       pos += fprintf(stderr, ", ");
+               if (opts->long_name)
+                       pos += fprintf(stderr, "--%s", opts->long_name);
+
+               switch (opts->type) {
+               case OPTION_ARGUMENT:
+                       break;
+               case OPTION_LONG:
+               case OPTION_U64:
+               case OPTION_INTEGER:
+               case OPTION_UINTEGER:
+                       if (opts->flags & PARSE_OPT_OPTARG)
+                               if (opts->long_name)
+                                       pos += fprintf(stderr, "[=<n>]");
+                               else
+                                       pos += fprintf(stderr, "[<n>]");
+                       else
+                               pos += fprintf(stderr, " <n>");
+                       break;
+               case OPTION_CALLBACK:
+                       if (opts->flags & PARSE_OPT_NOARG)
+                               break;
+               /* FALLTHROUGH */
+               case OPTION_STRING:
+                       if (opts->argh) {
+                               if (opts->flags & PARSE_OPT_OPTARG)
+                                       if (opts->long_name)
+                                               pos += fprintf(stderr, "[=<%s>]", opts->argh);
+                                       else
+                                               pos += fprintf(stderr, "[<%s>]", opts->argh);
+                               else
+                                       pos += fprintf(stderr, " <%s>", opts->argh);
+                       } else {
+                               if (opts->flags & PARSE_OPT_OPTARG)
+                                       if (opts->long_name)
+                                               pos += fprintf(stderr, "[=...]");
+                                       else
+                                               pos += fprintf(stderr, "[...]");
+                               else
+                                       pos += fprintf(stderr, " ...");
+                       }
+                       break;
+               default: /* OPTION_{BIT,BOOLEAN,SET_UINT,SET_PTR} */
+               case OPTION_END:
+               case OPTION_GROUP:
+               case OPTION_BIT:
+               case OPTION_BOOLEAN:
+               case OPTION_INCR:
+               case OPTION_SET_UINT:
+               case OPTION_SET_PTR:
+                       break;
+               }
+               if (pos <= USAGE_OPTS_WIDTH)
+                       pad = USAGE_OPTS_WIDTH - pos;
+               else {
+                       fputc('\n', stderr);
+                       pad = USAGE_OPTS_WIDTH;
+               }
+               fprintf(stderr, "%*s%s\n", pad + USAGE_GAP, "", opts->help);
+       }
+       fputc('\n', stderr);
+
+       return PARSE_OPT_HELP;
+}
+
+void usage_with_options(const char * const *usagestr,
+               const struct option *opts)
+{
+       usage_with_options_internal(usagestr, opts, 0);
+       exit(129);
+}
+
+static void check_typos(const char *arg, const struct option *options)
+{
+       if (strlen(arg) < 3)
+               return;
+
+       if (!prefixcmp(arg, "no-")) {
+               pr_err("did you mean `--%s` (with two dashes?)", arg);
+               exit(129);
+       }
+
+       for (; options->type != OPTION_END; options++) {
+               if (!options->long_name)
+                       continue;
+               if (!prefixcmp(options->long_name, arg)) {
+                       pr_err("did you mean `--%s` (with two dashes?)", arg);
+                       exit(129);
+               }
+       }
+}
+
+static int parse_options_usage(const char * const *usagestr,
+               const struct option *opts)
+{
+       return usage_with_options_internal(usagestr, opts, 0);
+}
+
+static int parse_short_opt(struct parse_opt_ctx_t *p,
+        const struct option *options)
+{
+       for (; options->type != OPTION_END; options++) {
+               if (options->short_name == *p->opt) {
+                       p->opt = p->opt[1] ? p->opt + 1 : NULL;
+                       return get_value(p, options, OPT_SHORT);
+               }
+       }
+       return -2;
+}
+
+static int parse_long_opt(struct parse_opt_ctx_t *p, const char *arg,
+               const struct option *options)
+{
+       const char *arg_end = strchr(arg, '=');
+       const struct option *abbrev_option = NULL, *ambiguous_option = NULL;
+       int abbrev_flags = 0, ambiguous_flags = 0;
+
+       if (!arg_end)
+               arg_end = arg + strlen(arg);
+
+       for (; options->type != OPTION_END; options++) {
+               const char *rest;
+               int flags = 0;
+
+               if (!options->long_name)
+                       continue;
+
+               rest = skip_prefix(arg, options->long_name);
+               if (options->type == OPTION_ARGUMENT) {
+                       if (!rest)
+                               continue;
+                       if (*rest == '=')
+                               return opterror(options, "takes no value",
+                                               flags);
+                       if (*rest)
+                               continue;
+                       p->out[p->cpidx++] = arg - 2;
+                       return 0;
+               }
+               if (!rest) {
+                       /* abbreviated? */
+                       if (!strncmp(options->long_name, arg, arg_end - arg)) {
+is_abbreviated:
+                               if (abbrev_option) {
+                                       /*
+                                        * If this is abbreviated, it is
+                                        * ambiguous. So when there is no
+                                        * exact match later, we need to
+                                        * error out.
+                                        */
+                                       ambiguous_option = abbrev_option;
+                                       ambiguous_flags = abbrev_flags;
+                               }
+                               if (!(flags & OPT_UNSET) && *arg_end)
+                                       p->opt = arg_end + 1;
+                               abbrev_option = options;
+                               abbrev_flags = flags;
+                               continue;
+                       }
+                       /* negated and abbreviated very much? */
+                       if (!prefixcmp("no-", arg)) {
+                               flags |= OPT_UNSET;
+                               goto is_abbreviated;
+                       }
+                       /* negated? */
+                       if (strncmp(arg, "no-", 3))
+                               continue;
+                       flags |= OPT_UNSET;
+                       rest = skip_prefix(arg + 3, options->long_name);
+                       /* abbreviated and negated? */
+                       if (!rest && !prefixcmp(options->long_name, arg + 3))
+                               goto is_abbreviated;
+                       if (!rest)
+                               continue;
+               }
+               if (*rest) {
+                       if (*rest != '=')
+                               continue;
+                       p->opt = rest + 1;
+               }
+               return get_value(p, options, flags);
+       }
+
+       if (ambiguous_option)
+               return pr_err("Ambiguous option: %s "
+                               "(could be --%s%s or --%s%s)",
+                               arg,
+                               (ambiguous_flags & OPT_UNSET) ?  "no-" : "",
+                               ambiguous_option->long_name,
+                               (abbrev_flags & OPT_UNSET) ?  "no-" : "",
+                               abbrev_option->long_name);
+       if (abbrev_option)
+               return get_value(p, abbrev_option, abbrev_flags);
+       return -2;
+}
+
+
+static void parse_options_start(struct parse_opt_ctx_t *ctx, int argc,
+               const char **argv, int flags)
+{
+       memset(ctx, 0, sizeof(*ctx));
+       ctx->argc = argc;
+       ctx->argv = argv;
+       ctx->out  = argv;
+       ctx->cpidx = ((flags & PARSE_OPT_KEEP_ARGV0) != 0);
+       ctx->flags = flags;
+       if ((flags & PARSE_OPT_KEEP_UNKNOWN) &&
+                       (flags & PARSE_OPT_STOP_AT_NON_OPTION))
+               die("STOP_AT_NON_OPTION and KEEP_UNKNOWN don't go together");
+}
+
+static int parse_options_end(struct parse_opt_ctx_t *ctx)
+{
+       memmove(ctx->out + ctx->cpidx, ctx->argv, ctx->argc * sizeof(*ctx->out));
+       ctx->out[ctx->cpidx + ctx->argc] = NULL;
+       return ctx->cpidx + ctx->argc;
+}
+
+
+static int parse_options_step(struct parse_opt_ctx_t *ctx,
+               const struct option *options, const char * const usagestr[])
+{
+       int internal_help = !(ctx->flags & PARSE_OPT_NO_INTERNAL_HELP);
+
+       /* we must reset ->opt, an unknown short option leaves it dangling */
+       ctx->opt = NULL;
+
+       for (; ctx->argc; ctx->argc--, ctx->argv++) {
+               const char *arg = ctx->argv[0];
+
+               if (*arg != '-' || !arg[1]) {
+                       if (ctx->flags & PARSE_OPT_STOP_AT_NON_OPTION)
+                               break;
+                       ctx->out[ctx->cpidx++] = ctx->argv[0];
+                       continue;
+               }
+
+               if (arg[1] != '-') {
+                       ctx->opt = arg + 1;
+                       if (internal_help && *ctx->opt == 'h')
+                               return parse_options_usage(usagestr, options);
+                       switch (parse_short_opt(ctx, options)) {
+                       case -1:
+                               return parse_options_usage(usagestr, options);
+                       case -2:
+                               goto unknown;
+                       default:
+                               break;
+                       }
+                       if (ctx->opt)
+                               check_typos(arg + 1, options);
+                       while (ctx->opt) {
+                               if (internal_help && *ctx->opt == 'h')
+                                       return parse_options_usage(usagestr,
+                                                       options);
+                               switch (parse_short_opt(ctx, options)) {
+                               case -1:
+                                       return parse_options_usage(usagestr,
+                                                       options);
+                               case -2:
+                                       /* rewrite argv[0] so it looks like
+                                        * a plain short option, hiding that
+                                        * we were mid-way through an
+                                        * aggregated set of short options
+                                        *
+                                        * The strdup() below leaks, too bad.
+                                        */
+                                       ctx->argv[0] = strdup(ctx->opt - 1);
+                                       *(char *)ctx->argv[0] = '-';
+                                       goto unknown;
+                               default:
+                                       break;
+                               }
+                       }
+                       continue;
+               }
+
+               if (!arg[2]) { /* "--" */
+                       if (!(ctx->flags & PARSE_OPT_KEEP_DASHDASH)) {
+                               ctx->argc--;
+                               ctx->argv++;
+                       }
+                       break;
+               }
+
+               if (internal_help && !strcmp(arg + 2, "help-all"))
+                       return usage_with_options_internal(usagestr, options,
+                                       1);
+               if (internal_help && !strcmp(arg + 2, "help"))
+                       return parse_options_usage(usagestr, options);
+               switch (parse_long_opt(ctx, arg + 2, options)) {
+               case -1:
+                       return parse_options_usage(usagestr, options);
+               case -2:
+                       goto unknown;
+               default:
+                       break;
+               }
+               continue;
+unknown:
+               if (!(ctx->flags & PARSE_OPT_KEEP_UNKNOWN))
+                       return PARSE_OPT_UNKNOWN;
+               ctx->out[ctx->cpidx++] = ctx->argv[0];
+               ctx->opt = NULL;
+       }
+       return PARSE_OPT_DONE;
+}
+
+int parse_options(int argc, const char **argv, const struct option *options,
+               const char * const usagestr[], int flags)
+{
+       struct parse_opt_ctx_t ctx;
+
+       parse_options_start(&ctx, argc, argv, flags);
+       switch (parse_options_step(&ctx, options, usagestr)) {
+       case PARSE_OPT_HELP:
+               exit(129);
+       case PARSE_OPT_DONE:
+               break;
+       default: /* PARSE_OPT_UNKNOWN */
+               if (ctx.argv[0][1] == '-') {
+                       pr_err("unknown option `%s'", ctx.argv[0] + 2);
+               } else {
+                       pr_err("unknown switch `%c'", *ctx.opt);
+               }
+               usage_with_options(usagestr, options);
+       }
+
+       return parse_options_end(&ctx);
+}
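
The parse-options engine above is driven by two caller-supplied tables: a NULL-terminated usage-string array and an option array terminated by an OPTION_END entry. A minimal sketch of the intended call pattern; the OPT_*() initializer macros, the include path, and the bool target for OPTION_BOOLEAN are assumptions based on the option types handled above, not part of this diff:

        #include <kvm/parse-options.h>  /* assumed header location */
        #include <stdbool.h>

        static bool dump;
        static unsigned int count = 1;

        static const char * const demo_usage[] = {
                "demo [options] <file>...",
                NULL
        };

        static const struct option demo_options[] = {
                OPT_BOOLEAN('d', "dump", &dump, "dump state on exit"),
                OPT_UINTEGER('c', "count", &count, "number of iterations"),
                OPT_END()
        };

        static int run(int argc, const char **argv)
        {
                /* prints usage and exits with status 129 on -h, --help or a parse error */
                argc = parse_options(argc, argv, demo_options, demo_usage, 0);
                /* argv[0..argc-1] now holds the non-option arguments,
                 * compacted to the front by parse_options_end() */
                return argc;
        }
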
diff --git a/tools/kvm/util/rbtree-interval.c b/tools/kvm/util/rbtree-interval.c
new file mode 100644 (file)
index 0000000..f9bf4b8
--- /dev/null
@@ -0,0 +1,89 @@
+#include <kvm/rbtree-interval.h>
+#include <stddef.h>
+#include <errno.h>
+
+struct rb_int_node *rb_int_search_single(struct rb_root *root, u64 point)
+{
+       struct rb_node *node = root->rb_node;
+       struct rb_node *lowest = NULL;
+
+       while (node) {
+               struct rb_int_node *cur = rb_int(node);
+
+               if (node->rb_left && (rb_int(node->rb_left)->max_high > point)) {
+                       node = node->rb_left;
+               } else if (cur->low <= point && cur->high > point) {
+                       lowest = node;
+                       break;
+               } else if (point > cur->low) {
+                       node = node->rb_right;
+               } else {
+                       break;
+               }
+       }
+
+       if (lowest == NULL)
+               return NULL;
+
+       return rb_int(lowest);
+}
+
+struct rb_int_node *rb_int_search_range(struct rb_root *root, u64 low, u64 high)
+{
+       struct rb_int_node *range;
+
+       range = rb_int_search_single(root, low);
+       if (range == NULL)
+               return NULL;
+
+       /* Verify that 'high' does not extend beyond the range that contains 'low' */
+       if (range->high < high)
+               return NULL;
+
+       return range;
+}
+
+static void update_node_max_high(struct rb_node *node, void *arg)
+{
+       struct rb_int_node *i_node = rb_int(node);
+
+       i_node->max_high = i_node->high;
+
+       if (node->rb_left)
+               i_node->max_high = max(i_node->max_high, rb_int(node->rb_left)->max_high);
+       if (node->rb_right)
+               i_node->max_high = max(i_node->max_high, rb_int(node->rb_right)->max_high);
+}
+
+int rb_int_insert(struct rb_root *root, struct rb_int_node *i_node)
+{
+       struct rb_node **node = &(root->rb_node), *parent = NULL;
+
+       while (*node) {
+               struct rb_int_node *cur = rb_int(*node);
+
+               parent = *node;
+               /* compare the u64 keys directly; subtracting them into an
+                * int would truncate and misorder large values */
+               if (i_node->low < cur->low)
+                       node    = &((*node)->rb_left);
+               else if (i_node->low > cur->low)
+                       node    = &((*node)->rb_right);
+               else
+                       return -EEXIST;
+       }
+
+       rb_link_node(&i_node->node, parent, node);
+       rb_insert_color(&i_node->node, root);
+
+       rb_augment_insert(&i_node->node, update_node_max_high, NULL);
+       return 0;
+}
+
+void rb_int_erase(struct rb_root *root, struct rb_int_node *node)
+{
+       struct rb_node *deepest;
+
+       deepest = rb_augment_erase_begin(&node->node);
+       rb_erase(&node->node, root);
+       rb_augment_erase_end(deepest, update_node_max_high, NULL);
+}
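
The interval-tree helpers above implement an rbtree keyed on 'low' and augmented with max_high, which lets rb_int_search_single() prune whole subtrees. A minimal usage sketch, assuming the caller embeds an rb_int_node in its own struct; everything except the rb_int_* API is hypothetical, and RB_ROOT/container_of come from the kernel headers that users of this API already include:

        struct mmio_range {
                struct rb_int_node node;        /* carries low, high, max_high */
                void *priv;
        };

        static struct rb_root ranges = RB_ROOT;

        static int add_range(struct mmio_range *r, u64 low, u64 high)
        {
                r->node.low  = low;
                r->node.high = high;            /* the interval is [low, high) */
                return rb_int_insert(&ranges, &r->node); /* -EEXIST on duplicate low */
        }

        static struct mmio_range *find_range(u64 point)
        {
                struct rb_int_node *n = rb_int_search_single(&ranges, point);

                return n ? container_of(n, struct mmio_range, node) : NULL;
        }
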
diff --git a/tools/kvm/util/read-write.c b/tools/kvm/util/read-write.c
new file mode 100644 (file)
index 0000000..44709df
--- /dev/null
@@ -0,0 +1,354 @@
+#include "kvm/read-write.h"
+
+#include <sys/types.h>
+#include <sys/uio.h>
+#include <unistd.h>
+#include <string.h>
+#include <errno.h>
+
+/* Same as read(2) except that this function never returns EAGAIN or EINTR. */
+ssize_t xread(int fd, void *buf, size_t count)
+{
+       ssize_t nr;
+
+restart:
+       nr = read(fd, buf, count);
+       if ((nr < 0) && ((errno == EAGAIN) || (errno == EINTR)))
+               goto restart;
+
+       return nr;
+}
+
+/* Same as write(2) except that this function never returns EAGAIN or EINTR. */
+ssize_t xwrite(int fd, const void *buf, size_t count)
+{
+       ssize_t nr;
+
+restart:
+       nr = write(fd, buf, count);
+       if ((nr < 0) && ((errno == EAGAIN) || (errno == EINTR)))
+               goto restart;
+
+       return nr;
+}
+
+ssize_t read_in_full(int fd, void *buf, size_t count)
+{
+       ssize_t total = 0;
+       char *p = buf;
+
+       while (count > 0) {
+               ssize_t nr;
+
+               nr = xread(fd, p, count);
+               if (nr <= 0) {
+                       if (total > 0)
+                               return total;
+
+                       return -1;
+               }
+
+               count -= nr;
+               total += nr;
+               p += nr;
+       }
+
+       return total;
+}
+
+ssize_t write_in_full(int fd, const void *buf, size_t count)
+{
+       const char *p = buf;
+       ssize_t total = 0;
+
+       while (count > 0) {
+               ssize_t nr;
+
+               nr = xwrite(fd, p, count);
+               if (nr < 0)
+                       return -1;
+               if (nr == 0) {
+                       errno = ENOSPC;
+                       return -1;
+               }
+               count -= nr;
+               total += nr;
+               p += nr;
+       }
+
+       return total;
+}
+
+/* Same as pread(2) except that this function never returns EAGAIN or EINTR. */
+ssize_t xpread(int fd, void *buf, size_t count, off_t offset)
+{
+       ssize_t nr;
+
+restart:
+       nr = pread(fd, buf, count, offset);
+       if ((nr < 0) && ((errno == EAGAIN) || (errno == EINTR)))
+               goto restart;
+
+       return nr;
+}
+
+/* Same as pwrite(2) except that this function never returns EAGAIN or EINTR. */
+ssize_t xpwrite(int fd, const void *buf, size_t count, off_t offset)
+{
+       ssize_t nr;
+
+restart:
+       nr = pwrite(fd, buf, count, offset);
+       if ((nr < 0) && ((errno == EAGAIN) || (errno == EINTR)))
+               goto restart;
+
+       return nr;
+}
+
+ssize_t pread_in_full(int fd, void *buf, size_t count, off_t offset)
+{
+       ssize_t total = 0;
+       char *p = buf;
+
+       while (count > 0) {
+               ssize_t nr;
+
+               nr = xpread(fd, p, count, offset);
+               if (nr <= 0) {
+                       if (total > 0)
+                               return total;
+
+                       return -1;
+               }
+
+               count -= nr;
+               total += nr;
+               p += nr;
+               offset += nr;
+       }
+
+       return total;
+}
+
+ssize_t pwrite_in_full(int fd, const void *buf, size_t count, off_t offset)
+{
+       const char *p = buf;
+       ssize_t total = 0;
+
+       while (count > 0) {
+               ssize_t nr;
+
+               nr = xpwrite(fd, p, count, offset);
+               if (nr < 0)
+                       return -1;
+               if (nr == 0) {
+                       errno = ENOSPC;
+                       return -1;
+               }
+               count -= nr;
+               total += nr;
+               p += nr;
+               offset += nr;
+       }
+
+       return total;
+}
+
+/* Same as readv(2) except that this function never returns EAGAIN or EINTR. */
+ssize_t xreadv(int fd, const struct iovec *iov, int iovcnt)
+{
+       ssize_t nr;
+
+restart:
+       nr = readv(fd, iov, iovcnt);
+       if ((nr < 0) && ((errno == EAGAIN) || (errno == EINTR)))
+               goto restart;
+
+       return nr;
+}
+
+/* Same as writev(2) except that this function never returns EAGAIN or EINTR. */
+ssize_t xwritev(int fd, const struct iovec *iov, int iovcnt)
+{
+       ssize_t nr;
+
+restart:
+       nr = writev(fd, iov, iovcnt);
+       if ((nr < 0) && ((errno == EAGAIN) || (errno == EINTR)))
+               goto restart;
+
+       return nr;
+}
+
+static inline ssize_t get_iov_size(const struct iovec *iov, int iovcnt)
+{
+       size_t size = 0;
+       while (iovcnt--)
+               size += (iov++)->iov_len;
+
+       return size;
+}
+
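+/*
+ * Advance the iovec cursor past the elements that 'nr' fully consumed,
+ * updating the running totals (and the file offset when one is tracked).
+ * Note this only skips whole iovecs: a transfer that stops mid-element
+ * leaves that element's base and length untouched.
+ */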
+static inline void shift_iovec(const struct iovec **iov, int *iovcnt,
+                               size_t nr, ssize_t *total, size_t *count, off_t *offset)
+{
+       while (nr >= (*iov)->iov_len) {
+               nr -= (*iov)->iov_len;
+               *total += (*iov)->iov_len;
+               *count -= (*iov)->iov_len;
+               if (offset)
+                       *offset += (*iov)->iov_len;
+               (*iovcnt)--;
+               (*iov)++;
+       }
+}
+
+ssize_t readv_in_full(int fd, const struct iovec *iov, int iovcnt)
+{
+       ssize_t total = 0;
+       size_t count = get_iov_size(iov, iovcnt);
+
+       while (count > 0) {
+               ssize_t nr;
+
+               nr = xreadv(fd, iov, iovcnt);
+               if (nr <= 0) {
+                       if (total > 0)
+                               return total;
+
+                       return -1;
+               }
+
+               shift_iovec(&iov, &iovcnt, nr, &total, &count, NULL);
+       }
+
+       return total;
+}
+
+ssize_t writev_in_full(int fd, const struct iovec *iov, int iovcnt)
+{
+       ssize_t total = 0;
+       size_t count = get_iov_size(iov, iovcnt);
+
+       while (count > 0) {
+               ssize_t nr;
+
+               nr = xwritev(fd, iov, iovcnt);
+               if (nr < 0)
+                       return -1;
+               if (nr == 0) {
+                       errno = ENOSPC;
+                       return -1;
+               }
+
+               shift_iovec(&iov, &iovcnt, nr, &total, &count, NULL);
+       }
+
+       return total;
+}
+
+/* Same as preadv(2) except that this function never returns EAGAIN or EINTR. */
+ssize_t xpreadv(int fd, const struct iovec *iov, int iovcnt, off_t offset)
+{
+       ssize_t nr;
+
+restart:
+       nr = preadv(fd, iov, iovcnt, offset);
+       if ((nr < 0) && ((errno == EAGAIN) || (errno == EINTR)))
+               goto restart;
+
+       return nr;
+}
+
+/* Same as pwritev(2) except that this function never returns EAGAIN or EINTR. */
+ssize_t xpwritev(int fd, const struct iovec *iov, int iovcnt, off_t offset)
+{
+       ssize_t nr;
+
+restart:
+       nr = pwritev(fd, iov, iovcnt, offset);
+       if ((nr < 0) && ((errno == EAGAIN) || (errno == EINTR)))
+               goto restart;
+
+       return nr;
+}
+
+ssize_t preadv_in_full(int fd, const struct iovec *iov, int iovcnt, off_t offset)
+{
+       ssize_t total = 0;
+       size_t count = get_iov_size(iov, iovcnt);
+
+       while (count > 0) {
+               ssize_t nr;
+
+               nr = xpreadv(fd, iov, iovcnt, offset);
+               if (nr <= 0) {
+                       if (total > 0)
+                               return total;
+
+                       return -1;
+               }
+
+               shift_iovec(&iov, &iovcnt, nr, &total, &count, &offset);
+       }
+
+       return total;
+}
+
+ssize_t pwritev_in_full(int fd, const struct iovec *iov, int iovcnt, off_t offset)
+{
+       ssize_t total = 0;
+       size_t count = get_iov_size(iov, iovcnt);
+
+       while (count > 0) {
+               ssize_t nr;
+
+               nr = xpwritev(fd, iov, iovcnt, offset);
+               if (nr < 0)
+                       return -1;
+               if (nr == 0) {
+                       errno = ENOSPC;
+                       return -1;
+               }
+
+               shift_iovec(&iov, &iovcnt, nr, &total, &count, &offset);
+       }
+
+       return total;
+}
+
+#ifdef CONFIG_HAS_AIO
+int aio_pwritev(io_context_t ctx, struct iocb *iocb, int fd, const struct iovec *iov, int iovcnt,
+               off_t offset, int ev, void *param)
+{
+       struct iocb *ios[1] = { iocb };
+       int ret;
+
+       io_prep_pwritev(iocb, fd, iov, iovcnt, offset);
+       io_set_eventfd(iocb, ev);
+       iocb->data = param;
+
+restart:
+       ret = io_submit(ctx, 1, ios);
+       if (ret == -EAGAIN)
+               goto restart;
+       return ret;
+}
+
+int aio_preadv(io_context_t ctx, struct iocb *iocb, int fd, const struct iovec *iov, int iovcnt,
+               off_t offset, int ev, void *param)
+{
+       struct iocb *ios[1] = { iocb };
+       int ret;
+
+       io_prep_preadv(iocb, fd, iov, iovcnt, offset);
+       io_set_eventfd(iocb, ev);
+       iocb->data = param;
+
+restart:
+       ret = io_submit(ctx, 1, ios);
+       if (ret == -EAGAIN)
+               goto restart;
+       return ret;
+}
+#endif
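
The *_in_full() helpers above either complete the whole transfer or fail, so callers never have to loop over short reads and writes themselves. A minimal sketch of the intended call pattern (the wrapper and its arguments are hypothetical):

        static int save_blob(int fd, const void *buf, size_t len)
        {
                /* retries EINTR/EAGAIN and short writes internally; returns
                 * -1 with errno set (ENOSPC for a zero-byte write) on failure */
                if (write_in_full(fd, buf, len) < 0)
                        return -1;
                return 0;
        }
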
diff --git a/tools/kvm/util/set_private_br.sh b/tools/kvm/util/set_private_br.sh
new file mode 100755 (executable)
index 0000000..49867dd
--- /dev/null
@@ -0,0 +1,51 @@
+#!/bin/bash
+#
+# Author: Amos Kong <kongjianjun@gmail.com>
+# Date: Apr 14, 2011
+# Description: create or delete a private bridge and launch a DHCP
+# server on it with dnsmasq.
+#
+# @ ./set_private_br.sh $bridge_name $subnet_prefix
+# @ ./set_private_br.sh vbr0 192.168.33
+
+brname='vbr0'
+subnet='192.168.33'
+
+add_br()
+{
+    echo "add new private bridge: $brname"
+    /usr/sbin/brctl addbr $brname
+    echo 1 > /proc/sys/net/ipv6/conf/$brname/disable_ipv6
+    echo 1 > /proc/sys/net/ipv4/ip_forward
+    /usr/sbin/brctl stp $brname on
+    /usr/sbin/brctl setfd $brname 0
+    ifconfig $brname $subnet.1
+    ifconfig $brname up
+    # Add forward rule, then guest can access public network
+    iptables -t nat -A POSTROUTING -s $subnet.254/24 ! -d $subnet.254/24 -j MASQUERADE
+    /etc/init.d/dnsmasq stop
+    /etc/init.d/tftpd-hpa stop 2>/dev/null
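+    # Note: $tftp_cmd is not set anywhere in this script; it expands to
+    # nothing unless the caller exports it (presumably extra dnsmasq options)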
+    dnsmasq --strict-order --bind-interfaces --listen-address $subnet.1 --dhcp-range $subnet.1,$subnet.254 $tftp_cmd
+}
+
+del_br()
+{
+    echo "cleanup bridge setup"
+    kill -9 `pgrep dnsmasq|tail -1`
+    ifconfig $brname down
+    /usr/sbin/brctl delbr $brname
+    iptables -t nat -D POSTROUTING -s $subnet.254/24 ! -d $subnet.254/24 -j MASQUERADE
+}
+
+
+if [ $# = 0 ]; then
+    del_br 2>/dev/null
+    exit
+fi
+if [ $# -ge 1 ]; then
+    brname="$1"
+fi
+if [ $# = 2 ]; then
+    subnet="$2"
+fi
+add_br
diff --git a/tools/kvm/util/strbuf.c b/tools/kvm/util/strbuf.c
new file mode 100644 (file)
index 0000000..99d6b0c
--- /dev/null
@@ -0,0 +1,62 @@
+#include <string.h>    /* strlen(), memcpy() */
+
+/* user defined headers */
+#include <kvm/util.h>
+#include <kvm/strbuf.h>
+
+int prefixcmp(const char *str, const char *prefix)
+{
+       for (; ; str++, prefix++) {
+               if (!*prefix)
+                       return 0;
+               else if (*str != *prefix)
+                       return (unsigned char)*prefix - (unsigned char)*str;
+       }
+}
+
+/**
+ * strlcat - Append a length-limited, %NUL-terminated string to another
+ * @dest: The string to be appended to
+ * @src: The string to append to it
+ * @count: The size of the destination buffer.
+ */
+size_t strlcat(char *dest, const char *src, size_t count)
+{
+       size_t dsize = strlen(dest);
+       size_t len = strlen(src);
+       size_t res = dsize + len;
+
+       DIE_IF(dsize >= count);
+
+       dest += dsize;
+       count -= dsize;
+       if (len >= count)
+               len = count - 1;
+
+       memcpy(dest, src, len);
+       dest[len] = 0;
+
+       return res;
+}
+
+/**
+ * strlcpy - Copy a %NUL terminated string into a sized buffer
+ * @dest: Where to copy the string to
+ * @src: Where to copy the string from
+ * @size: size of destination buffer
+ *
+ * Compatible with *BSD: the result is always a valid
+ * NUL-terminated string that fits in the buffer (unless,
+ * of course, the buffer size is zero). It does not pad
+ * out the result like strncpy() does.
+ */
+size_t strlcpy(char *dest, const char *src, size_t size)
+{
+       size_t ret = strlen(src);
+
+       if (size) {
+               size_t len = (ret >= size) ? size - 1 : ret;
+               memcpy(dest, src, len);
+               dest[len] = '\0';
+       }
+       return ret;
+}
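
Unlike strncpy(), the strlcpy() above always NUL-terminates and returns the full length of the source, so truncation is detected by comparing the return value against the buffer size. A small sketch (the wrapper is hypothetical):

        static void set_label(char dst[32], const char *src)
        {
                if (strlcpy(dst, src, 32) >= 32)
                        pr_warning("label '%s' was truncated", dst);
        }
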
diff --git a/tools/kvm/util/threadpool.c b/tools/kvm/util/threadpool.c
new file mode 100644 (file)
index 0000000..bafbcd7
--- /dev/null
@@ -0,0 +1,145 @@
+#include "kvm/threadpool.h"
+#include "kvm/mutex.h"
+
+#include <linux/kernel.h>
+#include <linux/list.h>
+#include <pthread.h>
+#include <stdbool.h>
+
+static pthread_mutex_t job_mutex       = PTHREAD_MUTEX_INITIALIZER;
+static pthread_mutex_t thread_mutex    = PTHREAD_MUTEX_INITIALIZER;
+static pthread_cond_t  job_cond        = PTHREAD_COND_INITIALIZER;
+
+static LIST_HEAD(head);
+
+static pthread_t       *threads;
+static long            threadcount;
+
+static struct thread_pool__job *thread_pool__job_pop_locked(void)
+{
+       struct thread_pool__job *job;
+
+       if (list_empty(&head))
+               return NULL;
+
+       job = list_first_entry(&head, struct thread_pool__job, queue);
+       list_del(&job->queue);
+
+       return job;
+}
+
+static void thread_pool__job_push_locked(struct thread_pool__job *job)
+{
+       list_add_tail(&job->queue, &head);
+}
+
+static struct thread_pool__job *thread_pool__job_pop(void)
+{
+       struct thread_pool__job *job;
+
+       mutex_lock(&job_mutex);
+       job = thread_pool__job_pop_locked();
+       mutex_unlock(&job_mutex);
+       return job;
+}
+
+static void thread_pool__job_push(struct thread_pool__job *job)
+{
+       mutex_lock(&job_mutex);
+       thread_pool__job_push_locked(job);
+       mutex_unlock(&job_mutex);
+}
+
+static void thread_pool__handle_job(struct thread_pool__job *job)
+{
+       while (job) {
+               job->callback(job->kvm, job->data);
+
+               mutex_lock(&job->mutex);
+
+               if (--job->signalcount > 0)
+                       /* If the job was signaled again while we were working */
+                       thread_pool__job_push(job);
+
+               mutex_unlock(&job->mutex);
+
+               job = thread_pool__job_pop();
+       }
+}
+
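+/*
+ * pthread cleanup handler: runs if a pool thread is cancelled while it
+ * holds job_mutex, e.g. inside the pthread_cond_wait() loop below.
+ */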
+static void thread_pool__threadfunc_cleanup(void *param)
+{
+       mutex_unlock(&job_mutex);
+}
+
+static void *thread_pool__threadfunc(void *param)
+{
+       pthread_cleanup_push(thread_pool__threadfunc_cleanup, NULL);
+
+       for (;;) {
+               struct thread_pool__job *curjob;
+
+               mutex_lock(&job_mutex);
+               while ((curjob = thread_pool__job_pop_locked()) == NULL)
+                       pthread_cond_wait(&job_cond, &job_mutex);
+               mutex_unlock(&job_mutex);
+
+               thread_pool__handle_job(curjob);
+       }
+
+       pthread_cleanup_pop(0);
+
+       return NULL;
+}
+
+static int thread_pool__addthread(void)
+{
+       int res;
+       void *newthreads;
+
+       mutex_lock(&thread_mutex);
+       newthreads = realloc(threads, (threadcount + 1) * sizeof(pthread_t));
+       if (newthreads == NULL) {
+               mutex_unlock(&thread_mutex);
+               return -1;
+       }
+
+       threads = newthreads;
+
+       res = pthread_create(threads + threadcount, NULL,
+                            thread_pool__threadfunc, NULL);
+
+       if (res == 0)
+               threadcount++;
+       mutex_unlock(&thread_mutex);
+
+       return res;
+}
+
+int thread_pool__init(unsigned long thread_count)
+{
+       unsigned long i;
+
+       for (i = 0; i < thread_count; i++)
+               if (thread_pool__addthread() < 0)
+                       return i;
+
+       return i;
+}
+
+void thread_pool__do_job(struct thread_pool__job *job)
+{
+       struct thread_pool__job *jobinfo = job;
+
+       if (jobinfo == NULL || jobinfo->callback == NULL)
+               return;
+
+       mutex_lock(&jobinfo->mutex);
+       if (jobinfo->signalcount++ == 0)
+               thread_pool__job_push(job);
+       mutex_unlock(&jobinfo->mutex);
+
+       mutex_lock(&job_mutex);
+       pthread_cond_signal(&job_cond);
+       mutex_unlock(&job_mutex);
+}
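
Jobs in the pool above are signalled rather than queued directly: thread_pool__do_job() pushes a job only when its signalcount goes from 0 to 1, and a job signalled again while its callback runs is re-queued exactly once when it finishes. A minimal sketch of setting up and firing a job; the field names are inferred from their uses in this file, and struct thread_pool__job itself is defined in a header not shown in this diff:

        static struct thread_pool__job flush_job;

        static void flush_cb(struct kvm *kvm, void *data)
        {
                /* runs on one of the pool threads */
        }

        static void flush_setup(struct kvm *kvm)
        {
                flush_job.kvm      = kvm;
                flush_job.callback = flush_cb;
                flush_job.data     = NULL;
                pthread_mutex_init(&flush_job.mutex, NULL);
                INIT_LIST_HEAD(&flush_job.queue);
        }

        static void on_guest_event(void)
        {
                thread_pool__do_job(&flush_job);        /* wakes one pool thread */
        }
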
diff --git a/tools/kvm/util/util.c b/tools/kvm/util/util.c
new file mode 100644 (file)
index 0000000..c11a15a
--- /dev/null
@@ -0,0 +1,133 @@
+/*
+ * Taken from perf, which in turn took it from Git
+ */
+
+#include "kvm/util.h"
+
+#include <kvm/kvm.h>
+#include <linux/magic.h>       /* For HUGETLBFS_MAGIC */
+#include <sys/mman.h>
+#include <sys/stat.h>
+#include <sys/statfs.h>
+
+static void report(const char *prefix, const char *err, va_list params)
+{
+       char msg[1024];
+       vsnprintf(msg, sizeof(msg), err, params);
+       fprintf(stderr, " %s%s\n", prefix, msg);
+}
+
+static NORETURN void die_builtin(const char *err, va_list params)
+{
+       report(" Fatal: ", err, params);
+       exit(128);
+}
+
+static void error_builtin(const char *err, va_list params)
+{
+       report(" Error: ", err, params);
+}
+
+static void warn_builtin(const char *warn, va_list params)
+{
+       report(" Warning: ", warn, params);
+}
+
+static void info_builtin(const char *info, va_list params)
+{
+       report(" Info: ", info, params);
+}
+
+void die(const char *err, ...)
+{
+       va_list params;
+
+       va_start(params, err);
+       die_builtin(err, params);
+       va_end(params);
+}
+
+int pr_err(const char *err, ...)
+{
+       va_list params;
+
+       va_start(params, err);
+       error_builtin(err, params);
+       va_end(params);
+       return -1;
+}
+
+void pr_warning(const char *warn, ...)
+{
+       va_list params;
+
+       va_start(params, warn);
+       warn_builtin(warn, params);
+       va_end(params);
+}
+
+void pr_info(const char *info, ...)
+{
+       va_list params;
+
+       va_start(params, info);
+       info_builtin(info, params);
+       va_end(params);
+}
+
+void die_perror(const char *s)
+{
+       perror(s);
+       exit(1);
+}
+
+void *mmap_hugetlbfs(struct kvm *kvm, const char *htlbfs_path, u64 size)
+{
+       char mpath[PATH_MAX];
+       int fd;
+       struct statfs sfs;
+       void *addr;
+       unsigned long blk_size;
+
+       if (statfs(htlbfs_path, &sfs) < 0)
+               die("Can't stat %s\n", htlbfs_path);
+
+       if ((unsigned int)sfs.f_type != HUGETLBFS_MAGIC)
+               die("%s is not hugetlbfs!\n", htlbfs_path);
+
+       blk_size = (unsigned long)sfs.f_bsize;
+       if (sfs.f_bsize == 0 || blk_size > size) {
+               die("Can't use hugetlbfs pagesize %lu for mem size %llu\n",
+                   blk_size, (unsigned long long)size);
+       }
+
+       kvm->ram_pagesize = blk_size;
+
+       snprintf(mpath, PATH_MAX, "%s/kvmtoolXXXXXX", htlbfs_path);
+       fd = mkstemp(mpath);
+       if (fd < 0)
+               die("Can't open %s for hugetlbfs map\n", mpath);
+       unlink(mpath);
+       if (ftruncate(fd, size) < 0)
+               die("Can't ftruncate for mem mapping size %llu\n",
+                   (unsigned long long)size);
+       addr = mmap(NULL, size, PROT_RW, MAP_PRIVATE, fd, 0);
+       close(fd);
+
+       return addr;
+}
+
+/* Wrap the decision between a hugetlbfs mapping (if requested) and a normal anonymous mmap */
+void *mmap_anon_or_hugetlbfs(struct kvm *kvm, const char *hugetlbfs_path, u64 size)
+{
+       if (hugetlbfs_path)
+               /*
+                * We don't /need/ to map guest RAM from hugetlbfs, but we do so
+                * if the user specifies a hugetlbfs path.
+                */
+               return mmap_hugetlbfs(kvm, hugetlbfs_path, size);
+       else {
+               kvm->ram_pagesize = getpagesize();
+               return mmap(NULL, size, PROT_RW, MAP_ANON_NORESERVE, -1, 0);
+       }
+}
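
mmap_anon_or_hugetlbfs() above is the single entry point for allocating guest RAM. A minimal sketch of the call pattern; the kvm fields other than ram_pagesize (which the helpers set) are assumptions:

        static int init_ram(struct kvm *kvm, const char *hugetlbfs_path, u64 size)
        {
                void *ram = mmap_anon_or_hugetlbfs(kvm, hugetlbfs_path, size);

                if (ram == MAP_FAILED)
                        return -errno;

                /* kvm->ram_pagesize now holds the huge-page or base page size */
                kvm->ram_start = ram;   /* assumed field */
                kvm->ram_size  = size;  /* assumed field */
                return 0;
        }
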
diff --git a/tools/kvm/virtio/9p-pdu.c b/tools/kvm/virtio/9p-pdu.c
new file mode 100644 (file)
index 0000000..b9ce8ce
--- /dev/null
@@ -0,0 +1,287 @@
+#include "kvm/util.h"
+#include "kvm/virtio-9p.h"
+
+#include <endian.h>
+#include <stdint.h>
+
+#include <linux/compiler.h>
+#include <net/9p/9p.h>
+
+static void virtio_p9_pdu_read(struct p9_pdu *pdu, void *data, size_t size)
+{
+       size_t len;
+       int i, copied = 0;
+       u16 iov_cnt = pdu->out_iov_cnt;
+       size_t offset = pdu->read_offset;
+       struct iovec *iov = pdu->out_iov;
+
+       for (i = 0; i < iov_cnt && size; i++) {
+               if (offset >= iov[i].iov_len) {
+                       offset -= iov[i].iov_len;
+                       continue;
+               } else {
+                       len = MIN(iov[i].iov_len - offset, size);
+                       memcpy(data, iov[i].iov_base + offset, len);
+                       size -= len;
+                       data += len;
+                       offset = 0;
+                       copied += len;
+               }
+       }
+       pdu->read_offset += copied;
+}
+
+static void virtio_p9_pdu_write(struct p9_pdu *pdu,
+                               const void *data, size_t size)
+{
+       size_t len;
+       int i, copied = 0;
+       u16 iov_cnt = pdu->in_iov_cnt;
+       size_t offset = pdu->write_offset;
+       struct iovec *iov = pdu->in_iov;
+
+       for (i = 0; i < iov_cnt && size; i++) {
+               if (offset >= iov[i].iov_len) {
+                       offset -= iov[i].iov_len;
+                       continue;
+               } else {
+                       len = MIN(iov[i].iov_len - offset, size);
+                       memcpy(iov[i].iov_base + offset, data, len);
+                       size -= len;
+                       data += len;
+                       offset = 0;
+                       copied += len;
+               }
+       }
+       pdu->write_offset += copied;
+}
+
+static void virtio_p9_wstat_free(struct p9_wstat *stbuf)
+{
+       free(stbuf->name);
+       free(stbuf->uid);
+       free(stbuf->gid);
+       free(stbuf->muid);
+}
+
+static int virtio_p9_decode(struct p9_pdu *pdu, const char *fmt, va_list ap)
+{
+       int retval = 0;
+       const char *ptr;
+
+       for (ptr = fmt; *ptr; ptr++) {
+               switch (*ptr) {
+               case 'b':
+               {
+                       int8_t *val = va_arg(ap, int8_t *);
+                       virtio_p9_pdu_read(pdu, val, sizeof(*val));
+               }
+               break;
+               case 'w':
+               {
+                       int16_t le_val;
+                       int16_t *val = va_arg(ap, int16_t *);
+                       virtio_p9_pdu_read(pdu, &le_val, sizeof(le_val));
+                       *val = le16toh(le_val);
+               }
+               break;
+               case 'd':
+               {
+                       int32_t le_val;
+                       int32_t *val = va_arg(ap, int32_t *);
+                       virtio_p9_pdu_read(pdu, &le_val, sizeof(le_val));
+                       *val = le32toh(le_val);
+               }
+               break;
+               case 'q':
+               {
+                       int64_t le_val;
+                       int64_t *val = va_arg(ap, int64_t *);
+                       virtio_p9_pdu_read(pdu, &le_val, sizeof(le_val));
+                       *val = le64toh(le_val);
+               }
+               break;
+               case 's':
+               {
+                       int16_t len;
+                       char **str = va_arg(ap, char **);
+
+                       virtio_p9_pdu_readf(pdu, "w", &len);
+                       *str = malloc(len + 1);
+                       if (*str == NULL) {
+                               retval = ENOMEM;
+                               break;
+                       }
+                       virtio_p9_pdu_read(pdu, *str, len);
+                       (*str)[len] = 0;
+               }
+               break;
+               case 'Q':
+               {
+                       struct p9_qid *qid = va_arg(ap, struct p9_qid *);
+                       retval = virtio_p9_pdu_readf(pdu, "bdq",
+                                                    &qid->type, &qid->version,
+                                                    &qid->path);
+               }
+               break;
+               case 'S':
+               {
+                       struct p9_wstat *stbuf = va_arg(ap, struct p9_wstat *);
+                       memset(stbuf, 0, sizeof(struct p9_wstat));
+                       stbuf->n_uid = stbuf->n_gid = stbuf->n_muid = -1;
+                       retval = virtio_p9_pdu_readf(pdu, "wwdQdddqssss",
+                                               &stbuf->size, &stbuf->type,
+                                               &stbuf->dev, &stbuf->qid,
+                                               &stbuf->mode, &stbuf->atime,
+                                               &stbuf->mtime, &stbuf->length,
+                                               &stbuf->name, &stbuf->uid,
+                                               &stbuf->gid, &stbuf->muid);
+                       if (retval)
+                               virtio_p9_wstat_free(stbuf);
+               }
+               break;
+               case 'I':
+               {
+                       struct p9_iattr_dotl *p9attr = va_arg(ap,
+                                                      struct p9_iattr_dotl *);
+
+                       retval = virtio_p9_pdu_readf(pdu, "ddddqqqqq",
+                                                    &p9attr->valid,
+                                                    &p9attr->mode,
+                                                    &p9attr->uid,
+                                                    &p9attr->gid,
+                                                    &p9attr->size,
+                                                    &p9attr->atime_sec,
+                                                    &p9attr->atime_nsec,
+                                                    &p9attr->mtime_sec,
+                                                    &p9attr->mtime_nsec);
+               }
+               break;
+               default:
+                       retval = EINVAL;
+                       break;
+               }
+       }
+       return retval;
+}
+
+static int virtio_p9_pdu_encode(struct p9_pdu *pdu, const char *fmt, va_list ap)
+{
+       int retval = 0;
+       const char *ptr;
+
+       for (ptr = fmt; *ptr; ptr++) {
+               switch (*ptr) {
+               case 'b':
+               {
+                       int8_t val = va_arg(ap, int);
+                       virtio_p9_pdu_write(pdu, &val, sizeof(val));
+               }
+               break;
+               case 'w':
+               {
+                       int16_t val = htole16(va_arg(ap, int));
+                       virtio_p9_pdu_write(pdu, &val, sizeof(val));
+               }
+               break;
+               case 'd':
+               {
+                       int32_t val = htole32(va_arg(ap, int32_t));
+                       virtio_p9_pdu_write(pdu, &val, sizeof(val));
+               }
+               break;
+               case 'q':
+               {
+                       int64_t val = htole64(va_arg(ap, int64_t));
+                       virtio_p9_pdu_write(pdu, &val, sizeof(val));
+               }
+               break;
+               case 's':
+               {
+                       uint16_t len = 0;
+                       const char *s = va_arg(ap, char *);
+                       if (s)
+                               len = MIN(strlen(s), USHRT_MAX);
+                       virtio_p9_pdu_writef(pdu, "w", len);
+                       virtio_p9_pdu_write(pdu, s, len);
+               }
+               break;
+               case 'Q':
+               {
+                       struct p9_qid *qid = va_arg(ap, struct p9_qid *);
+                       retval = virtio_p9_pdu_writef(pdu, "bdq",
+                                                     qid->type, qid->version,
+                                                     qid->path);
+               }
+               break;
+               case 'S':
+               {
+                       struct p9_wstat *stbuf = va_arg(ap, struct p9_wstat *);
+                       retval = virtio_p9_pdu_writef(pdu, "wwdQdddqssss",
+                                               stbuf->size, stbuf->type,
+                                               stbuf->dev, &stbuf->qid,
+                                               stbuf->mode, stbuf->atime,
+                                               stbuf->mtime, stbuf->length,
+                                               stbuf->name, stbuf->uid,
+                                               stbuf->gid, stbuf->muid);
+               }
+               break;
+               case 'A':
+               {
+                       struct p9_stat_dotl *stbuf = va_arg(ap,
+                                                     struct p9_stat_dotl *);
+                       retval  = virtio_p9_pdu_writef(pdu,
+                                                      "qQdddqqqqqqqqqqqqqqq",
+                                                      stbuf->st_result_mask,
+                                                      &stbuf->qid,
+                                                      stbuf->st_mode,
+                                                      stbuf->st_uid,
+                                                      stbuf->st_gid,
+                                                      stbuf->st_nlink,
+                                                      stbuf->st_rdev,
+                                                      stbuf->st_size,
+                                                      stbuf->st_blksize,
+                                                      stbuf->st_blocks,
+                                                      stbuf->st_atime_sec,
+                                                      stbuf->st_atime_nsec,
+                                                      stbuf->st_mtime_sec,
+                                                      stbuf->st_mtime_nsec,
+                                                      stbuf->st_ctime_sec,
+                                                      stbuf->st_ctime_nsec,
+                                                      stbuf->st_btime_sec,
+                                                      stbuf->st_btime_nsec,
+                                                      stbuf->st_gen,
+                                                      stbuf->st_data_version);
+               }
+               break;
+               default:
+                       retval = EINVAL;
+                       break;
+               }
+       }
+       return retval;
+}
+
+int virtio_p9_pdu_readf(struct p9_pdu *pdu, const char *fmt, ...)
+{
+       int ret;
+       va_list ap;
+
+       va_start(ap, fmt);
+       ret = virtio_p9_decode(pdu, fmt, ap);
+       va_end(ap);
+
+       return ret;
+}
+
+int virtio_p9_pdu_writef(struct p9_pdu *pdu, const char *fmt, ...)
+{
+       int ret;
+       va_list ap;
+
+       va_start(ap, fmt);
+       ret = virtio_p9_pdu_encode(pdu, fmt, ap);
+       va_end(ap);
+
+       return ret;
+}
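
The format characters accepted by the readf/writef pair above map one-to-one onto 9P wire types: b = u8, w = u16, d = u32, q = u64 (all little-endian), s = length-prefixed string, Q = qid, S = wstat, I = iattr (decode only) and A = stat_dotl (encode only). A minimal sketch of a handler body using them, following the message layout of the 9P2000.L readlink RPC:

        static void sketch_readlink(struct p9_pdu *pdu)
        {
                u32 fid;

                virtio_p9_pdu_readf(pdu, "d", &fid);            /* Treadlink: fid[4] */
                virtio_p9_pdu_writef(pdu, "s", "/some/target"); /* Rreadlink: target[s] */
        }
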
diff --git a/tools/kvm/virtio/9p.c b/tools/kvm/virtio/9p.c
new file mode 100644 (file)
index 0000000..c3f5280
--- /dev/null
@@ -0,0 +1,1379 @@
+#include "kvm/virtio-pci-dev.h"
+#include "kvm/ioport.h"
+#include "kvm/util.h"
+#include "kvm/threadpool.h"
+#include "kvm/irq.h"
+#include "kvm/virtio-9p.h"
+#include "kvm/guest_compat.h"
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <fcntl.h>
+#include <sys/stat.h>
+#include <unistd.h>
+#include <string.h>
+#include <errno.h>
+#include <sys/vfs.h>
+
+#include <linux/virtio_ring.h>
+#include <linux/virtio_9p.h>
+#include <net/9p/9p.h>
+
+static LIST_HEAD(devs);
+static int compat_id = -1;
+
+static int insert_new_fid(struct p9_dev *dev, struct p9_fid *fid);
+static struct p9_fid *find_or_create_fid(struct p9_dev *dev, u32 fid)
+{
+       struct rb_node *node = dev->fids.rb_node;
+       struct p9_fid *pfid = NULL;
+
+       while (node) {
+               struct p9_fid *cur = rb_entry(node, struct p9_fid, node);
+
+               if (fid < cur->fid) {
+                       node = node->rb_left;
+               } else if (fid > cur->fid) {
+                       node = node->rb_right;
+               } else {
+                       return cur;
+               }
+       }
+
+       pfid = calloc(1, sizeof(*pfid));
+       if (!pfid)
+               return NULL;
+
+       pfid->fid = fid;
+       strcpy(pfid->abs_path, dev->root_dir);
+       pfid->path = pfid->abs_path + strlen(dev->root_dir);
+
+       insert_new_fid(dev, pfid);
+
+       return pfid;
+}
+
+static int insert_new_fid(struct p9_dev *dev, struct p9_fid *fid)
+{
+       struct rb_node **node = &(dev->fids.rb_node), *parent = NULL;
+
+       while (*node) {
+               struct p9_fid *cur = rb_entry(*node, struct p9_fid, node);
+
+               parent = *node;
+               if (fid->fid < cur->fid)
+                       node    = &((*node)->rb_left);
+               else if (fid->fid > cur->fid)
+                       node    = &((*node)->rb_right);
+               else
+                       return -EEXIST;
+       }
+
+       rb_link_node(&fid->node, parent, node);
+       rb_insert_color(&fid->node, &dev->fids);
+       return 0;
+}
+
+static struct p9_fid *get_fid(struct p9_dev *p9dev, int fid)
+{
+       struct p9_fid *new;
+
+       new = find_or_create_fid(p9dev, fid);
+
+       return new;
+}
+
+/* Warning: use the returned pointer immediately; it aliases the caller's abs_path buffer */
+static const char *rel_to_abs(struct p9_dev *p9dev,
+                             const char *path, char *abs_path)
+{
+       sprintf(abs_path, "%s/%s", p9dev->root_dir, path);
+
+       return abs_path;
+}
+
+static void stat2qid(struct stat *st, struct p9_qid *qid)
+{
+       *qid = (struct p9_qid) {
+               .path           = st->st_ino,
+               .version        = st->st_mtime,
+       };
+
+       if (S_ISDIR(st->st_mode))
+               qid->type       |= P9_QTDIR;
+}
+
+static void close_fid(struct p9_dev *p9dev, u32 fid)
+{
+       struct p9_fid *pfid = get_fid(p9dev, fid);
+
+       if (pfid->fd > 0)
+               close(pfid->fd);
+
+       if (pfid->dir)
+               closedir(pfid->dir);
+
+       rb_erase(&pfid->node, &p9dev->fids);
+       free(pfid);
+}
+
+static void virtio_p9_set_reply_header(struct p9_pdu *pdu, u32 size)
+{
+       u8 cmd;
+       u16 tag;
+
+       pdu->read_offset = sizeof(u32);
+       virtio_p9_pdu_readf(pdu, "bw", &cmd, &tag);
+       pdu->write_offset = 0;
+       /* cmd + 1 is the reply message */
+       virtio_p9_pdu_writef(pdu, "dbw", size, cmd + 1, tag);
+}
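+
+/*
+ * In 9P every R-message type is its T-message type plus one (e.g.
+ * P9_TVERSION = 100 pairs with P9_RVERSION = 101), so writing cmd + 1
+ * above is enough to turn any request header into its reply header.
+ */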
+
+static u16 virtio_p9_update_iov_cnt(struct iovec iov[], u32 count, int iov_cnt)
+{
+       int i;
+       u32 total = 0;
+       for (i = 0; (i < iov_cnt) && (total < count); i++) {
+               if (total + iov[i].iov_len > count) {
+                       /* we don't need this iov fully */
+                       iov[i].iov_len -= ((total + iov[i].iov_len) - count);
+                       i++;
+                       break;
+               }
+               total += iov[i].iov_len;
+       }
+       return i;
+}
+
+static void virtio_p9_error_reply(struct p9_dev *p9dev,
+                                 struct p9_pdu *pdu, int err, u32 *outlen)
+{
+       u16 tag;
+
+       pdu->write_offset = VIRTIO_9P_HDR_LEN;
+       virtio_p9_pdu_writef(pdu, "d", err);
+       *outlen = pdu->write_offset;
+
+       /* read the tag from input */
+       pdu->read_offset = sizeof(u32) + sizeof(u8);
+       virtio_p9_pdu_readf(pdu, "w", &tag);
+
+       /* Update the header */
+       pdu->write_offset = 0;
+       virtio_p9_pdu_writef(pdu, "dbw", *outlen, P9_RLERROR, tag);
+}
+
+static void virtio_p9_version(struct p9_dev *p9dev,
+                             struct p9_pdu *pdu, u32 *outlen)
+{
+       u32 msize;
+       char *version;
+       virtio_p9_pdu_readf(pdu, "ds", &msize, &version);
+       /*
+        * Reply with the same msize the client sent us; reply with
+        * version "unknown" if the request is not for 9P2000.L.
+        */
+       if (!strcmp(version, VIRTIO_9P_VERSION_DOTL))
+               virtio_p9_pdu_writef(pdu, "ds", msize, version);
+       else
+               virtio_p9_pdu_writef(pdu, "ds", msize, "unknown");
+
+       *outlen = pdu->write_offset;
+       virtio_p9_set_reply_header(pdu, *outlen);
+       free(version);
+       return;
+}
+
+static void virtio_p9_clunk(struct p9_dev *p9dev,
+                           struct p9_pdu *pdu, u32 *outlen)
+{
+       u32 fid;
+
+       virtio_p9_pdu_readf(pdu, "d", &fid);
+       close_fid(p9dev, fid);
+
+       *outlen = pdu->write_offset;
+       virtio_p9_set_reply_header(pdu, *outlen);
+       return;
+}
+
+/*
+ * FIXME!! Need to map to a protocol-independent value. Upstream
+ * 9p has the same bug.
+ */
+static int virtio_p9_openflags(int flags)
+{
+       flags &= ~(O_NOCTTY | O_ASYNC | O_CREAT | O_DIRECT);
+       flags |= O_NOFOLLOW;
+       return flags;
+}
+
+static bool is_dir(struct p9_fid *fid)
+{
+       struct stat st;
+
+       stat(fid->abs_path, &st);
+
+       return S_ISDIR(st.st_mode);
+}
+
+static void virtio_p9_open(struct p9_dev *p9dev,
+                          struct p9_pdu *pdu, u32 *outlen)
+{
+       u32 fid, flags;
+       struct stat st;
+       struct p9_qid qid;
+       struct p9_fid *new_fid;
+
+
+       virtio_p9_pdu_readf(pdu, "dd", &fid, &flags);
+       new_fid = get_fid(p9dev, fid);
+
+       if (lstat(new_fid->abs_path, &st) < 0)
+               goto err_out;
+
+       stat2qid(&st, &qid);
+
+       if (is_dir(new_fid)) {
+               new_fid->dir = opendir(new_fid->abs_path);
+               if (!new_fid->dir)
+                       goto err_out;
+       } else {
+               new_fid->fd  = open(new_fid->abs_path,
+                                   virtio_p9_openflags(flags));
+               if (new_fid->fd < 0)
+                       goto err_out;
+       }
+       /* FIXME!! need to send a proper iounit */
+       virtio_p9_pdu_writef(pdu, "Qd", &qid, 0);
+
+       *outlen = pdu->write_offset;
+       virtio_p9_set_reply_header(pdu, *outlen);
+       return;
+err_out:
+       virtio_p9_error_reply(p9dev, pdu, errno, outlen);
+       return;
+}
+
+static void virtio_p9_create(struct p9_dev *p9dev,
+                            struct p9_pdu *pdu, u32 *outlen)
+{
+       int fd, ret;
+       char *name;
+       struct stat st;
+       struct p9_qid qid;
+       struct p9_fid *dfid;
+       char full_path[PATH_MAX];
+       u32 dfid_val, flags, mode, gid;
+
+       virtio_p9_pdu_readf(pdu, "dsddd", &dfid_val,
+                           &name, &flags, &mode, &gid);
+       dfid = get_fid(p9dev, dfid_val);
+
+       flags = virtio_p9_openflags(flags);
+
+       sprintf(full_path, "%s/%s", dfid->abs_path, name);
+       fd = open(full_path, flags | O_CREAT, mode);
+       if (fd < 0)
+               goto err_out;
+       dfid->fd = fd;
+
+       if (lstat(full_path, &st) < 0)
+               goto err_out;
+
+       ret = chmod(full_path, mode & 0777);
+       if (ret < 0)
+               goto err_out;
+
+       ret = lchown(full_path, dfid->uid, gid);
+       if (ret < 0)
+               goto err_out;
+
+       /* append in place; sprintf() with an overlapping source is undefined */
+       sprintf(dfid->path + strlen(dfid->path), "/%s", name);
+       stat2qid(&st, &qid);
+       virtio_p9_pdu_writef(pdu, "Qd", &qid, 0);
+       *outlen = pdu->write_offset;
+       virtio_p9_set_reply_header(pdu, *outlen);
+       free(name);
+       return;
+err_out:
+       free(name);
+       virtio_p9_error_reply(p9dev, pdu, errno, outlen);
+       return;
+}
+
+static void virtio_p9_mkdir(struct p9_dev *p9dev,
+                           struct p9_pdu *pdu, u32 *outlen)
+{
+       int ret;
+       char *name;
+       struct stat st;
+       struct p9_qid qid;
+       struct p9_fid *dfid;
+       char full_path[PATH_MAX];
+       u32 dfid_val, mode, gid;
+
+       virtio_p9_pdu_readf(pdu, "dsdd", &dfid_val,
+                           &name, &mode, &gid);
+       dfid = get_fid(p9dev, dfid_val);
+
+       sprintf(full_path, "%s/%s", dfid->abs_path, name);
+       ret = mkdir(full_path, mode);
+       if (ret < 0)
+               goto err_out;
+
+       if (lstat(full_path, &st) < 0)
+               goto err_out;
+
+       ret = chmod(full_path, mode & 0777);
+       if (ret < 0)
+               goto err_out;
+
+       ret = lchown(full_path, dfid->uid, gid);
+       if (ret < 0)
+               goto err_out;
+
+       stat2qid(&st, &qid);
+       virtio_p9_pdu_writef(pdu, "Qd", &qid, 0);
+       *outlen = pdu->write_offset;
+       virtio_p9_set_reply_header(pdu, *outlen);
+       free(name);
+       return;
+err_out:
+       free(name);
+       virtio_p9_error_reply(p9dev, pdu, errno, outlen);
+       return;
+}
+
+static void virtio_p9_walk(struct p9_dev *p9dev,
+                          struct p9_pdu *pdu, u32 *outlen)
+{
+       u8 i;
+       u16 nwqid;
+       u16 nwname;
+       struct p9_qid wqid;
+       struct p9_fid *new_fid, *old_fid;
+       u32 fid_val, newfid_val;
+
+       virtio_p9_pdu_readf(pdu, "ddw", &fid_val, &newfid_val, &nwname);
+       new_fid = get_fid(p9dev, newfid_val);
+
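+       /*
+        * Twalk: clone the source fid into newfid, then walk nwname path
+        * components, replying with one qid per component that resolves.
+        */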
+       nwqid = 0;
+       if (nwname) {
+               struct p9_fid *fid = get_fid(p9dev, fid_val);
+
+               strcpy(new_fid->path, fid->path);
+               /* skip the space for count */
+               pdu->write_offset += sizeof(u16);
+               for (i = 0; i < nwname; i++) {
+                       struct stat st;
+                       char tmp[PATH_MAX] = {0};
+                       char full_path[PATH_MAX];
+                       char *str;
+
+                       virtio_p9_pdu_readf(pdu, "s", &str);
+
+                       /* Format the new path we're 'walk'ing into */
+                       sprintf(tmp, "%s/%s", new_fid->path, str);
+
+                       free(str);
+
+                       if (lstat(rel_to_abs(p9dev, tmp, full_path), &st) < 0)
+                               goto err_out;
+
+                       stat2qid(&st, &wqid);
+                       strcpy(new_fid->path, tmp);
+                       new_fid->uid = fid->uid;
+                       nwqid++;
+                       virtio_p9_pdu_writef(pdu, "Q", &wqid);
+               }
+       } else {
+               /*
+                * Update write_offset so that outlen gets the correct value.
+                */
+               pdu->write_offset += sizeof(u16);
+               old_fid = get_fid(p9dev, fid_val);
+               strcpy(new_fid->path, old_fid->path);
+               new_fid->uid    = old_fid->uid;
+       }
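+       /*
+        * Two bytes were reserved for the qid count above; rewind to the
+        * header, patch in the final nwqid, and restore outlen from the
+        * offset saved before rewinding.
+        */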
+       *outlen = pdu->write_offset;
+       pdu->write_offset = VIRTIO_9P_HDR_LEN;
+       virtio_p9_pdu_writef(pdu, "d", nwqid);
+       virtio_p9_set_reply_header(pdu, *outlen);
+       return;
+err_out:
+       virtio_p9_error_reply(p9dev, pdu, errno, outlen);
+       return;
+}
+
+static void virtio_p9_attach(struct p9_dev *p9dev,
+                            struct p9_pdu *pdu, u32 *outlen)
+{
+       char *uname;
+       char *aname;
+       struct stat st;
+       struct p9_qid qid;
+       struct p9_fid *fid;
+       u32 fid_val, afid, uid;
+
+       virtio_p9_pdu_readf(pdu, "ddssd", &fid_val, &afid,
+                           &uname, &aname, &uid);
+
+       free(uname);
+       free(aname);
+
+       if (lstat(p9dev->root_dir, &st) < 0)
+               goto err_out;
+
+       stat2qid(&st, &qid);
+
+       fid = get_fid(p9dev, fid_val);
+       fid->uid = uid;
+       strcpy(fid->path, "/");
+
+       virtio_p9_pdu_writef(pdu, "Q", &qid);
+       *outlen = pdu->write_offset;
+       virtio_p9_set_reply_header(pdu, *outlen);
+       return;
+err_out:
+       virtio_p9_error_reply(p9dev, pdu, errno, outlen);
+       return;
+}
+
+static void virtio_p9_fill_stat(struct p9_dev *p9dev,
+                               struct stat *st, struct p9_stat_dotl *statl)
+{
+       memset(statl, 0, sizeof(*statl));
+       statl->st_mode          = st->st_mode;
+       statl->st_nlink         = st->st_nlink;
+       statl->st_uid           = st->st_uid;
+       statl->st_gid           = st->st_gid;
+       statl->st_rdev          = st->st_rdev;
+       statl->st_size          = st->st_size;
+       statl->st_blksize       = st->st_blksize;
+       statl->st_blocks        = st->st_blocks;
+       statl->st_atime_sec     = st->st_atime;
+       statl->st_atime_nsec    = st->st_atim.tv_nsec;
+       statl->st_mtime_sec     = st->st_mtime;
+       statl->st_mtime_nsec    = st->st_mtim.tv_nsec;
+       statl->st_ctime_sec     = st->st_ctime;
+       statl->st_ctime_nsec    = st->st_ctim.tv_nsec;
+       /* Currently we only support BASIC fields in stat */
+       statl->st_result_mask   = P9_STATS_BASIC;
+       stat2qid(st, &statl->qid);
+}
+
+static void virtio_p9_read(struct p9_dev *p9dev,
+                          struct p9_pdu *pdu, u32 *outlen)
+{
+       u64 offset;
+       u32 fid_val;
+       u16 iov_cnt;
+       void *iov_base;
+       size_t iov_len;
+       u32 count, rcount;
+       ssize_t res;
+       struct p9_fid *fid;
+
+       rcount = 0;
+       virtio_p9_pdu_readf(pdu, "dqd", &fid_val, &offset, &count);
+       fid = get_fid(p9dev, fid_val);
+
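+       /*
+        * Rread data goes straight into the guest buffers: skip the reply
+        * header and the 4-byte count in the first iovec so preadv() can
+        * land the payload in place.
+        */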
+       iov_base = pdu->in_iov[0].iov_base;
+       iov_len  = pdu->in_iov[0].iov_len;
+       iov_cnt  = pdu->in_iov_cnt;
+       pdu->in_iov[0].iov_base += VIRTIO_9P_HDR_LEN + sizeof(u32);
+       pdu->in_iov[0].iov_len -= VIRTIO_9P_HDR_LEN + sizeof(u32);
+       pdu->in_iov_cnt = virtio_p9_update_iov_cnt(pdu->in_iov,
+                                                  count,
+                                                  pdu->in_iov_cnt);
+       res = preadv(fid->fd, pdu->in_iov,
+                       pdu->in_iov_cnt, offset);
+       /* preadv() returns ssize_t; treat an error as a zero-length read */
+       rcount = res < 0 ? 0 : (u32)res;
+       if (rcount > count)
+               rcount = count;
+       /*
+        * Update the iov_base back, so that rest of
+        * pdu_writef works correctly.
+        */
+       pdu->in_iov[0].iov_base = iov_base;
+       pdu->in_iov[0].iov_len  = iov_len;
+       pdu->in_iov_cnt         = iov_cnt;
+
+       pdu->write_offset = VIRTIO_9P_HDR_LEN;
+       virtio_p9_pdu_writef(pdu, "d", rcount);
+       *outlen = pdu->write_offset + rcount;
+       virtio_p9_set_reply_header(pdu, *outlen);
+       return;
+}
+
+static int virtio_p9_dentry_size(struct dirent *dent)
+{
+       /*
+        * Size of each dirent:
+        * qid(13) + offset(8) + type(1) + name_len(2) + name
+        */
+       return 24 + strlen(dent->d_name);
+}
+
+static void virtio_p9_readdir(struct p9_dev *p9dev,
+                             struct p9_pdu *pdu, u32 *outlen)
+{
+       u32 fid_val;
+       u32 count, rcount;
+       struct stat st;
+       struct p9_fid *fid;
+       struct dirent *dent;
+       char full_path[PATH_MAX];
+       u64 offset, old_offset;
+
+       rcount = 0;
+       virtio_p9_pdu_readf(pdu, "dqd", &fid_val, &offset, &count);
+       fid = get_fid(p9dev, fid_val);
+
+       if (!is_dir(fid)) {
+               errno = EINVAL;
+               goto err_out;
+       }
+
+       /* Move the offset specified */
+       seekdir(fid->dir, offset);
+
+       old_offset = offset;
+       /* Fill the reply buffer with directory entries */
+       dent = readdir(fid->dir);
+
+       /* Skip the space for writing count */
+       pdu->write_offset += sizeof(u32);
+       while (dent) {
+               u32 read;
+               struct p9_qid qid;
+
+               if ((rcount + virtio_p9_dentry_size(dent)) > count) {
+                       /* seek to the previous offset and return */
+                       seekdir(fid->dir, old_offset);
+                       break;
+               }
+               old_offset = dent->d_off;
+               lstat(rel_to_abs(p9dev, dent->d_name, full_path), &st);
+               stat2qid(&st, &qid);
+               read = pdu->write_offset;
+               virtio_p9_pdu_writef(pdu, "Qqbs", &qid, dent->d_off,
+                                    dent->d_type, dent->d_name);
+               rcount += pdu->write_offset - read;
+               dent = readdir(fid->dir);
+       }
+
+       pdu->write_offset = VIRTIO_9P_HDR_LEN;
+       virtio_p9_pdu_writef(pdu, "d", rcount);
+       *outlen = pdu->write_offset + rcount;
+       virtio_p9_set_reply_header(pdu, *outlen);
+       return;
+err_out:
+       virtio_p9_error_reply(p9dev, pdu, errno, outlen);
+       return;
+}
+
+static void virtio_p9_getattr(struct p9_dev *p9dev,
+                             struct p9_pdu *pdu, u32 *outlen)
+{
+       u32 fid_val;
+       struct stat st;
+       u64 request_mask;
+       struct p9_fid *fid;
+       struct p9_stat_dotl statl;
+
+       virtio_p9_pdu_readf(pdu, "dq", &fid_val, &request_mask);
+       fid = get_fid(p9dev, fid_val);
+       if (lstat(fid->abs_path, &st) < 0)
+               goto err_out;
+
+       virtio_p9_fill_stat(p9dev, &st, &statl);
+       virtio_p9_pdu_writef(pdu, "A", &statl);
+       *outlen = pdu->write_offset;
+       virtio_p9_set_reply_header(pdu, *outlen);
+       return;
+err_out:
+       virtio_p9_error_reply(p9dev, pdu, errno, outlen);
+       return;
+}
+
+/* FIXME!! from linux/fs.h */
+/*
+ * Attribute flags.  These should be or-ed together to figure out what
+ * has been changed!
+ */
+#define ATTR_MODE      (1 << 0)
+#define ATTR_UID       (1 << 1)
+#define ATTR_GID       (1 << 2)
+#define ATTR_SIZE      (1 << 3)
+#define ATTR_ATIME     (1 << 4)
+#define ATTR_MTIME     (1 << 5)
+#define ATTR_CTIME     (1 << 6)
+#define ATTR_ATIME_SET (1 << 7)
+#define ATTR_MTIME_SET (1 << 8)
+#define ATTR_FORCE     (1 << 9) /* Not a change, but a change it */
+#define ATTR_ATTR_FLAG (1 << 10)
+#define ATTR_KILL_SUID (1 << 11)
+#define ATTR_KILL_SGID (1 << 12)
+#define ATTR_FILE      (1 << 13)
+#define ATTR_KILL_PRIV (1 << 14)
+#define ATTR_OPEN      (1 << 15) /* Truncating from open(O_TRUNC) */
+#define ATTR_TIMES_SET (1 << 16)
+
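+/* The attribute bits (ATTR_MODE..ATTR_CTIME) inspected by setattr below */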
+#define ATTR_MASK    127
+
+static void virtio_p9_setattr(struct p9_dev *p9dev,
+                             struct p9_pdu *pdu, u32 *outlen)
+{
+       int ret = 0;
+       u32 fid_val;
+       struct p9_fid *fid;
+       struct p9_iattr_dotl p9attr;
+
+       virtio_p9_pdu_readf(pdu, "dI", &fid_val, &p9attr);
+       fid = get_fid(p9dev, fid_val);
+
+       if (p9attr.valid & ATTR_MODE) {
+               ret = chmod(fid->abs_path, p9attr.mode);
+               if (ret < 0)
+                       goto err_out;
+       }
+       if (p9attr.valid & (ATTR_ATIME | ATTR_MTIME)) {
+               struct timespec times[2];
+               if (p9attr.valid & ATTR_ATIME) {
+                       if (p9attr.valid & ATTR_ATIME_SET) {
+                               times[0].tv_sec = p9attr.atime_sec;
+                               times[0].tv_nsec = p9attr.atime_nsec;
+                       } else {
+                               times[0].tv_nsec = UTIME_NOW;
+                       }
+               } else {
+                       times[0].tv_nsec = UTIME_OMIT;
+               }
+               if (p9attr.valid & ATTR_MTIME) {
+                       if (p9attr.valid & ATTR_MTIME_SET) {
+                               times[1].tv_sec = p9attr.mtime_sec;
+                               times[1].tv_nsec = p9attr.mtime_nsec;
+                       } else {
+                               times[1].tv_nsec = UTIME_NOW;
+                       }
+               } else {
+                       times[1].tv_nsec = UTIME_OMIT;
+               }
+
+               ret = utimensat(-1, fid->abs_path, times, AT_SYMLINK_NOFOLLOW);
+               if (ret < 0)
+                       goto err_out;
+       }
+       /*
+        * If the only valid entry in iattr is ctime we can call
+        * chown(-1,-1) to update the ctime of the file
+        */
+       if ((p9attr.valid & (ATTR_UID | ATTR_GID)) ||
+           ((p9attr.valid & ATTR_CTIME)
+            && !((p9attr.valid & ATTR_MASK) & ~ATTR_CTIME))) {
+               if (!(p9attr.valid & ATTR_UID))
+                       p9attr.uid = -1;
+
+               if (!(p9attr.valid & ATTR_GID))
+                       p9attr.gid = -1;
+
+               ret = lchown(fid->abs_path, p9attr.uid, p9attr.gid);
+               if (ret < 0)
+                       goto err_out;
+       }
+       if (p9attr.valid & (ATTR_SIZE)) {
+               ret = truncate(fid->abs_path, p9attr.size);
+               if (ret < 0)
+                       goto err_out;
+       }
+       *outlen = VIRTIO_9P_HDR_LEN;
+       virtio_p9_set_reply_header(pdu, *outlen);
+       return;
+err_out:
+       virtio_p9_error_reply(p9dev, pdu, errno, outlen);
+       return;
+}
+
+static void virtio_p9_write(struct p9_dev *p9dev,
+                           struct p9_pdu *pdu, u32 *outlen)
+{
+       u64 offset;
+       u32 fid_val;
+       u32 count;
+       ssize_t res;
+       u16 iov_cnt;
+       void *iov_base;
+       size_t iov_len;
+       struct p9_fid *fid;
+       /* u32 fid + u64 offset + u32 count */
+       int twrite_size = sizeof(u32) + sizeof(u64) + sizeof(u32);
+
+       virtio_p9_pdu_readf(pdu, "dqd", &fid_val, &offset, &count);
+       fid = get_fid(p9dev, fid_val);
+
+       iov_base = pdu->out_iov[0].iov_base;
+       iov_len  = pdu->out_iov[0].iov_len;
+       iov_cnt  = pdu->out_iov_cnt;
+
+       /* Adjust the iovec to skip the header and metadata */
+       pdu->out_iov[0].iov_base += (sizeof(struct p9_msg) + twrite_size);
+       pdu->out_iov[0].iov_len -=  (sizeof(struct p9_msg) + twrite_size);
+       pdu->out_iov_cnt = virtio_p9_update_iov_cnt(pdu->out_iov, count,
+                                                   pdu->out_iov_cnt);
+       res = pwritev(fid->fd, pdu->out_iov, pdu->out_iov_cnt, offset);
+       /*
+        * Update the iov_base back, so that rest of
+        * pdu_readf works correctly.
+        */
+       pdu->out_iov[0].iov_base = iov_base;
+       pdu->out_iov[0].iov_len  = iov_len;
+       pdu->out_iov_cnt         = iov_cnt;
+
+       if (res < 0)
+               goto err_out;
+       virtio_p9_pdu_writef(pdu, "d", res);
+       *outlen = pdu->write_offset;
+       virtio_p9_set_reply_header(pdu, *outlen);
+       return;
+err_out:
+       virtio_p9_error_reply(p9dev, pdu, errno, outlen);
+       return;
+}
+
+static void virtio_p9_remove(struct p9_dev *p9dev,
+                            struct p9_pdu *pdu, u32 *outlen)
+{
+       int ret;
+       u32 fid_val;
+       struct p9_fid *fid;
+
+       virtio_p9_pdu_readf(pdu, "d", &fid_val);
+       fid = get_fid(p9dev, fid_val);
+
+       ret = remove(fid->abs_path);
+       if (ret < 0)
+               goto err_out;
+       *outlen = pdu->write_offset;
+       virtio_p9_set_reply_header(pdu, *outlen);
+       return;
+
+err_out:
+       virtio_p9_error_reply(p9dev, pdu, errno, outlen);
+       return;
+}
+
+static void virtio_p9_rename(struct p9_dev *p9dev,
+                            struct p9_pdu *pdu, u32 *outlen)
+{
+       int ret;
+       u32 fid_val, new_fid_val;
+       struct p9_fid *fid, *new_fid;
+       char full_path[PATH_MAX], *new_name;
+
+       virtio_p9_pdu_readf(pdu, "dds", &fid_val, &new_fid_val, &new_name);
+       fid = get_fid(p9dev, fid_val);
+       new_fid = get_fid(p9dev, new_fid_val);
+
+       sprintf(full_path, "%s/%s", new_fid->abs_path, new_name);
+       ret = rename(fid->abs_path, full_path);
+       if (ret < 0)
+               goto err_out;
+       *outlen = pdu->write_offset;
+       virtio_p9_set_reply_header(pdu, *outlen);
+       return;
+
+err_out:
+       virtio_p9_error_reply(p9dev, pdu, errno, outlen);
+       return;
+}
+
+static void virtio_p9_readlink(struct p9_dev *p9dev,
+                              struct p9_pdu *pdu, u32 *outlen)
+{
+       int ret;
+       u32 fid_val;
+       struct p9_fid *fid;
+       char target_path[PATH_MAX];
+
+       virtio_p9_pdu_readf(pdu, "d", &fid_val);
+       fid = get_fid(p9dev, fid_val);
+
+       memset(target_path, 0, PATH_MAX);
+       ret = readlink(fid->abs_path, target_path, PATH_MAX - 1);
+       if (ret < 0)
+               goto err_out;
+
+       virtio_p9_pdu_writef(pdu, "s", target_path);
+       *outlen = pdu->write_offset;
+       virtio_p9_set_reply_header(pdu, *outlen);
+       return;
+err_out:
+       virtio_p9_error_reply(p9dev, pdu, errno, outlen);
+       return;
+}
+
+static void virtio_p9_statfs(struct p9_dev *p9dev,
+                            struct p9_pdu *pdu, u32 *outlen)
+{
+       int ret;
+       u64 fsid;
+       u32 fid_val;
+       struct p9_fid *fid;
+       struct statfs stat_buf;
+
+       virtio_p9_pdu_readf(pdu, "d", &fid_val);
+       fid = get_fid(p9dev, fid_val);
+
+       ret = statfs(fid->abs_path, &stat_buf);
+       if (ret < 0)
+               goto err_out;
+       /* FIXME!! f_blocks needs update based on client msize */
+       fsid = (unsigned int) stat_buf.f_fsid.__val[0] |
+               (unsigned long long)stat_buf.f_fsid.__val[1] << 32;
+       virtio_p9_pdu_writef(pdu, "ddqqqqqqd", stat_buf.f_type,
+                            stat_buf.f_bsize, stat_buf.f_blocks,
+                            stat_buf.f_bfree, stat_buf.f_bavail,
+                            stat_buf.f_files, stat_buf.f_ffree,
+                            fsid, stat_buf.f_namelen);
+       *outlen = pdu->write_offset;
+       virtio_p9_set_reply_header(pdu, *outlen);
+       return;
+err_out:
+       virtio_p9_error_reply(p9dev, pdu, errno, outlen);
+       return;
+}
+
+static void virtio_p9_mknod(struct p9_dev *p9dev,
+                           struct p9_pdu *pdu, u32 *outlen)
+{
+       int ret;
+       char *name;
+       struct stat st;
+       struct p9_fid *dfid;
+       struct p9_qid qid;
+       char full_path[PATH_MAX];
+       u32 fid_val, mode, major, minor, gid;
+
+       virtio_p9_pdu_readf(pdu, "dsdddd", &fid_val, &name, &mode,
+                           &major, &minor, &gid);
+
+       dfid = get_fid(p9dev, fid_val);
+       sprintf(full_path, "%s/%s", dfid->abs_path, name);
+       ret = mknod(full_path, mode, makedev(major, minor));
+       if (ret < 0)
+               goto err_out;
+
+       if (lstat(full_path, &st) < 0)
+               goto err_out;
+
+       ret = chmod(full_path, mode & 0777);
+       if (ret < 0)
+               goto err_out;
+
+       ret = lchown(full_path, dfid->uid, gid);
+       if (ret < 0)
+               goto err_out;
+
+       stat2qid(&st, &qid);
+       virtio_p9_pdu_writef(pdu, "Q", &qid);
+       free(name);
+       *outlen = pdu->write_offset;
+       virtio_p9_set_reply_header(pdu, *outlen);
+       return;
+err_out:
+       free(name);
+       virtio_p9_error_reply(p9dev, pdu, errno, outlen);
+       return;
+}
+
+static void virtio_p9_fsync(struct p9_dev *p9dev,
+                           struct p9_pdu *pdu, u32 *outlen)
+{
+       int ret;
+       struct p9_fid *fid;
+       u32 fid_val, datasync;
+
+       virtio_p9_pdu_readf(pdu, "dd", &fid_val, &datasync);
+       fid = get_fid(p9dev, fid_val);
+
+       if (datasync)
+               ret = fdatasync(fid->fd);
+       else
+               ret = fsync(fid->fd);
+       if (ret < 0)
+               goto err_out;
+       *outlen = pdu->write_offset;
+       virtio_p9_set_reply_header(pdu, *outlen);
+       return;
+err_out:
+       virtio_p9_error_reply(p9dev, pdu, errno, outlen);
+       return;
+}
+
+static void virtio_p9_symlink(struct p9_dev *p9dev,
+                             struct p9_pdu *pdu, u32 *outlen)
+{
+       int ret;
+       struct stat st;
+       u32 fid_val, gid;
+       struct p9_qid qid;
+       struct p9_fid *dfid;
+       char new_name[PATH_MAX];
+       char *old_path, *name;
+
+       virtio_p9_pdu_readf(pdu, "dssd", &fid_val, &name, &old_path, &gid);
+
+       dfid = get_fid(p9dev, fid_val);
+       sprintf(new_name, "%s/%s", dfid->abs_path, name);
+       ret = symlink(old_path, new_name);
+       if (ret < 0)
+               goto err_out;
+
+       if (lstat(new_name, &st) < 0)
+               goto err_out;
+
+       ret = lchown(new_name, dfid->uid, gid);
+       if (ret < 0)
+               goto err_out;
+
+       stat2qid(&st, &qid);
+       virtio_p9_pdu_writef(pdu, "Q", &qid);
+       free(name);
+       free(old_path);
+       *outlen = pdu->write_offset;
+       virtio_p9_set_reply_header(pdu, *outlen);
+       return;
+err_out:
+       free(name);
+       free(old_path);
+       virtio_p9_error_reply(p9dev, pdu, errno, outlen);
+       return;
+}
+
+static void virtio_p9_link(struct p9_dev *p9dev,
+                          struct p9_pdu *pdu, u32 *outlen)
+{
+       int ret;
+       char *name;
+       u32 fid_val, dfid_val;
+       struct p9_fid *dfid, *fid;
+       char full_path[PATH_MAX];
+
+       virtio_p9_pdu_readf(pdu, "dds", &dfid_val, &fid_val, &name);
+
+       dfid = get_fid(p9dev, dfid_val);
+       fid =  get_fid(p9dev, fid_val);
+       sprintf(full_path, "%s/%s", dfid->abs_path, name);
+       ret = link(fid->abs_path, full_path);
+       if (ret < 0)
+               goto err_out;
+       free(name);
+       *outlen = pdu->write_offset;
+       virtio_p9_set_reply_header(pdu, *outlen);
+       return;
+err_out:
+       free(name);
+       virtio_p9_error_reply(p9dev, pdu, errno, outlen);
+       return;
+}
+
+static void virtio_p9_lock(struct p9_dev *p9dev,
+                          struct p9_pdu *pdu, u32 *outlen)
+{
+       u8 ret;
+       u32 fid_val;
+       struct p9_flock flock;
+
+       virtio_p9_pdu_readf(pdu, "dbdqqds", &fid_val, &flock.type,
+                           &flock.flags, &flock.start, &flock.length,
+                           &flock.proc_id, &flock.client_id);
+
+       /*
+        * Advisory locking is not implemented; unconditionally report
+        * success.  Rlock carries a single status byte.
+        */
+       ret = P9_LOCK_SUCCESS;
+       virtio_p9_pdu_writef(pdu, "b", ret);
+       *outlen = pdu->write_offset;
+       virtio_p9_set_reply_header(pdu, *outlen);
+       free(flock.client_id);
+       return;
+}
+
+static void virtio_p9_getlock(struct p9_dev *p9dev,
+                             struct p9_pdu *pdu, u32 *outlen)
+{
+       u32 fid_val;
+       struct p9_getlock glock;
+       virtio_p9_pdu_readf(pdu, "dbqqds", &fid_val, &glock.type,
+                           &glock.start, &glock.length, &glock.proc_id,
+                           &glock.client_id);
+
+       /* No lock state is tracked; report the requested range as unlocked */
+       glock.type = F_UNLCK;
+       virtio_p9_pdu_writef(pdu, "bqqds", glock.type,
+                            glock.start, glock.length, glock.proc_id,
+                            glock.client_id);
+       *outlen = pdu->write_offset;
+       virtio_p9_set_reply_header(pdu, *outlen);
+       free(glock.client_id);
+       return;
+}
+
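+/*
+ * Return 1 if @ancestor is a path prefix of @path on a directory
+ * boundary: "/a/b" is an ancestor of "/a/b/c", but not of "/a/bc".
+ */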
+static int virtio_p9_ancestor(char *path, char *ancestor)
+{
+       int size = strlen(ancestor);
+       if (!strncmp(path, ancestor, size)) {
+               /*
+                * Check that the match ends on a directory boundary, so
+                * that ancestor is a whole component and not just part
+                * of a name.
+                */
+               if (path[size] == '\0' || path[size] == '/')
+                       return 1;
+       }
+       return 0;
+}
+
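+/* Rewrite the leading @old_name component of @fid_path to @new_name */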
+static void virtio_p9_fix_path(char *fid_path, char *old_name, char *new_name)
+{
+       char tmp_name[PATH_MAX];
+       size_t rp_sz = strlen(old_name);
+
+       if (rp_sz == strlen(fid_path)) {
+               /* replace the full name */
+               strcpy(fid_path, new_name);
+               return;
+       }
+       /* save the trailing path details */
+       strcpy(tmp_name, fid_path + rp_sz);
+       sprintf(fid_path, "%s%s", new_name, tmp_name);
+       return;
+}
+
+static void rename_fids(struct p9_dev *p9dev, char *old_name, char *new_name)
+{
+       struct rb_node *node = rb_first(&p9dev->fids);
+
+       while (node) {
+               struct p9_fid *fid = rb_entry(node, struct p9_fid, node);
+
+               if (fid->fid != P9_NOFID &&
+                   virtio_p9_ancestor(fid->path, old_name))
+                       virtio_p9_fix_path(fid->path, old_name, new_name);
+               node = rb_next(node);
+       }
+}
+
+static void virtio_p9_renameat(struct p9_dev *p9dev,
+                              struct p9_pdu *pdu, u32 *outlen)
+{
+       int ret;
+       char *old_name, *new_name;
+       u32 old_dfid_val, new_dfid_val;
+       struct p9_fid *old_dfid, *new_dfid;
+       char old_full_path[PATH_MAX], new_full_path[PATH_MAX];
+
+       virtio_p9_pdu_readf(pdu, "dsds", &old_dfid_val, &old_name,
+                           &new_dfid_val, &new_name);
+
+       old_dfid = get_fid(p9dev, old_dfid_val);
+       new_dfid = get_fid(p9dev, new_dfid_val);
+
+       sprintf(old_full_path, "%s/%s", old_dfid->abs_path, old_name);
+       sprintf(new_full_path, "%s/%s", new_dfid->abs_path, new_name);
+       ret = rename(old_full_path, new_full_path);
+       if (ret < 0)
+               goto err_out;
+       /*
+        * Now fix path in other fids, if the renamed path is part of
+        * that.
+        */
+       rename_fids(p9dev, old_name, new_name);
+       free(old_name);
+       free(new_name);
+       *outlen = pdu->write_offset;
+       virtio_p9_set_reply_header(pdu, *outlen);
+       return;
+err_out:
+       free(old_name);
+       free(new_name);
+       virtio_p9_error_reply(p9dev, pdu, errno, outlen);
+       return;
+}
+
+static void virtio_p9_unlinkat(struct p9_dev *p9dev,
+                              struct p9_pdu *pdu, u32 *outlen)
+{
+       int ret;
+       char *name;
+       u32 fid_val, flags;
+       struct p9_fid *fid;
+       char full_path[PATH_MAX];
+
+       virtio_p9_pdu_readf(pdu, "dsd", &fid_val, &name, &flags);
+       fid = get_fid(p9dev, fid_val);
+
+       sprintf(full_path, "%s/%s", fid->abs_path, name);
+       ret = remove(full_path);
+       if (ret < 0)
+               goto err_out;
+       free(name);
+       *outlen = pdu->write_offset;
+       virtio_p9_set_reply_header(pdu, *outlen);
+       return;
+err_out:
+       free(name);
+       virtio_p9_error_reply(p9dev, pdu, errno, outlen);
+       return;
+}
+
+static void virtio_p9_flush(struct p9_dev *p9dev,
+                               struct p9_pdu *pdu, u32 *outlen)
+{
+       u16 tag, oldtag;
+
+       virtio_p9_pdu_readf(pdu, "ww", &tag, &oldtag);
+       virtio_p9_pdu_writef(pdu, "w", tag);
+       *outlen = pdu->write_offset;
+       virtio_p9_set_reply_header(pdu, *outlen);
+
+       return;
+}
+
+static void virtio_p9_eopnotsupp(struct p9_dev *p9dev,
+                                struct p9_pdu *pdu, u32 *outlen)
+{
+       virtio_p9_error_reply(p9dev, pdu, EOPNOTSUPP, outlen);
+}
+
+typedef void p9_handler(struct p9_dev *p9dev,
+                       struct p9_pdu *pdu, u32 *outlen);
+
+/* FIXME should be removed when merging with latest linus tree */
+#define P9_TRENAMEAT 74
+#define P9_TUNLINKAT 76
+
+static p9_handler *virtio_9p_dotl_handler [] = {
+       [P9_TREADDIR]     = virtio_p9_readdir,
+       [P9_TSTATFS]      = virtio_p9_statfs,
+       [P9_TGETATTR]     = virtio_p9_getattr,
+       [P9_TSETATTR]     = virtio_p9_setattr,
+       [P9_TXATTRWALK]   = virtio_p9_eopnotsupp,
+       [P9_TXATTRCREATE] = virtio_p9_eopnotsupp,
+       [P9_TMKNOD]       = virtio_p9_mknod,
+       [P9_TLOCK]        = virtio_p9_lock,
+       [P9_TGETLOCK]     = virtio_p9_getlock,
+       [P9_TRENAMEAT]    = virtio_p9_renameat,
+       [P9_TREADLINK]    = virtio_p9_readlink,
+       [P9_TUNLINKAT]    = virtio_p9_unlinkat,
+       [P9_TMKDIR]       = virtio_p9_mkdir,
+       [P9_TVERSION]     = virtio_p9_version,
+       [P9_TLOPEN]       = virtio_p9_open,
+       [P9_TATTACH]      = virtio_p9_attach,
+       [P9_TWALK]        = virtio_p9_walk,
+       [P9_TCLUNK]       = virtio_p9_clunk,
+       [P9_TFSYNC]       = virtio_p9_fsync,
+       [P9_TREAD]        = virtio_p9_read,
+       [P9_TFLUSH]       = virtio_p9_flush,
+       [P9_TLINK]        = virtio_p9_link,
+       [P9_TSYMLINK]     = virtio_p9_symlink,
+       [P9_TLCREATE]     = virtio_p9_create,
+       [P9_TWRITE]       = virtio_p9_write,
+       [P9_TREMOVE]      = virtio_p9_remove,
+       [P9_TRENAME]      = virtio_p9_rename,
+};
+
+static struct p9_pdu *virtio_p9_pdu_init(struct kvm *kvm, struct virt_queue *vq)
+{
+       struct p9_pdu *pdu = calloc(1, sizeof(*pdu));
+       if (!pdu)
+               return NULL;
+
+       /* skip the pdu header p9_msg */
+       pdu->read_offset        = VIRTIO_9P_HDR_LEN;
+       pdu->write_offset       = VIRTIO_9P_HDR_LEN;
+       pdu->queue_head         = virt_queue__get_inout_iov(kvm, vq, pdu->in_iov,
+                                       pdu->out_iov, &pdu->in_iov_cnt, &pdu->out_iov_cnt);
+       return pdu;
+}
+
+static u8 virtio_p9_get_cmd(struct p9_pdu *pdu)
+{
+       struct p9_msg *msg;
+       /*
+        * We can peek directly into the pdu for a u8 value; host
+        * endianness is not an issue for a single byte.
+        */
+       msg = pdu->out_iov[0].iov_base;
+       return msg->cmd;
+}
+
+static bool virtio_p9_do_io_request(struct kvm *kvm, struct p9_dev_job *job)
+{
+       u8 cmd;
+       u32 len = 0;
+       p9_handler *handler;
+       struct p9_dev *p9dev;
+       struct virt_queue *vq;
+       struct p9_pdu *p9pdu;
+
+       vq = job->vq;
+       p9dev = job->p9dev;
+
+       p9pdu = virtio_p9_pdu_init(kvm, vq);
+       cmd = virtio_p9_get_cmd(p9pdu);
+
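+       /* Dispatch on the 9P command byte; unknown commands get EOPNOTSUPP */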
+       if ((cmd >= ARRAY_SIZE(virtio_9p_dotl_handler)) ||
+           !virtio_9p_dotl_handler[cmd])
+               handler = virtio_p9_eopnotsupp;
+       else
+               handler = virtio_9p_dotl_handler[cmd];
+
+       handler(p9dev, p9pdu, &len);
+       virt_queue__set_used_elem(vq, p9pdu->queue_head, len);
+       free(p9pdu);
+       return true;
+}
+
+static void virtio_p9_do_io(struct kvm *kvm, void *param)
+{
+       struct p9_dev_job *job = (struct p9_dev_job *)param;
+       struct p9_dev *p9dev   = job->p9dev;
+       struct virt_queue *vq  = job->vq;
+
+       while (virt_queue__available(vq)) {
+               virtio_p9_do_io_request(kvm, job);
+               p9dev->vdev.ops->signal_vq(kvm, &p9dev->vdev, vq - p9dev->vqs);
+       }
+}
+
+static u8 *get_config(struct kvm *kvm, void *dev)
+{
+       struct p9_dev *p9dev = dev;
+
+       return ((u8 *)(p9dev->config));
+}
+
+static u32 get_host_features(struct kvm *kvm, void *dev)
+{
+       return 1 << VIRTIO_9P_MOUNT_TAG;
+}
+
+static void set_guest_features(struct kvm *kvm, void *dev, u32 features)
+{
+       struct p9_dev *p9dev = dev;
+
+       p9dev->features = features;
+}
+
+static int init_vq(struct kvm *kvm, void *dev, u32 vq, u32 pfn)
+{
+       struct p9_dev *p9dev = dev;
+       struct p9_dev_job *job;
+       struct virt_queue *queue;
+       void *p;
+
+       compat__remove_message(compat_id);
+
+       queue           = &p9dev->vqs[vq];
+       queue->pfn      = pfn;
+       p               = guest_pfn_to_host(kvm, queue->pfn);
+       job             = &p9dev->jobs[vq];
+
+       vring_init(&queue->vring, VIRTQUEUE_NUM, p, VIRTIO_PCI_VRING_ALIGN);
+
+       *job            = (struct p9_dev_job) {
+               .vq             = queue,
+               .p9dev          = p9dev,
+       };
+       thread_pool__init_job(&job->job_id, kvm, virtio_p9_do_io, job);
+
+       return 0;
+}
+
+static int notify_vq(struct kvm *kvm, void *dev, u32 vq)
+{
+       struct p9_dev *p9dev = dev;
+
+       thread_pool__do_job(&p9dev->jobs[vq].job_id);
+
+       return 0;
+}
+
+static int get_pfn_vq(struct kvm *kvm, void *dev, u32 vq)
+{
+       struct p9_dev *p9dev = dev;
+
+       return p9dev->vqs[vq].pfn;
+}
+
+static int get_size_vq(struct kvm *kvm, void *dev, u32 vq)
+{
+       return VIRTQUEUE_NUM;
+}
+
+struct virtio_ops p9_dev_virtio_ops = (struct virtio_ops) {
+       .get_config             = get_config,
+       .get_host_features      = get_host_features,
+       .set_guest_features     = set_guest_features,
+       .init_vq                = init_vq,
+       .notify_vq              = notify_vq,
+       .get_pfn_vq             = get_pfn_vq,
+       .get_size_vq            = get_size_vq,
+};
+
+int virtio_9p__init(struct kvm *kvm)
+{
+       struct p9_dev *p9dev;
+
+       list_for_each_entry(p9dev, &devs, list) {
+               virtio_init(kvm, p9dev, &p9dev->vdev, &p9_dev_virtio_ops,
+                           VIRTIO_PCI, PCI_DEVICE_ID_VIRTIO_9P, VIRTIO_ID_9P, PCI_CLASS_9P);
+       }
+
+       return 0;
+}
+
+int virtio_9p__register(struct kvm *kvm, const char *root, const char *tag_name)
+{
+       struct p9_dev *p9dev;
+       int err = 0;
+
+       p9dev = calloc(1, sizeof(*p9dev));
+       if (!p9dev)
+               return -ENOMEM;
+
+       if (!tag_name)
+               tag_name = VIRTIO_9P_DEFAULT_TAG;
+
+       p9dev->config = calloc(1, sizeof(*p9dev->config) + strlen(tag_name) + 1);
+       if (p9dev->config == NULL) {
+               err = -ENOMEM;
+               goto free_p9dev;
+       }
+
+       strcpy(p9dev->root_dir, root);
+       p9dev->config->tag_len = strlen(tag_name);
+       if (p9dev->config->tag_len > MAX_TAG_LEN) {
+               err = -EINVAL;
+               goto free_p9dev_config;
+       }
+
+       memcpy(&p9dev->config->tag, tag_name, strlen(tag_name));
+
+       list_add(&p9dev->list, &devs);
+
+       if (compat_id == -1)
+               compat_id = virtio_compat_add_message("virtio-9p", "CONFIG_NET_9P_VIRTIO");
+
+       return err;
+
+free_p9dev_config:
+       free(p9dev->config);
+free_p9dev:
+       free(p9dev);
+       return err;
+}
diff --git a/tools/kvm/virtio/balloon.c b/tools/kvm/virtio/balloon.c
new file mode 100644 (file)
index 0000000..ea64fd4
--- /dev/null
@@ -0,0 +1,259 @@
+#include "kvm/virtio-balloon.h"
+
+#include "kvm/virtio-pci-dev.h"
+
+#include "kvm/virtio.h"
+#include "kvm/util.h"
+#include "kvm/kvm.h"
+#include "kvm/pci.h"
+#include "kvm/threadpool.h"
+#include "kvm/guest_compat.h"
+#include "kvm/kvm-ipc.h"
+
+#include <linux/virtio_ring.h>
+#include <linux/virtio_balloon.h>
+
+#include <linux/kernel.h>
+#include <linux/list.h>
+#include <fcntl.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <sys/mman.h>
+#include <pthread.h>
+#include <sys/eventfd.h>
+
+#define NUM_VIRT_QUEUES                3
+#define VIRTIO_BLN_QUEUE_SIZE  128
+#define VIRTIO_BLN_INFLATE     0
+#define VIRTIO_BLN_DEFLATE     1
+#define VIRTIO_BLN_STATS       2
+
+struct bln_dev {
+       struct list_head        list;
+       struct virtio_device    vdev;
+
+       u32                     features;
+
+       /* virtio queue */
+       struct virt_queue       vqs[NUM_VIRT_QUEUES];
+       struct thread_pool__job jobs[NUM_VIRT_QUEUES];
+
+       struct virtio_balloon_stat stats[VIRTIO_BALLOON_S_NR];
+       struct virtio_balloon_stat *cur_stat;
+       u32                     cur_stat_head;
+       u16                     stat_count;
+       int                     stat_waitfd;
+
+       struct virtio_balloon_config config;
+};
+
+static struct bln_dev bdev;
+extern struct kvm *kvm;
+static int compat_id = -1;
+
+static bool virtio_bln_do_io_request(struct kvm *kvm, struct bln_dev *bdev, struct virt_queue *queue)
+{
+       struct iovec iov[VIRTIO_BLN_QUEUE_SIZE];
+       unsigned int len = 0;
+       u16 out, in, head;
+       u32 *ptrs, i;
+
+       head    = virt_queue__get_iov(queue, iov, &out, &in, kvm);
+       ptrs    = iov[0].iov_base;
+       len     = iov[0].iov_len / sizeof(u32);
+
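+       /*
+        * Each element is a guest page frame number.  Inflate requests
+        * release the backing host pages with MADV_DONTNEED; deflate
+        * requests just adjust the balloon size accounting.
+        */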
+       for (i = 0 ; i < len ; i++) {
+               void *guest_ptr;
+
+               guest_ptr = guest_flat_to_host(kvm, ptrs[i] << VIRTIO_BALLOON_PFN_SHIFT);
+               if (queue == &bdev->vqs[VIRTIO_BLN_INFLATE]) {
+                       madvise(guest_ptr, 1 << VIRTIO_BALLOON_PFN_SHIFT, MADV_DONTNEED);
+                       bdev->config.actual++;
+               } else if (queue == &bdev->vqs[VIRTIO_BLN_DEFLATE]) {
+                       bdev->config.actual--;
+               }
+       }
+
+       virt_queue__set_used_elem(queue, head, len);
+
+       return true;
+}
+
+static bool virtio_bln_do_stat_request(struct kvm *kvm, struct bln_dev *bdev, struct virt_queue *queue)
+{
+       struct iovec iov[VIRTIO_BLN_QUEUE_SIZE];
+       u16 out, in, head;
+       struct virtio_balloon_stat *stat;
+       u64 wait_val = 1;
+
+       head = virt_queue__get_iov(queue, iov, &out, &in, kvm);
+       stat = iov[0].iov_base;
+
+       /* Initial empty stat buffer */
+       if (bdev->cur_stat == NULL) {
+               bdev->cur_stat = stat;
+               bdev->cur_stat_head = head;
+
+               return true;
+       }
+
+       memcpy(bdev->stats, stat, iov[0].iov_len);
+
+       bdev->stat_count = iov[0].iov_len / sizeof(struct virtio_balloon_stat);
+       bdev->cur_stat = stat;
+       bdev->cur_stat_head = head;
+
+       if (write(bdev->stat_waitfd, &wait_val, sizeof(wait_val)) <= 0)
+               return false;
+
+       return true;
+}
+
+static void virtio_bln_do_io(struct kvm *kvm, void *param)
+{
+       struct virt_queue *vq = param;
+
+       if (vq == &bdev.vqs[VIRTIO_BLN_STATS]) {
+               virtio_bln_do_stat_request(kvm, &bdev, vq);
+               bdev.vdev.ops->signal_vq(kvm, &bdev.vdev, VIRTIO_BLN_STATS);
+               return;
+       }
+
+       while (virt_queue__available(vq)) {
+               virtio_bln_do_io_request(kvm, &bdev, vq);
+               bdev.vdev.ops->signal_vq(kvm, &bdev.vdev, vq - bdev.vqs);
+       }
+}
+
+static int virtio_bln__collect_stats(void)
+{
+       u64 tmp;
+
+       virt_queue__set_used_elem(&bdev.vqs[VIRTIO_BLN_STATS], bdev.cur_stat_head,
+                                 sizeof(struct virtio_balloon_stat));
+       bdev.vdev.ops->signal_vq(kvm, &bdev.vdev, VIRTIO_BLN_STATS);
+
+       if (read(bdev.stat_waitfd, &tmp, sizeof(tmp)) <= 0)
+               return -EFAULT;
+
+       return 0;
+}
+
+static void virtio_bln__print_stats(int fd, u32 type, u32 len, u8 *msg)
+{
+       int r;
+
+       if (WARN_ON(type != KVM_IPC_STAT || len))
+               return;
+
+       if (virtio_bln__collect_stats() < 0)
+               return;
+
+       r = write(fd, bdev.stats, sizeof(bdev.stats));
+       if (r < 0)
+               pr_warning("Failed sending memory stats");
+}
+
+static void handle_mem(int fd, u32 type, u32 len, u8 *msg)
+{
+       int mem;
+
+       if (WARN_ON(type != KVM_IPC_BALLOON || len != sizeof(int)))
+               return;
+
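+       /* 'mem' is in megabytes: 256 balloon pages of 4KB each per MB */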
+       mem = *(int *)msg;
+       if (mem > 0) {
+               bdev.config.num_pages += 256 * mem;
+       } else if (mem < 0) {
+               if (bdev.config.num_pages < (u32)(256 * (-mem)))
+                       return;
+
+               bdev.config.num_pages += 256 * mem;
+       }
+
+       /* Notify that the configuration space has changed */
+       bdev.vdev.ops->signal_config(kvm, &bdev.vdev);
+}
+
+static u8 *get_config(struct kvm *kvm, void *dev)
+{
+       struct bln_dev *bdev = dev;
+
+       return ((u8 *)(&bdev->config));
+}
+
+static u32 get_host_features(struct kvm *kvm, void *dev)
+{
+       return 1 << VIRTIO_BALLOON_F_STATS_VQ;
+}
+
+static void set_guest_features(struct kvm *kvm, void *dev, u32 features)
+{
+       struct bln_dev *bdev = dev;
+
+       bdev->features = features;
+}
+
+static int init_vq(struct kvm *kvm, void *dev, u32 vq, u32 pfn)
+{
+       struct bln_dev *bdev = dev;
+       struct virt_queue *queue;
+       void *p;
+
+       compat__remove_message(compat_id);
+
+       queue           = &bdev->vqs[vq];
+       queue->pfn      = pfn;
+       p               = guest_pfn_to_host(kvm, queue->pfn);
+
+       thread_pool__init_job(&bdev->jobs[vq], kvm, virtio_bln_do_io, queue);
+       vring_init(&queue->vring, VIRTIO_BLN_QUEUE_SIZE, p, VIRTIO_PCI_VRING_ALIGN);
+
+       return 0;
+}
+
+static int notify_vq(struct kvm *kvm, void *dev, u32 vq)
+{
+       struct bln_dev *bdev = dev;
+
+       thread_pool__do_job(&bdev->jobs[vq]);
+
+       return 0;
+}
+
+static int get_pfn_vq(struct kvm *kvm, void *dev, u32 vq)
+{
+       struct bln_dev *bdev = dev;
+
+       return bdev->vqs[vq].pfn;
+}
+
+static int get_size_vq(struct kvm *kvm, void *dev, u32 vq)
+{
+       return VIRTIO_BLN_QUEUE_SIZE;
+}
+
+struct virtio_ops bln_dev_virtio_ops = (struct virtio_ops) {
+       .get_config             = get_config,
+       .get_host_features      = get_host_features,
+       .set_guest_features     = set_guest_features,
+       .init_vq                = init_vq,
+       .notify_vq              = notify_vq,
+       .get_pfn_vq             = get_pfn_vq,
+       .get_size_vq            = get_size_vq,
+};
+
+void virtio_bln__init(struct kvm *kvm)
+{
+       kvm_ipc__register_handler(KVM_IPC_BALLOON, handle_mem);
+       kvm_ipc__register_handler(KVM_IPC_STAT, virtio_bln__print_stats);
+
+       bdev.stat_waitfd        = eventfd(0, 0);
+       memset(&bdev.config, 0, sizeof(struct virtio_balloon_config));
+
+       virtio_init(kvm, &bdev, &bdev.vdev, &bln_dev_virtio_ops,
+                   VIRTIO_PCI, PCI_DEVICE_ID_VIRTIO_BLN, VIRTIO_ID_BALLOON, PCI_CLASS_BLN);
+
+       if (compat_id == -1)
+               compat_id = virtio_compat_add_message("virtio-balloon", "CONFIG_VIRTIO_BALLOON");
+}
diff --git a/tools/kvm/virtio/blk.c b/tools/kvm/virtio/blk.c
new file mode 100644 (file)
index 0000000..98f17a2
--- /dev/null
@@ -0,0 +1,313 @@
+#include "kvm/virtio-blk.h"
+
+#include "kvm/virtio-pci-dev.h"
+#include "kvm/disk-image.h"
+#include "kvm/mutex.h"
+#include "kvm/util.h"
+#include "kvm/kvm.h"
+#include "kvm/pci.h"
+#include "kvm/threadpool.h"
+#include "kvm/ioeventfd.h"
+#include "kvm/guest_compat.h"
+#include "kvm/virtio-pci.h"
+#include "kvm/virtio.h"
+
+#include <linux/virtio_ring.h>
+#include <linux/virtio_blk.h>
+#include <linux/kernel.h>
+#include <linux/list.h>
+#include <linux/types.h>
+#include <pthread.h>
+
+#define VIRTIO_BLK_MAX_DEV             4
+
+/*
+ * The request header and the status byte consume two descriptor entries.
+ */
+#define DISK_SEG_MAX                   (VIRTIO_BLK_QUEUE_SIZE - 2)
+#define VIRTIO_BLK_QUEUE_SIZE          256
+#define NUM_VIRT_QUEUES                        1
+
+struct blk_dev_req {
+       struct virt_queue               *vq;
+       struct blk_dev                  *bdev;
+       struct iovec                    iov[VIRTIO_BLK_QUEUE_SIZE];
+       u16                             out, in, head;
+       struct kvm                      *kvm;
+};
+
+struct blk_dev {
+       pthread_mutex_t                 mutex;
+
+       struct list_head                list;
+
+       struct virtio_device            vdev;
+       struct virtio_blk_config        blk_config;
+       struct disk_image               *disk;
+       u32                             features;
+
+       struct virt_queue               vqs[NUM_VIRT_QUEUES];
+       struct blk_dev_req              reqs[VIRTIO_BLK_QUEUE_SIZE];
+
+       pthread_t                       io_thread;
+       int                             io_efd;
+
+       struct kvm                      *kvm;
+};
+
+static LIST_HEAD(bdevs);
+static int compat_id = -1;
+
+void virtio_blk_complete(void *param, long len)
+{
+       struct blk_dev_req *req = param;
+       struct blk_dev *bdev = req->bdev;
+       int queueid = req->vq - bdev->vqs;
+       u8 *status;
+
+       /* The status byte lives in the last (device-writable) iovec */
+       status  = req->iov[req->out + req->in - 1].iov_base;
+       *status = (len < 0) ? VIRTIO_BLK_S_IOERR : VIRTIO_BLK_S_OK;
+
+       mutex_lock(&bdev->mutex);
+       virt_queue__set_used_elem(req->vq, req->head, len);
+       mutex_unlock(&bdev->mutex);
+
+       if (virtio_queue__should_signal(&bdev->vqs[queueid]))
+               bdev->vdev.ops->signal_vq(req->kvm, &bdev->vdev, queueid);
+}
+
+static void virtio_blk_do_io_request(struct kvm *kvm, struct blk_dev_req *req)
+{
+       struct virtio_blk_outhdr *req_hdr;
+       ssize_t block_cnt;
+       struct blk_dev *bdev;
+       struct iovec *iov;
+       u16 out, in;
+
+       block_cnt       = -1;
+       bdev            = req->bdev;
+       iov             = req->iov;
+       out             = req->out;
+       in              = req->in;
+       req_hdr         = iov[0].iov_base;
+
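+       /*
+        * iov[0] carries the request header and the final iovec the status
+        * byte, so "in + out - 2" is the number of data buffers.
+        */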
+       switch (req_hdr->type) {
+       case VIRTIO_BLK_T_IN:
+               block_cnt = disk_image__read(bdev->disk, req_hdr->sector,
+                               iov + 1, in + out - 2, req);
+               break;
+       case VIRTIO_BLK_T_OUT:
+               block_cnt = disk_image__write(bdev->disk, req_hdr->sector,
+                               iov + 1, in + out - 2, req);
+               break;
+       case VIRTIO_BLK_T_FLUSH:
+               block_cnt = disk_image__flush(bdev->disk);
+               virtio_blk_complete(req, block_cnt);
+               break;
+       case VIRTIO_BLK_T_GET_ID:
+               block_cnt = VIRTIO_BLK_ID_BYTES;
+               disk_image__get_serial(bdev->disk,
+                               (iov + 1)->iov_base, &block_cnt);
+               virtio_blk_complete(req, block_cnt);
+               break;
+       default:
+               pr_warning("request type %d", req_hdr->type);
+               block_cnt       = -1;
+               break;
+       }
+}
+
+static void virtio_blk_do_io(struct kvm *kvm, struct virt_queue *vq, struct blk_dev *bdev)
+{
+       struct blk_dev_req *req;
+       u16 head;
+
+       while (virt_queue__available(vq)) {
+               head            = virt_queue__pop(vq);
+               req             = &bdev->reqs[head];
+               req->head       = virt_queue__get_head_iov(vq, req->iov, &req->out,
+                                       &req->in, head, kvm);
+               req->vq         = vq;
+
+               virtio_blk_do_io_request(kvm, req);
+       }
+}
+
+static u8 *get_config(struct kvm *kvm, void *dev)
+{
+       struct blk_dev *bdev = dev;
+
+       return ((u8 *)(&bdev->blk_config));
+}
+
+static u32 get_host_features(struct kvm *kvm, void *dev)
+{
+       return  1UL << VIRTIO_BLK_F_SEG_MAX
+               | 1UL << VIRTIO_BLK_F_FLUSH
+               | 1UL << VIRTIO_RING_F_EVENT_IDX
+               | 1UL << VIRTIO_RING_F_INDIRECT_DESC;
+}
+
+static void set_guest_features(struct kvm *kvm, void *dev, u32 features)
+{
+       struct blk_dev *bdev = dev;
+
+       bdev->features = features;
+}
+
+static int init_vq(struct kvm *kvm, void *dev, u32 vq, u32 pfn)
+{
+       struct blk_dev *bdev = dev;
+       struct virt_queue *queue;
+       void *p;
+
+       compat__remove_message(compat_id);
+
+       queue           = &bdev->vqs[vq];
+       queue->pfn      = pfn;
+       p               = guest_pfn_to_host(kvm, queue->pfn);
+
+       vring_init(&queue->vring, VIRTIO_BLK_QUEUE_SIZE, p, VIRTIO_PCI_VRING_ALIGN);
+
+       return 0;
+}
+
+static void *virtio_blk_thread(void *dev)
+{
+       struct blk_dev *bdev = dev;
+       u64 data;
+       int r;
+
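+       /* Block on the eventfd written by notify_vq() and service queue 0 */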
+       while (1) {
+               r = read(bdev->io_efd, &data, sizeof(u64));
+               if (r < 0)
+                       continue;
+               virtio_blk_do_io(bdev->kvm, &bdev->vqs[0], bdev);
+       }
+
+       pthread_exit(NULL);
+       return NULL;
+}
+
+static int notify_vq(struct kvm *kvm, void *dev, u32 vq)
+{
+       struct blk_dev *bdev = dev;
+       u64 data = 1;
+       int r;
+
+       r = write(bdev->io_efd, &data, sizeof(data));
+       if (r < 0)
+               return r;
+
+       return 0;
+}
+
+static int get_pfn_vq(struct kvm *kvm, void *dev, u32 vq)
+{
+       struct blk_dev *bdev = dev;
+
+       return bdev->vqs[vq].pfn;
+}
+
+static int get_size_vq(struct kvm *kvm, void *dev, u32 vq)
+{
+       /* FIXME: dynamic */
+       return VIRTIO_BLK_QUEUE_SIZE;
+}
+
+static int set_size_vq(struct kvm *kvm, void *dev, u32 vq, int size)
+{
+       /* FIXME: dynamic */
+       return size;
+}
+
+static struct virtio_ops blk_dev_virtio_ops = (struct virtio_ops) {
+       .get_config             = get_config,
+       .get_host_features      = get_host_features,
+       .set_guest_features     = set_guest_features,
+       .init_vq                = init_vq,
+       .notify_vq              = notify_vq,
+       .get_pfn_vq             = get_pfn_vq,
+       .get_size_vq            = get_size_vq,
+       .set_size_vq            = set_size_vq,
+};
+
+static int virtio_blk__init_one(struct kvm *kvm, struct disk_image *disk)
+{
+       struct blk_dev *bdev;
+       unsigned int i;
+
+       if (!disk)
+               return -EINVAL;
+
+       bdev = calloc(1, sizeof(struct blk_dev));
+       if (bdev == NULL)
+               return -ENOMEM;
+
+       *bdev = (struct blk_dev) {
+               .mutex                  = PTHREAD_MUTEX_INITIALIZER,
+               .disk                   = disk,
+               .blk_config             = (struct virtio_blk_config) {
+                       .capacity       = disk->size / SECTOR_SIZE,
+                       .seg_max        = DISK_SEG_MAX,
+               },
+               .io_efd                 = eventfd(0, 0),
+               .kvm                    = kvm,
+       };
+
+       virtio_init(kvm, bdev, &bdev->vdev, &blk_dev_virtio_ops,
+                   VIRTIO_PCI, PCI_DEVICE_ID_VIRTIO_BLK, VIRTIO_ID_BLOCK, PCI_CLASS_BLK);
+
+       list_add_tail(&bdev->list, &bdevs);
+
+       for (i = 0; i < ARRAY_SIZE(bdev->reqs); i++) {
+               bdev->reqs[i].bdev = bdev;
+               bdev->reqs[i].kvm = kvm;
+       }
+
+       disk_image__set_callback(bdev->disk, virtio_blk_complete);
+
+       pthread_create(&bdev->io_thread, NULL, virtio_blk_thread, bdev);
+       if (compat_id == -1)
+               compat_id = virtio_compat_add_message("virtio-blk", "CONFIG_VIRTIO_BLK");
+
+       return 0;
+}
+
+static int virtio_blk__exit_one(struct kvm *kvm, struct blk_dev *bdev)
+{
+       list_del(&bdev->list);
+       free(bdev);
+
+       return 0;
+}
+
+int virtio_blk__init(struct kvm *kvm)
+{
+       int i, r = 0;
+
+       for (i = 0; i < kvm->nr_disks; i++) {
+               if (kvm->disks[i]->wwpn)
+                       continue;
+               r = virtio_blk__init_one(kvm, kvm->disks[i]);
+               if (r < 0)
+                       goto cleanup;
+       }
+
+       return 0;
+cleanup:
+       return virtio_blk__exit(kvm);
+}
+
+int virtio_blk__exit(struct kvm *kvm)
+{
+       while (!list_empty(&bdevs)) {
+               struct blk_dev *bdev;
+
+               bdev = list_first_entry(&bdevs, struct blk_dev, list);
+               virtio_blk__exit_one(kvm, bdev);
+       }
+
+       return 0;
+}
diff --git a/tools/kvm/virtio/console.c b/tools/kvm/virtio/console.c
new file mode 100644 (file)
index 0000000..e925a54
--- /dev/null
@@ -0,0 +1,185 @@
+#include "kvm/virtio-console.h"
+#include "kvm/virtio-pci-dev.h"
+#include "kvm/disk-image.h"
+#include "kvm/virtio.h"
+#include "kvm/ioport.h"
+#include "kvm/util.h"
+#include "kvm/term.h"
+#include "kvm/mutex.h"
+#include "kvm/kvm.h"
+#include "kvm/pci.h"
+#include "kvm/threadpool.h"
+#include "kvm/irq.h"
+#include "kvm/guest_compat.h"
+
+#include <linux/virtio_console.h>
+#include <linux/virtio_ring.h>
+#include <linux/virtio_blk.h>
+
+#include <sys/uio.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <termios.h>
+#include <unistd.h>
+#include <fcntl.h>
+
+#define VIRTIO_CONSOLE_QUEUE_SIZE      128
+#define VIRTIO_CONSOLE_NUM_QUEUES      2
+#define VIRTIO_CONSOLE_RX_QUEUE                0
+#define VIRTIO_CONSOLE_TX_QUEUE                1
+
+struct con_dev {
+       pthread_mutex_t                 mutex;
+
+       struct virtio_device            vdev;
+       struct virt_queue               vqs[VIRTIO_CONSOLE_NUM_QUEUES];
+       struct virtio_console_config    config;
+       u32                             features;
+
+       struct thread_pool__job         jobs[VIRTIO_CONSOLE_NUM_QUEUES];
+};
+
+static struct con_dev cdev = {
+       .mutex                          = PTHREAD_MUTEX_INITIALIZER,
+
+       .config = {
+               .cols                   = 80,
+               .rows                   = 24,
+               .max_nr_ports           = 1,
+       },
+};
+
+static int compat_id = -1;
+
+/*
+ * Interrupts are injected for hvc0 only.
+ */
+static void virtio_console__inject_interrupt_callback(struct kvm *kvm, void *param)
+{
+       struct iovec iov[VIRTIO_CONSOLE_QUEUE_SIZE];
+       struct virt_queue *vq;
+       u16 out, in;
+       u16 head;
+       int len;
+
+       mutex_lock(&cdev.mutex);
+
+       vq = param;
+
+       if (term_readable(CONSOLE_VIRTIO, 0) && virt_queue__available(vq)) {
+               head = virt_queue__get_iov(vq, iov, &out, &in, kvm);
+               len = term_getc_iov(CONSOLE_VIRTIO, iov, in, 0);
+               virt_queue__set_used_elem(vq, head, len);
+               cdev.vdev.ops->signal_vq(kvm, &cdev.vdev, vq - cdev.vqs);
+       }
+
+       mutex_unlock(&cdev.mutex);
+}
+
+void virtio_console__inject_interrupt(struct kvm *kvm)
+{
+       thread_pool__do_job(&cdev.jobs[VIRTIO_CONSOLE_RX_QUEUE]);
+}
+
+static void virtio_console_handle_callback(struct kvm *kvm, void *param)
+{
+       struct iovec iov[VIRTIO_CONSOLE_QUEUE_SIZE];
+       struct virt_queue *vq;
+       u16 out, in;
+       u16 head;
+       u32 len;
+
+       vq = param;
+
+       /*
+        * The current Linux implementation polls for the buffer
+        * to be used, rather than waiting for an interrupt.
+        * So there is no need to inject an interrupt for the tx path.
+        */
+
+       while (virt_queue__available(vq)) {
+               head = virt_queue__get_iov(vq, iov, &out, &in, kvm);
+               len = term_putc_iov(CONSOLE_VIRTIO, iov, out, 0);
+               virt_queue__set_used_elem(vq, head, len);
+       }
+}
+
+static u8 *get_config(struct kvm *kvm, void *dev)
+{
+       struct con_dev *cdev = dev;
+
+       return ((u8 *)(&cdev->config));
+}
+
+static u32 get_host_features(struct kvm *kvm, void *dev)
+{
+       return 0;
+}
+
+static void set_guest_features(struct kvm *kvm, void *dev, u32 features)
+{
+       /* Unused */
+}
+
+static int init_vq(struct kvm *kvm, void *dev, u32 vq, u32 pfn)
+{
+       struct virt_queue *queue;
+       void *p;
+
+       BUG_ON(vq >= VIRTIO_CONSOLE_NUM_QUEUES);
+
+       compat__remove_message(compat_id);
+
+       queue           = &cdev.vqs[vq];
+       queue->pfn      = pfn;
+       p               = guest_pfn_to_host(kvm, queue->pfn);
+
+       vring_init(&queue->vring, VIRTIO_CONSOLE_QUEUE_SIZE, p, VIRTIO_PCI_VRING_ALIGN);
+
+       if (vq == VIRTIO_CONSOLE_TX_QUEUE)
+               thread_pool__init_job(&cdev.jobs[vq], kvm, virtio_console_handle_callback, queue);
+       else if (vq == VIRTIO_CONSOLE_RX_QUEUE)
+               thread_pool__init_job(&cdev.jobs[vq], kvm, virtio_console__inject_interrupt_callback, queue);
+
+       return 0;
+}
+
+static int notify_vq(struct kvm *kvm, void *dev, u32 vq)
+{
+       struct con_dev *cdev = dev;
+
+       thread_pool__do_job(&cdev->jobs[vq]);
+
+       return 0;
+}
+
+static int get_pfn_vq(struct kvm *kvm, void *dev, u32 vq)
+{
+       struct con_dev *cdev = dev;
+
+       return cdev->vqs[vq].pfn;
+}
+
+static int get_size_vq(struct kvm *kvm, void *dev, u32 vq)
+{
+       return VIRTIO_CONSOLE_QUEUE_SIZE;
+}
+
+static struct virtio_ops con_dev_virtio_ops = (struct virtio_ops) {
+       .get_config             = get_config,
+       .get_host_features      = get_host_features,
+       .set_guest_features     = set_guest_features,
+       .init_vq                = init_vq,
+       .notify_vq              = notify_vq,
+       .get_pfn_vq             = get_pfn_vq,
+       .get_size_vq            = get_size_vq,
+};
+
+void virtio_console__init(struct kvm *kvm)
+{
+       virtio_init(kvm, &cdev, &cdev.vdev, &con_dev_virtio_ops,
+                   VIRTIO_PCI, PCI_DEVICE_ID_VIRTIO_CONSOLE, VIRTIO_ID_CONSOLE, PCI_CLASS_CONSOLE);
+       if (compat_id == -1)
+               compat_id = virtio_compat_add_message("virtio-console", "CONFIG_VIRTIO_CONSOLE");
+}
diff --git a/tools/kvm/virtio/core.c b/tools/kvm/virtio/core.c
new file mode 100644 (file)
index 0000000..2dfb828
--- /dev/null
@@ -0,0 +1,233 @@
+#include <linux/virtio_ring.h>
+#include <linux/types.h>
+#include <sys/uio.h>
+#include <stdlib.h>
+
+#include "kvm/guest_compat.h"
+#include "kvm/barrier.h"
+#include "kvm/virtio.h"
+#include "kvm/virtio-pci.h"
+#include "kvm/virtio-mmio.h"
+#include "kvm/util.h"
+#include "kvm/kvm.h"
+
+struct vring_used_elem *virt_queue__set_used_elem(struct virt_queue *queue, u32 head, u32 len)
+{
+       struct vring_used_elem *used_elem;
+
+       used_elem       = &queue->vring.used->ring[queue->vring.used->idx % queue->vring.num];
+       used_elem->id   = head;
+       used_elem->len  = len;
+
+       /*
+        * Use wmb to assure that used elem was updated with head and len.
+        * We need a wmb here since we can't advance idx unless we're ready
+        * to pass the used element to the guest.
+        */
+       wmb();
+       queue->vring.used->idx++;
+
+       /*
+        * Use wmb to assure used idx has been increased before we signal the guest.
+        * Without a wmb here the guest may ignore the queue since it won't see
+        * an updated idx.
+        */
+       wmb();
+
+       return used_elem;
+}
+
+/*
+ * Each buffer in the virtqueues is actually a chain of descriptors.  This
+ * function returns the next descriptor in the chain, or vq->vring.num if we're
+ * at the end.
+ */
+static unsigned next_desc(struct vring_desc *desc,
+                         unsigned int i, unsigned int max)
+{
+       unsigned int next;
+
+       /* If this descriptor says it doesn't chain, we're done. */
+       if (!(desc[i].flags & VRING_DESC_F_NEXT))
+               return max;
+
+       /* Check they're not leading us off the end of the descriptors. */
+       next = desc[i].next;
+       /* Make sure compiler knows to grab that: we don't want it changing! */
+       wmb();
+
+       return next;
+}
+
+u16 virt_queue__get_head_iov(struct virt_queue *vq, struct iovec iov[], u16 *out, u16 *in, u16 head, struct kvm *kvm)
+{
+       struct vring_desc *desc;
+       u16 idx;
+       u16 max;
+
+       idx = head;
+       *out = *in = 0;
+       max = vq->vring.num;
+       desc = vq->vring.desc;
+
+       if (desc[idx].flags & VRING_DESC_F_INDIRECT) {
+               max = desc[idx].len / sizeof(struct vring_desc);
+               desc = guest_flat_to_host(kvm, desc[idx].addr);
+               idx = 0;
+       }
+
+       do {
+               /* Grab the first descriptor, and check it's OK. */
+               iov[*out + *in].iov_len = desc[idx].len;
+               iov[*out + *in].iov_base = guest_flat_to_host(kvm, desc[idx].addr);
+               /* If this is an input descriptor, increment that count. */
+               if (desc[idx].flags & VRING_DESC_F_WRITE)
+                       (*in)++;
+               else
+                       (*out)++;
+       } while ((idx = next_desc(desc, idx, max)) != max);
+
+       return head;
+}
+
+u16 virt_queue__get_iov(struct virt_queue *vq, struct iovec iov[], u16 *out, u16 *in, struct kvm *kvm)
+{
+       u16 head;
+
+       head = virt_queue__pop(vq);
+
+       return virt_queue__get_head_iov(vq, iov, out, in, head, kvm);
+}
+
+/* in and out are relative to guest */
+u16 virt_queue__get_inout_iov(struct kvm *kvm, struct virt_queue *queue,
+                             struct iovec in_iov[], struct iovec out_iov[],
+                             u16 *in, u16 *out)
+{
+       struct vring_desc *desc;
+       u16 head, idx;
+
+       idx = head = virt_queue__pop(queue);
+       *out = *in = 0;
+       do {
+               desc = virt_queue__get_desc(queue, idx);
+               if (desc->flags & VRING_DESC_F_WRITE) {
+                       in_iov[*in].iov_base = guest_flat_to_host(kvm,
+                                                                 desc->addr);
+                       in_iov[*in].iov_len = desc->len;
+                       (*in)++;
+               } else {
+                       out_iov[*out].iov_base = guest_flat_to_host(kvm,
+                                                                   desc->addr);
+                       out_iov[*out].iov_len = desc->len;
+                       (*out)++;
+               }
+               if (desc->flags & VRING_DESC_F_NEXT)
+                       idx = desc->next;
+               else
+                       break;
+       } while (1);
+
+       return head;
+}
+
+int virtio__get_dev_specific_field(int offset, bool msix, u32 *config_off)
+{
+       if (msix) {
+               if (offset < 4)
+                       return VIRTIO_PCI_O_MSIX;
+               else
+                       offset -= 4;
+       }
+
+       *config_off = offset;
+
+       return VIRTIO_PCI_O_CONFIG;
+}
+
+bool virtio_queue__should_signal(struct virt_queue *vq)
+{
+       u16 old_idx, new_idx, event_idx;
+
+       old_idx         = vq->last_used_signalled;
+       new_idx         = vq->vring.used->idx;
+       event_idx       = vring_used_event(&vq->vring);
+
+       if (vring_need_event(event_idx, new_idx, old_idx)) {
+               vq->last_used_signalled = new_idx;
+               return true;
+       }
+
+       return false;
+}
+
+int virtio_init(struct kvm *kvm, void *dev, struct virtio_device *vdev,
+               struct virtio_ops *ops, enum virtio_trans trans,
+               int device_id, int subsys_id, int class)
+{
+       void *virtio;
+
+       switch (trans) {
+       case VIRTIO_PCI:
+               virtio = calloc(1, sizeof(struct virtio_pci));
+               if (!virtio)
+                       return -ENOMEM;
+               vdev->virtio                    = virtio;
+               vdev->ops                       = ops;
+               vdev->ops->signal_vq            = virtio_pci__signal_vq;
+               vdev->ops->signal_config        = virtio_pci__signal_config;
+               vdev->ops->init                 = virtio_pci__init;
+               vdev->ops->exit                 = virtio_pci__exit;
+               vdev->ops->init(kvm, dev, vdev, device_id, subsys_id, class);
+               break;
+       case VIRTIO_MMIO:
+               virtio = calloc(1, sizeof(struct virtio_mmio));
+               if (!virtio)
+                       return -ENOMEM;
+               vdev->virtio                    = virtio;
+               vdev->ops                       = ops;
+               vdev->ops->signal_vq            = virtio_mmio_signal_vq;
+               vdev->ops->signal_config        = virtio_mmio_signal_config;
+               vdev->ops->init                 = virtio_mmio_init;
+               vdev->ops->exit                 = virtio_mmio_exit;
+               vdev->ops->init(kvm, dev, vdev, device_id, subsys_id, class);
+               break;
+       default:
+               return -1;
+       }
+
+       return 0;
+}
+
+int virtio_compat_add_message(const char *device, const char *config)
+{
+       int len = 1024;
+       int compat_id;
+       char *title;
+       char *desc;
+
+       title = malloc(len);
+       if (!title)
+               return -ENOMEM;
+
+       desc = malloc(len);
+       if (!desc) {
+               free(title);
+               return -ENOMEM;
+       }
+
+       snprintf(title, len, "%s device was not detected.", device);
+       snprintf(desc,  len, "Although you requested a %s device, "
+                            "the guest kernel did not initialize it.\n"
+                            "\tPlease make sure that the guest kernel was "
+                            "compiled with %s=y enabled in .config.",
+                            device, config);
+
+       compat_id = compat__add_message(title, desc);
+
+       free(desc);
+       free(title);
+
+       return compat_id;
+}
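
The helpers above compose into the same service loop in every backend: pop available descriptor chains into an iovec array, perform the device-specific I/O, publish the result on the used ring with the barriers shown in virt_queue__set_used_elem(), and interrupt the guest only when the event-idx check asks for it. A hedged sketch, where vq, vq_idx, QUEUE_SIZE, do_io() and the surrounding kvm/vdev variables are placeholders:

    struct iovec iov[QUEUE_SIZE];
    u16 out, in;

    while (virt_queue__available(vq)) {
            u16 head = virt_queue__get_iov(vq, iov, &out, &in, kvm);
            int len  = do_io(iov, out, in);           /* device-specific work */

            virt_queue__set_used_elem(vq, head, len); /* publish + barriers */

            if (virtio_queue__should_signal(vq))      /* VIRTIO_RING_F_EVENT_IDX */
                    vdev->ops->signal_vq(kvm, vdev, vq_idx);
    }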
diff --git a/tools/kvm/virtio/mmio.c b/tools/kvm/virtio/mmio.c
new file mode 100644 (file)
index 0000000..6ec33ec
--- /dev/null
@@ -0,0 +1,264 @@
+#include "kvm/virtio-mmio.h"
+#include "kvm/ioeventfd.h"
+#include "kvm/ioport.h"
+#include "kvm/virtio.h"
+#include "kvm/kvm.h"
+#include "kvm/irq.h"
+
+#include <linux/virtio_mmio.h>
+#include <string.h>
+
+static u32 virtio_mmio_io_space_blocks = KVM_VIRTIO_MMIO_AREA;
+
+static u32 virtio_mmio_get_io_space_block(u32 size)
+{
+       u32 block = virtio_mmio_io_space_blocks;
+       virtio_mmio_io_space_blocks += size;
+
+       return block;
+}
+
+static void virtio_mmio_ioevent_callback(struct kvm *kvm, void *param)
+{
+       struct virtio_mmio_ioevent_param *ioeventfd = param;
+       struct virtio_mmio *vmmio = ioeventfd->vdev->virtio;
+
+       ioeventfd->vdev->ops->notify_vq(kvm, vmmio->dev, ioeventfd->vq);
+}
+
+static int virtio_mmio_init_ioeventfd(struct kvm *kvm,
+                                     struct virtio_device *vdev, u32 vq)
+{
+       struct virtio_mmio *vmmio = vdev->virtio;
+       struct ioevent ioevent;
+       int err;
+
+       vmmio->ioeventfds[vq] = (struct virtio_mmio_ioevent_param) {
+               .vdev           = vdev,
+               .vq             = vq,
+       };
+
+       ioevent = (struct ioevent) {
+               .io_addr        = vmmio->addr + VIRTIO_MMIO_QUEUE_NOTIFY,
+               .io_len         = sizeof(u32),
+               .fn             = virtio_mmio_ioevent_callback,
+               .fn_ptr         = &vmmio->ioeventfds[vq],
+               .datamatch      = vq,
+               .fn_kvm         = kvm,
+               .fd             = eventfd(0, 0),
+       };
+
+       if (vdev->use_vhost)
+               /*
+                * Vhost will poll the eventfd in host kernel side,
+                * no need to poll in userspace.
+                */
+               err = ioeventfd__add_event(&ioevent, true, false);
+       else
+               /* Need to poll in userspace. */
+               err = ioeventfd__add_event(&ioevent, true, true);
+       if (err)
+               return err;
+
+       if (vdev->ops->notify_vq_eventfd)
+               vdev->ops->notify_vq_eventfd(kvm, vmmio->dev, vq, ioevent.fd);
+
+       return 0;
+}
+
+int virtio_mmio_signal_vq(struct kvm *kvm, struct virtio_device *vdev, u32 vq)
+{
+       struct virtio_mmio *vmmio = vdev->virtio;
+
+       vmmio->hdr.interrupt_state |= VIRTIO_MMIO_INT_VRING;
+       kvm__irq_trigger(vmmio->kvm, vmmio->irq);
+
+       return 0;
+}
+
+int virtio_mmio_signal_config(struct kvm *kvm, struct virtio_device *vdev)
+{
+       struct virtio_mmio *vmmio = vdev->virtio;
+
+       vmmio->hdr.interrupt_state |= VIRTIO_MMIO_INT_CONFIG;
+       kvm__irq_trigger(vmmio->kvm, vmmio->irq);
+
+       return 0;
+}
+
+static void virtio_mmio_device_specific(u64 addr, u8 *data, u32 len,
+                                       u8 is_write, struct virtio_device *vdev)
+{
+       struct virtio_mmio *vmmio = vdev->virtio;
+       u32 i;
+
+       for (i = 0; i < len; i++) {
+               if (is_write)
+                       vdev->ops->get_config(vmmio->kvm, vmmio->dev)[addr + i] =
+                                             data[i];
+               else
+                       data[i] = vdev->ops->get_config(vmmio->kvm,
+                                                       vmmio->dev)[addr + i];
+       }
+}
+
+static void virtio_mmio_config_in(u64 addr, void *data, u32 len,
+                                 struct virtio_device *vdev)
+{
+       struct virtio_mmio *vmmio = vdev->virtio;
+       u32 val = 0;
+
+       switch (addr) {
+       case VIRTIO_MMIO_MAGIC_VALUE:
+       case VIRTIO_MMIO_VERSION:
+       case VIRTIO_MMIO_DEVICE_ID:
+       case VIRTIO_MMIO_VENDOR_ID:
+       case VIRTIO_MMIO_STATUS:
+       case VIRTIO_MMIO_INTERRUPT_STATUS:
+               ioport__write32(data, *(u32 *)(((void *)&vmmio->hdr) + addr));
+               break;
+       case VIRTIO_MMIO_HOST_FEATURES:
+               if (vmmio->hdr.host_features_sel == 0)
+                       val = vdev->ops->get_host_features(vmmio->kvm,
+                                                          vmmio->dev);
+               ioport__write32(data, val);
+               break;
+       case VIRTIO_MMIO_QUEUE_PFN:
+               val = vdev->ops->get_pfn_vq(vmmio->kvm, vmmio->dev,
+                                           vmmio->hdr.queue_sel);
+               ioport__write32(data, val);
+               break;
+       case VIRTIO_MMIO_QUEUE_NUM_MAX:
+               val = vdev->ops->get_size_vq(vmmio->kvm, vmmio->dev,
+                                            vmmio->hdr.queue_sel);
+               ioport__write32(data, val);
+               break;
+       default:
+               break;
+       }
+}
+
+static void virtio_mmio_config_out(u64 addr, void *data, u32 len,
+                                  struct virtio_device *vdev)
+{
+       struct virtio_mmio *vmmio = vdev->virtio;
+       u32 val = 0;
+
+       switch (addr) {
+       case VIRTIO_MMIO_HOST_FEATURES_SEL:
+       case VIRTIO_MMIO_GUEST_FEATURES_SEL:
+       case VIRTIO_MMIO_QUEUE_SEL:
+       case VIRTIO_MMIO_STATUS:
+               val = ioport__read32(data);
+               *(u32 *)(((void *)&vmmio->hdr) + addr) = val;
+               break;
+       case VIRTIO_MMIO_GUEST_FEATURES:
+               if (vmmio->hdr.guest_features_sel == 0) {
+                       val = ioport__read32(data);
+                       vdev->ops->set_guest_features(vmmio->kvm,
+                                                     vmmio->dev, val);
+               }
+               break;
+       case VIRTIO_MMIO_GUEST_PAGE_SIZE:
+               val = ioport__read32(data);
+               vmmio->hdr.guest_page_size = val;
+               /* FIXME: set guest page size */
+               break;
+       case VIRTIO_MMIO_QUEUE_NUM:
+               val = ioport__read32(data);
+               vmmio->hdr.queue_num = val;
+               /* FIXME: set vq size */
+               vdev->ops->set_size_vq(vmmio->kvm, vmmio->dev,
+                                      vmmio->hdr.queue_sel, val);
+               break;
+       case VIRTIO_MMIO_QUEUE_ALIGN:
+               val = ioport__read32(data);
+               vmmio->hdr.queue_align = val;
+               /* FIXME: set used ring alignment */
+               break;
+       case VIRTIO_MMIO_QUEUE_PFN:
+               val = ioport__read32(data);
+               virtio_mmio_init_ioeventfd(vmmio->kvm, vdev, vmmio->hdr.queue_sel);
+               vdev->ops->init_vq(vmmio->kvm, vmmio->dev,
+                                  vmmio->hdr.queue_sel, val);
+               break;
+       case VIRTIO_MMIO_QUEUE_NOTIFY:
+               val = ioport__read32(data);
+               vdev->ops->notify_vq(vmmio->kvm, vmmio->dev, val);
+               break;
+       case VIRTIO_MMIO_INTERRUPT_ACK:
+               val = ioport__read32(data);
+               vmmio->hdr.interrupt_state &= ~val;
+               break;
+       default:
+               break;
+       }
+}
+
+static void virtio_mmio_mmio_callback(u64 addr, u8 *data, u32 len,
+                                     u8 is_write, void *ptr)
+{
+       struct virtio_device *vdev = ptr;
+       struct virtio_mmio *vmmio = vdev->virtio;
+       u32 offset = addr - vmmio->addr;
+
+       if (offset >= VIRTIO_MMIO_CONFIG) {
+               offset -= VIRTIO_MMIO_CONFIG;
+               virtio_mmio_device_specific(offset, data, len, is_write, ptr);
+               return;
+       }
+
+       if (is_write)
+               virtio_mmio_config_out(offset, data, len, ptr);
+       else
+               virtio_mmio_config_in(offset, data, len, ptr);
+}
+
+int virtio_mmio_init(struct kvm *kvm, void *dev, struct virtio_device *vdev,
+                    int device_id, int subsys_id, int class)
+{
+       struct virtio_mmio *vmmio = vdev->virtio;
+       u8 device, pin, line;
+
+       vmmio->addr     = virtio_mmio_get_io_space_block(VIRTIO_MMIO_IO_SIZE);
+       vmmio->kvm      = kvm;
+       vmmio->dev      = dev;
+
+       kvm__register_mmio(kvm, vmmio->addr, VIRTIO_MMIO_IO_SIZE,
+                          false, virtio_mmio_mmio_callback, vdev);
+
+       vmmio->hdr = (struct virtio_mmio_hdr) {
+               .magic          = {'v', 'i', 'r', 't'},
+               .version        = 1,
+               .device_id      = device_id - 0x1000 + 1,
+               .vendor_id      = 0x4d564b4c, /* 'LKVM' */
+               .queue_num_max  = 256,
+       };
+
+       if (irq__register_device(subsys_id, &device, &pin, &line) < 0)
+               return -1;
+       vmmio->irq = line;
+
+       /*
+        * Instantiate guest virtio-mmio devices using kernel command line
+        * (or module) parameter, e.g
+        *
+        * virtio_mmio.devices=0x200@0xd2000000:5,0x200@0xd2000200:6
+        */
+       pr_info("virtio_mmio.devices=0x%x@0x%x:%d\n", VIRTIO_MMIO_IO_SIZE, vmmio->addr, line);
+
+       return 0;
+}
+
+int virtio_mmio_exit(struct kvm *kvm, struct virtio_device *vdev)
+{
+       struct virtio_mmio *vmmio = vdev->virtio;
+       int i;
+
+       kvm__deregister_mmio(kvm, vmmio->addr);
+
+       for (i = 0; i < VIRTIO_MMIO_MAX_VQ; i++)
+               ioeventfd__del_event(vmmio->addr + VIRTIO_MMIO_QUEUE_NOTIFY, i);
+
+       return 0;
+}
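
Each MMIO device claims one VIRTIO_MMIO_IO_SIZE window from the bump allocator starting at KVM_VIRTIO_MMIO_AREA: offsets below VIRTIO_MMIO_CONFIG land in the generic header handled by virtio_mmio_config_in()/..._out(), and everything above is forwarded to the device's own config space. The window base for the n-th device is therefore simple arithmetic (a sketch; the 0xd2000000 base is the illustrative value from the comment above, not a guaranteed constant):

    static u32 virtio_mmio_base(unsigned int n)
    {
            /* With VIRTIO_MMIO_IO_SIZE == 0x200 and an area base of
             * 0xd2000000 this yields 0xd2000000, 0xd2000200, and so on,
             * matching the virtio_mmio.devices= example quoted above. */
            return KVM_VIRTIO_MMIO_AREA + n * VIRTIO_MMIO_IO_SIZE;
    }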
diff --git a/tools/kvm/virtio/net.c b/tools/kvm/virtio/net.c
new file mode 100644 (file)
index 0000000..25bc3a4
--- /dev/null
@@ -0,0 +1,546 @@
+#include "kvm/virtio-pci-dev.h"
+#include "kvm/virtio-net.h"
+#include "kvm/virtio.h"
+#include "kvm/types.h"
+#include "kvm/mutex.h"
+#include "kvm/util.h"
+#include "kvm/kvm.h"
+#include "kvm/irq.h"
+#include "kvm/uip.h"
+#include "kvm/guest_compat.h"
+
+#include <linux/vhost.h>
+#include <linux/virtio_net.h>
+#include <linux/if_tun.h>
+#include <linux/types.h>
+
+#include <arpa/inet.h>
+#include <net/if.h>
+
+#include <unistd.h>
+#include <fcntl.h>
+
+#include <sys/socket.h>
+#include <sys/ioctl.h>
+#include <sys/types.h>
+#include <sys/wait.h>
+#include <sys/eventfd.h>
+
+#define VIRTIO_NET_QUEUE_SIZE          256
+#define VIRTIO_NET_NUM_QUEUES          2
+#define VIRTIO_NET_RX_QUEUE            0
+#define VIRTIO_NET_TX_QUEUE            1
+
+struct net_dev;
+
+extern struct kvm *kvm;
+
+struct net_dev_operations {
+       int (*rx)(struct iovec *iov, u16 in, struct net_dev *ndev);
+       int (*tx)(struct iovec *iov, u16 in, struct net_dev *ndev);
+};
+
+struct net_dev {
+       pthread_mutex_t                 mutex;
+       struct virtio_device            vdev;
+       struct list_head                list;
+
+       struct virt_queue               vqs[VIRTIO_NET_NUM_QUEUES];
+       struct virtio_net_config        config;
+       u32                             features;
+
+       pthread_t                       io_rx_thread;
+       pthread_mutex_t                 io_rx_lock;
+       pthread_cond_t                  io_rx_cond;
+
+       pthread_t                       io_tx_thread;
+       pthread_mutex_t                 io_tx_lock;
+       pthread_cond_t                  io_tx_cond;
+
+       int                             vhost_fd;
+       int                             tap_fd;
+       char                            tap_name[IFNAMSIZ];
+
+       int                             mode;
+
+       struct uip_info                 info;
+       struct net_dev_operations       *ops;
+       struct kvm                      *kvm;
+};
+
+static LIST_HEAD(ndevs);
+static int compat_id = -1;
+
+static void *virtio_net_rx_thread(void *p)
+{
+       struct iovec iov[VIRTIO_NET_QUEUE_SIZE];
+       struct virt_queue *vq;
+       struct kvm *kvm;
+       struct net_dev *ndev = p;
+       u16 out, in;
+       u16 head;
+       int len;
+
+       kvm = ndev->kvm;
+       vq = &ndev->vqs[VIRTIO_NET_RX_QUEUE];
+
+       while (1) {
+               mutex_lock(&ndev->io_rx_lock);
+               if (!virt_queue__available(vq))
+                       pthread_cond_wait(&ndev->io_rx_cond, &ndev->io_rx_lock);
+               mutex_unlock(&ndev->io_rx_lock);
+
+               while (virt_queue__available(vq)) {
+                       head = virt_queue__get_iov(vq, iov, &out, &in, kvm);
+                       len = ndev->ops->rx(iov, in, ndev);
+                       virt_queue__set_used_elem(vq, head, len);
+
+                       /* We should interrupt guest right now, otherwise latency is huge. */
+                       if (virtio_queue__should_signal(&ndev->vqs[VIRTIO_NET_RX_QUEUE]))
+                               ndev->vdev.ops->signal_vq(kvm, &ndev->vdev,
+                                                          VIRTIO_NET_RX_QUEUE);
+               }
+       }
+
+       pthread_exit(NULL);
+       return NULL;
+}
+
+static void *virtio_net_tx_thread(void *p)
+{
+       struct iovec iov[VIRTIO_NET_QUEUE_SIZE];
+       struct virt_queue *vq;
+       struct kvm *kvm;
+       struct net_dev *ndev = p;
+       u16 out, in;
+       u16 head;
+       int len;
+
+       kvm = ndev->kvm;
+       vq = &ndev->vqs[VIRTIO_NET_TX_QUEUE];
+
+       while (1) {
+               mutex_lock(&ndev->io_tx_lock);
+               if (!virt_queue__available(vq))
+                       pthread_cond_wait(&ndev->io_tx_cond, &ndev->io_tx_lock);
+               mutex_unlock(&ndev->io_tx_lock);
+
+               while (virt_queue__available(vq)) {
+                       head = virt_queue__get_iov(vq, iov, &out, &in, kvm);
+                       len = ndev->ops->tx(iov, out, ndev);
+                       virt_queue__set_used_elem(vq, head, len);
+               }
+
+               if (virtio_queue__should_signal(&ndev->vqs[VIRTIO_NET_TX_QUEUE]))
+                       ndev->vdev.ops->signal_vq(kvm, &ndev->vdev, VIRTIO_NET_TX_QUEUE);
+       }
+
+       pthread_exit(NULL);
+       return NULL;
+}
+
+static void virtio_net_handle_callback(struct kvm *kvm, struct net_dev *ndev, int queue)
+{
+       switch (queue) {
+       case VIRTIO_NET_TX_QUEUE:
+               mutex_lock(&ndev->io_tx_lock);
+               pthread_cond_signal(&ndev->io_tx_cond);
+               mutex_unlock(&ndev->io_tx_lock);
+               break;
+       case VIRTIO_NET_RX_QUEUE:
+               mutex_lock(&ndev->io_rx_lock);
+               pthread_cond_signal(&ndev->io_rx_cond);
+               mutex_unlock(&ndev->io_rx_lock);
+               break;
+       default:
+               pr_warning("Unknown queue index %d", queue);
+       }
+}
+
+static bool virtio_net__tap_init(const struct virtio_net_params *params,
+                                       struct net_dev *ndev)
+{
+       int sock = socket(AF_INET, SOCK_STREAM, 0);
+       int pid, status, offload, hdr_len;
+       struct sockaddr_in sin = {0};
+       struct ifreq ifr;
+
+       /* Did the user already give us an FD? */
+       if (params->fd) {
+               ndev->tap_fd = params->fd;
+               return true;
+       }
+
+       ndev->tap_fd = open("/dev/net/tun", O_RDWR);
+       if (ndev->tap_fd < 0) {
+               pr_warning("Unable to open /dev/net/tun");
+               goto fail;
+       }
+
+       memset(&ifr, 0, sizeof(ifr));
+       ifr.ifr_flags = IFF_TAP | IFF_NO_PI | IFF_VNET_HDR;
+       if (ioctl(ndev->tap_fd, TUNSETIFF, &ifr) < 0) {
+               pr_warning("Config tap device error. Are you root?");
+               goto fail;
+       }
+
+       strncpy(ndev->tap_name, ifr.ifr_name, sizeof(ndev->tap_name));
+
+       if (ioctl(ndev->tap_fd, TUNSETNOCSUM, 1) < 0) {
+               pr_warning("Config tap device TUNSETNOCSUM error");
+               goto fail;
+       }
+
+       hdr_len = sizeof(struct virtio_net_hdr);
+       if (ioctl(ndev->tap_fd, TUNSETVNETHDRSZ, &hdr_len) < 0)
+               pr_warning("Config tap device TUNSETVNETHDRSZ error");
+
+       offload = TUN_F_CSUM | TUN_F_TSO4 | TUN_F_TSO6 | TUN_F_UFO;
+       if (ioctl(ndev->tap_fd, TUNSETOFFLOAD, offload) < 0) {
+               pr_warning("Config tap device TUNSETOFFLOAD error");
+               goto fail;
+       }
+
+       if (strcmp(params->script, "none")) {
+               pid = fork();
+               if (pid == 0) {
+                       execl(params->script, params->script, ndev->tap_name, NULL);
+                       _exit(1);
+               } else {
+                       waitpid(pid, &status, 0);
+                       if (WIFEXITED(status) && WEXITSTATUS(status) != 0) {
+                               pr_warning("Failed to set up tap with %s", params->script);
+                               goto fail;
+                       }
+               }
+       } else {
+               memset(&ifr, 0, sizeof(ifr));
+               strncpy(ifr.ifr_name, ndev->tap_name, sizeof(ndev->tap_name));
+               sin.sin_addr.s_addr = inet_addr(params->host_ip);
+               memcpy(&(ifr.ifr_addr), &sin, sizeof(ifr.ifr_addr));
+               ifr.ifr_addr.sa_family = AF_INET;
+               if (ioctl(sock, SIOCSIFADDR, &ifr) < 0) {
+                       pr_warning("Could not set ip address on tap device");
+                       goto fail;
+               }
+       }
+
+       memset(&ifr, 0, sizeof(ifr));
+       strncpy(ifr.ifr_name, ndev->tap_name, sizeof(ndev->tap_name));
+       ioctl(sock, SIOCGIFFLAGS, &ifr);
+       ifr.ifr_flags |= IFF_UP | IFF_RUNNING;
+       if (ioctl(sock, SIOCSIFFLAGS, &ifr) < 0)
+               pr_warning("Could not bring tap device up");
+
+       close(sock);
+
+       return true;
+
+fail:
+       if (sock >= 0)
+               close(sock);
+       if (ndev->tap_fd >= 0)
+               close(ndev->tap_fd);
+
+       return false;
+}
+
+static void virtio_net__io_thread_init(struct kvm *kvm, struct net_dev *ndev)
+{
+       pthread_mutex_init(&ndev->io_tx_lock, NULL);
+       pthread_mutex_init(&ndev->io_rx_lock, NULL);
+
+       pthread_cond_init(&ndev->io_tx_cond, NULL);
+       pthread_cond_init(&ndev->io_rx_cond, NULL);
+
+       pthread_create(&ndev->io_tx_thread, NULL, virtio_net_tx_thread, ndev);
+       pthread_create(&ndev->io_rx_thread, NULL, virtio_net_rx_thread, ndev);
+}
+
+static inline int tap_ops_tx(struct iovec *iov, u16 out, struct net_dev *ndev)
+{
+       return writev(ndev->tap_fd, iov, out);
+}
+
+static inline int tap_ops_rx(struct iovec *iov, u16 in, struct net_dev *ndev)
+{
+       return readv(ndev->tap_fd, iov, in);
+}
+
+static inline int uip_ops_tx(struct iovec *iov, u16 out, struct net_dev *ndev)
+{
+       return uip_tx(iov, out, &ndev->info);
+}
+
+static inline int uip_ops_rx(struct iovec *iov, u16 in, struct net_dev *ndev)
+{
+       return uip_rx(iov, in, &ndev->info);
+}
+
+static struct net_dev_operations tap_ops = {
+       .rx     = tap_ops_rx,
+       .tx     = tap_ops_tx,
+};
+
+static struct net_dev_operations uip_ops = {
+       .rx     = uip_ops_rx,
+       .tx     = uip_ops_tx,
+};
+
+static u8 *get_config(struct kvm *kvm, void *dev)
+{
+       struct net_dev *ndev = dev;
+
+       return ((u8 *)(&ndev->config));
+}
+
+static u32 get_host_features(struct kvm *kvm, void *dev)
+{
+       return 1UL << VIRTIO_NET_F_MAC
+               | 1UL << VIRTIO_NET_F_CSUM
+               | 1UL << VIRTIO_NET_F_HOST_UFO
+               | 1UL << VIRTIO_NET_F_HOST_TSO4
+               | 1UL << VIRTIO_NET_F_HOST_TSO6
+               | 1UL << VIRTIO_NET_F_GUEST_UFO
+               | 1UL << VIRTIO_NET_F_GUEST_TSO4
+               | 1UL << VIRTIO_NET_F_GUEST_TSO6
+               | 1UL << VIRTIO_RING_F_EVENT_IDX
+               | 1UL << VIRTIO_RING_F_INDIRECT_DESC;
+}
+
+static void set_guest_features(struct kvm *kvm, void *dev, u32 features)
+{
+       struct net_dev *ndev = dev;
+
+       ndev->features = features;
+}
+
+static int init_vq(struct kvm *kvm, void *dev, u32 vq, u32 pfn)
+{
+       struct vhost_vring_state state = { .index = vq };
+       struct vhost_vring_addr addr;
+       struct net_dev *ndev = dev;
+       struct virt_queue *queue;
+       void *p;
+       int r;
+
+       compat__remove_message(compat_id);
+
+       queue           = &ndev->vqs[vq];
+       queue->pfn      = pfn;
+       p               = guest_pfn_to_host(kvm, queue->pfn);
+
+       /* FIXME: respect pci and mmio vring alignment */
+       vring_init(&queue->vring, VIRTIO_NET_QUEUE_SIZE, p, VIRTIO_PCI_VRING_ALIGN);
+
+       if (ndev->vhost_fd == 0)
+               return 0;
+
+       state.num = queue->vring.num;
+       r = ioctl(ndev->vhost_fd, VHOST_SET_VRING_NUM, &state);
+       if (r < 0)
+               die_perror("VHOST_SET_VRING_NUM failed");
+       state.num = 0;
+       r = ioctl(ndev->vhost_fd, VHOST_SET_VRING_BASE, &state);
+       if (r < 0)
+               die_perror("VHOST_SET_VRING_BASE failed");
+
+       addr = (struct vhost_vring_addr) {
+               .index = vq,
+               .desc_user_addr = (u64)(unsigned long)queue->vring.desc,
+               .avail_user_addr = (u64)(unsigned long)queue->vring.avail,
+               .used_user_addr = (u64)(unsigned long)queue->vring.used,
+       };
+
+       r = ioctl(ndev->vhost_fd, VHOST_SET_VRING_ADDR, &addr);
+       if (r < 0)
+               die_perror("VHOST_SET_VRING_ADDR failed");
+
+       return 0;
+}
+
+static void notify_vq_gsi(struct kvm *kvm, void *dev, u32 vq, u32 gsi)
+{
+       struct net_dev *ndev = dev;
+       struct kvm_irqfd irq;
+       struct vhost_vring_file file;
+       int r;
+
+       if (ndev->vhost_fd == 0)
+               return;
+
+       irq = (struct kvm_irqfd) {
+               .gsi    = gsi,
+               .fd     = eventfd(0, 0),
+       };
+       file = (struct vhost_vring_file) {
+               .index  = vq,
+               .fd     = irq.fd,
+       };
+
+       r = ioctl(kvm->vm_fd, KVM_IRQFD, &irq);
+       if (r < 0)
+               die_perror("KVM_IRQFD failed");
+
+       r = ioctl(ndev->vhost_fd, VHOST_SET_VRING_CALL, &file);
+       if (r < 0)
+               die_perror("VHOST_SET_VRING_CALL failed");
+       file.fd = ndev->tap_fd;
+       r = ioctl(ndev->vhost_fd, VHOST_NET_SET_BACKEND, &file);
+       if (r != 0)
+               die("VHOST_NET_SET_BACKEND failed %d", errno);
+
+}
+
+static void notify_vq_eventfd(struct kvm *kvm, void *dev, u32 vq, u32 efd)
+{
+       struct net_dev *ndev = dev;
+       struct vhost_vring_file file = {
+               .index  = vq,
+               .fd     = efd,
+       };
+       int r;
+
+       if (ndev->vhost_fd == 0)
+               return;
+
+       r = ioctl(ndev->vhost_fd, VHOST_SET_VRING_KICK, &file);
+       if (r < 0)
+               die_perror("VHOST_SET_VRING_KICK failed");
+}
+
+static int notify_vq(struct kvm *kvm, void *dev, u32 vq)
+{
+       struct net_dev *ndev = dev;
+
+       virtio_net_handle_callback(kvm, ndev, vq);
+
+       return 0;
+}
+
+static int get_pfn_vq(struct kvm *kvm, void *dev, u32 vq)
+{
+       struct net_dev *ndev = dev;
+
+       return ndev->vqs[vq].pfn;
+}
+
+static int get_size_vq(struct kvm *kvm, void *dev, u32 vq)
+{
+       /* FIXME: dynamic */
+       return VIRTIO_NET_QUEUE_SIZE;
+}
+
+static int set_size_vq(struct kvm *kvm, void *dev, u32 vq, int size)
+{
+       /* FIXME: dynamic */
+       return size;
+}
+
+static struct virtio_ops net_dev_virtio_ops = (struct virtio_ops) {
+       .get_config             = get_config,
+       .get_host_features      = get_host_features,
+       .set_guest_features     = set_guest_features,
+       .init_vq                = init_vq,
+       .get_pfn_vq             = get_pfn_vq,
+       .get_size_vq            = get_size_vq,
+       .set_size_vq            = set_size_vq,
+       .notify_vq              = notify_vq,
+       .notify_vq_gsi          = notify_vq_gsi,
+       .notify_vq_eventfd      = notify_vq_eventfd,
+};
+
+static void virtio_net__vhost_init(struct kvm *kvm, struct net_dev *ndev)
+{
+       u64 features = 1UL << VIRTIO_RING_F_EVENT_IDX;
+       struct vhost_memory *mem;
+       int r;
+
+       ndev->vhost_fd = open("/dev/vhost-net", O_RDWR);
+       if (ndev->vhost_fd < 0)
+               die_perror("Failed opening vhost-net device");
+
+       mem = calloc(1, sizeof(*mem) + sizeof(struct vhost_memory_region));
+       if (mem == NULL)
+               die("Failed allocating memory for vhost memory map");
+
+       mem->nregions = 1;
+       mem->regions[0] = (struct vhost_memory_region) {
+               .guest_phys_addr        = 0,
+               .memory_size            = kvm->ram_size,
+               .userspace_addr         = (unsigned long)kvm->ram_start,
+       };
+
+       r = ioctl(ndev->vhost_fd, VHOST_SET_OWNER);
+       if (r != 0)
+               die_perror("VHOST_SET_OWNER failed");
+
+       r = ioctl(ndev->vhost_fd, VHOST_SET_FEATURES, &features);
+       if (r != 0)
+               die_perror("VHOST_SET_FEATURES failed");
+       r = ioctl(ndev->vhost_fd, VHOST_SET_MEM_TABLE, mem);
+       if (r != 0)
+               die_perror("VHOST_SET_MEM_TABLE failed");
+
+       ndev->vdev.use_vhost = true;
+
+       free(mem);
+}
+
+void virtio_net__init(const struct virtio_net_params *params)
+{
+       int i;
+       struct net_dev *ndev;
+
+       if (!params)
+               return;
+
+       ndev = calloc(1, sizeof(struct net_dev));
+       if (ndev == NULL)
+               die("Failed allocating ndev");
+
+       list_add_tail(&ndev->list, &ndevs);
+
+       ndev->kvm = params->kvm;
+
+       mutex_init(&ndev->mutex);
+       ndev->config.status = VIRTIO_NET_S_LINK_UP;
+
+       for (i = 0 ; i < 6 ; i++) {
+               ndev->config.mac[i]             = params->guest_mac[i];
+               ndev->info.guest_mac.addr[i]    = params->guest_mac[i];
+               ndev->info.host_mac.addr[i]     = params->host_mac[i];
+       }
+
+       ndev->mode = params->mode;
+       if (ndev->mode == NET_MODE_TAP) {
+               if (!virtio_net__tap_init(params, ndev))
+                       die_perror("You have requested a TAP device, but creation of one has failed because");
+               ndev->ops = &tap_ops;
+       } else {
+               ndev->info.host_ip              = ntohl(inet_addr(params->host_ip));
+               ndev->info.guest_ip             = ntohl(inet_addr(params->guest_ip));
+               ndev->info.guest_netmask        = ntohl(inet_addr("255.255.255.0"));
+               ndev->info.buf_nr               = 20;
+               uip_init(&ndev->info);
+               ndev->ops = &uip_ops;
+       }
+
+       if (params->trans && strcmp(params->trans, "mmio") == 0)
+               virtio_init(kvm, ndev, &ndev->vdev, &net_dev_virtio_ops,
+                           VIRTIO_MMIO, PCI_DEVICE_ID_VIRTIO_NET, VIRTIO_ID_NET, PCI_CLASS_NET);
+       else
+               virtio_init(kvm, ndev, &ndev->vdev, &net_dev_virtio_ops,
+                           VIRTIO_PCI, PCI_DEVICE_ID_VIRTIO_NET, VIRTIO_ID_NET, PCI_CLASS_NET);
+
+       if (params->vhost)
+               virtio_net__vhost_init(params->kvm, ndev);
+       else
+               virtio_net__io_thread_init(params->kvm, ndev);
+
+       if (compat_id == -1)
+               compat_id = virtio_compat_add_message("virtio-net", "CONFIG_VIRTIO_NET");
+}
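
The net backend runs in one of two modes: without vhost, the TX/RX pthread pair above moves frames with readv()/writev() on the tap fd; with vhost, the rings are handed to /dev/vhost-net and userspace only plumbs eventfds. Condensed from init_vq(), notify_vq_eventfd(), notify_vq_gsi() and virtio_net__vhost_init() above, the bring-up order is (sketch; error handling and struct setup elided):

    int vhost_fd = open("/dev/vhost-net", O_RDWR);

    ioctl(vhost_fd, VHOST_SET_OWNER);                  /* claim the fd        */
    ioctl(vhost_fd, VHOST_SET_FEATURES, &features);    /* negotiate           */
    ioctl(vhost_fd, VHOST_SET_MEM_TABLE, mem);         /* guest memory map    */

    /* then, per ring, once the guest has written QUEUE_PFN: */
    ioctl(vhost_fd, VHOST_SET_VRING_NUM,  &state);     /* ring size           */
    ioctl(vhost_fd, VHOST_SET_VRING_BASE, &state);     /* start index         */
    ioctl(vhost_fd, VHOST_SET_VRING_ADDR, &addr);      /* desc/avail/used     */
    ioctl(vhost_fd, VHOST_SET_VRING_KICK, &kick);      /* guest->host eventfd */
    ioctl(vhost_fd, VHOST_SET_VRING_CALL, &call);      /* host->guest irqfd   */
    ioctl(vhost_fd, VHOST_NET_SET_BACKEND, &backend);  /* attach the tap fd   */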
diff --git a/tools/kvm/virtio/pci.c b/tools/kvm/virtio/pci.c
new file mode 100644 (file)
index 0000000..81f95ae
--- /dev/null
@@ -0,0 +1,398 @@
+#include "kvm/virtio-pci.h"
+
+#include "kvm/ioport.h"
+#include "kvm/kvm.h"
+#include "kvm/virtio-pci-dev.h"
+#include "kvm/irq.h"
+#include "kvm/virtio.h"
+#include "kvm/ioeventfd.h"
+
+#include <sys/ioctl.h>
+#include <linux/virtio_pci.h>
+#include <linux/byteorder.h>
+#include <string.h>
+
+static void virtio_pci__ioevent_callback(struct kvm *kvm, void *param)
+{
+       struct virtio_pci_ioevent_param *ioeventfd = param;
+       struct virtio_pci *vpci = ioeventfd->vdev->virtio;
+
+       ioeventfd->vdev->ops->notify_vq(kvm, vpci->dev, ioeventfd->vq);
+}
+
+static int virtio_pci__init_ioeventfd(struct kvm *kvm, struct virtio_device *vdev, u32 vq)
+{
+       struct ioevent ioevent;
+       struct virtio_pci *vpci = vdev->virtio;
+       int r;
+
+       vpci->ioeventfds[vq] = (struct virtio_pci_ioevent_param) {
+               .vdev           = vdev,
+               .vq             = vq,
+       };
+
+       ioevent = (struct ioevent) {
+               .io_addr        = vpci->base_addr + VIRTIO_PCI_QUEUE_NOTIFY,
+               .io_len         = sizeof(u16),
+               .fn             = virtio_pci__ioevent_callback,
+               .fn_ptr         = &vpci->ioeventfds[vq],
+               .datamatch      = vq,
+               .fn_kvm         = kvm,
+               .fd             = eventfd(0, 0),
+       };
+
+       if (vdev->use_vhost)
+               /*
+                * Vhost will poll the eventfd in host kernel side,
+                * no need to poll in userspace.
+                */
+               r = ioeventfd__add_event(&ioevent, true, false);
+       else
+               /* Need to poll in userspace. */
+               r = ioeventfd__add_event(&ioevent, true, true);
+       if (r)
+               return r;
+
+       if (vdev->ops->notify_vq_eventfd)
+               vdev->ops->notify_vq_eventfd(kvm, vpci->dev, vq, ioevent.fd);
+
+       return 0;
+}
+
+static inline bool virtio_pci__msix_enabled(struct virtio_pci *vpci)
+{
+       return vpci->pci_hdr.msix.ctrl & cpu_to_le16(PCI_MSIX_FLAGS_ENABLE);
+}
+
+static bool virtio_pci__specific_io_in(struct kvm *kvm, struct virtio_device *vdev, u16 port,
+                                       void *data, int size, int offset)
+{
+       u32 config_offset;
+       struct virtio_pci *vpci = vdev->virtio;
+       int type = virtio__get_dev_specific_field(offset - 20,
+                                                       virtio_pci__msix_enabled(vpci),
+                                                       &config_offset);
+       if (type == VIRTIO_PCI_O_MSIX) {
+               switch (offset) {
+               case VIRTIO_MSI_CONFIG_VECTOR:
+                       ioport__write16(data, vpci->config_vector);
+                       break;
+               case VIRTIO_MSI_QUEUE_VECTOR:
+                       ioport__write16(data, vpci->vq_vector[vpci->queue_selector]);
+                       break;
+               }
+
+               return true;
+       } else if (type == VIRTIO_PCI_O_CONFIG) {
+               u8 cfg;
+
+               cfg = vdev->ops->get_config(kvm, vpci->dev)[config_offset];
+               ioport__write8(data, cfg);
+               return true;
+       }
+
+       return false;
+}
+
+static bool virtio_pci__io_in(struct ioport *ioport, struct kvm *kvm, u16 port, void *data, int size)
+{
+       unsigned long offset;
+       bool ret = true;
+       struct virtio_device *vdev;
+       struct virtio_pci *vpci;
+       u32 val;
+
+       vdev = ioport->priv;
+       vpci = vdev->virtio;
+       offset = port - vpci->base_addr;
+
+       switch (offset) {
+       case VIRTIO_PCI_HOST_FEATURES:
+               val = vdev->ops->get_host_features(kvm, vpci->dev);
+               ioport__write32(data, val);
+               break;
+       case VIRTIO_PCI_QUEUE_PFN:
+               val = vdev->ops->get_pfn_vq(kvm, vpci->dev, vpci->queue_selector);
+               ioport__write32(data, val);
+               break;
+       case VIRTIO_PCI_QUEUE_NUM:
+               val = vdev->ops->get_size_vq(kvm, vpci->dev, vpci->queue_selector);
+               ioport__write16(data, val);
+               break;
+       case VIRTIO_PCI_STATUS:
+               ioport__write8(data, vpci->status);
+               break;
+       case VIRTIO_PCI_ISR:
+               ioport__write8(data, vpci->isr);
+               kvm__irq_line(kvm, vpci->pci_hdr.irq_line, VIRTIO_IRQ_LOW);
+               vpci->isr = VIRTIO_IRQ_LOW;
+               break;
+       default:
+               ret = virtio_pci__specific_io_in(kvm, vdev, port, data, size, offset);
+               break;
+       }
+
+       return ret;
+}
+
+static bool virtio_pci__specific_io_out(struct kvm *kvm, struct virtio_device *vdev, u16 port,
+                                       void *data, int size, int offset)
+{
+       struct virtio_pci *vpci = vdev->virtio;
+       u32 config_offset, gsi, vec;
+       int type = virtio__get_dev_specific_field(offset - 20, virtio_pci__msix_enabled(vpci),
+                                                       &config_offset);
+       if (type == VIRTIO_PCI_O_MSIX) {
+               switch (offset) {
+               case VIRTIO_MSI_CONFIG_VECTOR:
+                       vec = vpci->config_vector = ioport__read16(data);
+
+                       gsi = irq__add_msix_route(kvm, &vpci->msix_table[vec].msg);
+
+                       vpci->config_gsi = gsi;
+                       break;
+               case VIRTIO_MSI_QUEUE_VECTOR:
+                       vec = vpci->vq_vector[vpci->queue_selector] = ioport__read16(data);
+
+                       gsi = irq__add_msix_route(kvm, &vpci->msix_table[vec].msg);
+                       vpci->gsis[vpci->queue_selector] = gsi;
+                       if (vdev->ops->notify_vq_gsi)
+                               vdev->ops->notify_vq_gsi(kvm, vpci->dev,
+                                                       vpci->queue_selector, gsi);
+                       break;
+               }
+
+               return true;
+       } else if (type == VIRTIO_PCI_O_CONFIG) {
+               vdev->ops->get_config(kvm, vpci->dev)[config_offset] = *(u8 *)data;
+
+               return true;
+       }
+
+       return false;
+}
+
+static bool virtio_pci__io_out(struct ioport *ioport, struct kvm *kvm, u16 port, void *data, int size)
+{
+       unsigned long offset;
+       bool ret = true;
+       struct virtio_device *vdev;
+       struct virtio_pci *vpci;
+       u32 val;
+
+       vdev = ioport->priv;
+       vpci = vdev->virtio;
+       offset = port - vpci->base_addr;
+
+       switch (offset) {
+       case VIRTIO_PCI_GUEST_FEATURES:
+               val = ioport__read32(data);
+               vdev->ops->set_guest_features(kvm, vpci->dev, val);
+               break;
+       case VIRTIO_PCI_QUEUE_PFN:
+               val = ioport__read32(data);
+               virtio_pci__init_ioeventfd(kvm, vdev, vpci->queue_selector);
+               vdev->ops->init_vq(kvm, vpci->dev, vpci->queue_selector, val);
+               break;
+       case VIRTIO_PCI_QUEUE_SEL:
+               vpci->queue_selector = ioport__read16(data);
+               break;
+       case VIRTIO_PCI_QUEUE_NOTIFY:
+               val = ioport__read16(data);
+               vdev->ops->notify_vq(kvm, vpci->dev, val);
+               break;
+       case VIRTIO_PCI_STATUS:
+               vpci->status = ioport__read8(data);
+               break;
+       default:
+               ret = virtio_pci__specific_io_out(kvm, vdev, port, data, size, offset);
+               break;
+       }
+
+       return ret;
+}
+
+static struct ioport_operations virtio_pci__io_ops = {
+       .io_in  = virtio_pci__io_in,
+       .io_out = virtio_pci__io_out,
+};
+
+static void virtio_pci__mmio_callback(u64 addr, u8 *data, u32 len, u8 is_write, void *ptr)
+{
+       struct virtio_pci *vpci = ptr;
+       void *table;
+       u32 offset;
+
+       if (addr > vpci->msix_io_block + PCI_IO_SIZE) {
+               table   = &vpci->msix_pba;
+               offset  = vpci->msix_io_block + PCI_IO_SIZE;
+       } else {
+               table   = &vpci->msix_table;
+               offset  = vpci->msix_io_block;
+       }
+
+       if (is_write)
+               memcpy(table + addr - offset, data, len);
+       else
+               memcpy(data, table + addr - offset, len);
+}
+
+static void virtio_pci__signal_msi(struct kvm *kvm, struct virtio_pci *vpci, int vec)
+{
+       struct kvm_msi msi = {
+               .address_lo = vpci->msix_table[vec].msg.address_lo,
+               .address_hi = vpci->msix_table[vec].msg.address_hi,
+               .data = vpci->msix_table[vec].msg.data,
+       };
+
+       ioctl(kvm->vm_fd, KVM_SIGNAL_MSI, &msi);
+}
+
+int virtio_pci__signal_vq(struct kvm *kvm, struct virtio_device *vdev, u32 vq)
+{
+       struct virtio_pci *vpci = vdev->virtio;
+       int tbl = vpci->vq_vector[vq];
+
+       if (virtio_pci__msix_enabled(vpci)) {
+               if (vpci->pci_hdr.msix.ctrl & cpu_to_le16(PCI_MSIX_FLAGS_MASKALL) ||
+                   vpci->msix_table[tbl].ctrl & cpu_to_le16(PCI_MSIX_ENTRY_CTRL_MASKBIT)) {
+
+                       vpci->msix_pba |= 1 << tbl;
+                       return 0;
+               }
+
+               if (vpci->features & VIRTIO_PCI_F_SIGNAL_MSI)
+                       virtio_pci__signal_msi(kvm, vpci, vpci->vq_vector[vq]);
+               else
+                       kvm__irq_trigger(kvm, vpci->gsis[vq]);
+       } else {
+               vpci->isr = VIRTIO_IRQ_HIGH;
+               kvm__irq_trigger(kvm, vpci->pci_hdr.irq_line);
+       }
+       return 0;
+}
+
+int virtio_pci__signal_config(struct kvm *kvm, struct virtio_device *vdev)
+{
+       struct virtio_pci *vpci = vdev->virtio;
+       int tbl = vpci->config_vector;
+
+       if (virtio_pci__msix_enabled(vpci)) {
+               if (vpci->pci_hdr.msix.ctrl & cpu_to_le16(PCI_MSIX_FLAGS_MASKALL) ||
+                   vpci->msix_table[tbl].ctrl & cpu_to_le16(PCI_MSIX_ENTRY_CTRL_MASKBIT)) {
+
+                       vpci->msix_pba |= 1 << tbl;
+                       return 0;
+               }
+
+               if (vpci->features & VIRTIO_PCI_F_SIGNAL_MSI)
+                       virtio_pci__signal_msi(kvm, vpci, vpci->vq_vector[vpci->config_vector]);
+               else
+                       kvm__irq_trigger(kvm, vpci->config_gsi);
+       } else {
+               vpci->isr = VIRTIO_PCI_ISR_CONFIG;
+               kvm__irq_trigger(kvm, vpci->pci_hdr.irq_line);
+       }
+
+       return 0;
+}
+
+int virtio_pci__init(struct kvm *kvm, void *dev, struct virtio_device *vdev,
+                    int device_id, int subsys_id, int class)
+{
+       struct virtio_pci *vpci = vdev->virtio;
+       u8 pin, line, ndev;
+       int r;
+
+       vpci->dev = dev;
+       vpci->msix_io_block = pci_get_io_space_block(PCI_IO_SIZE * 2);
+
+       r = ioport__register(IOPORT_EMPTY, &virtio_pci__io_ops, IOPORT_SIZE, vdev);
+       if (r < 0)
+               return r;
+
+       vpci->base_addr = (u16)r;
+       r = kvm__register_mmio(kvm, vpci->msix_io_block, PCI_IO_SIZE, false,
+                              virtio_pci__mmio_callback, vpci);
+       if (r < 0)
+               goto free_ioport;
+
+       vpci->pci_hdr = (struct pci_device_header) {
+               .vendor_id              = cpu_to_le16(PCI_VENDOR_ID_REDHAT_QUMRANET),
+               .device_id              = cpu_to_le16(device_id),
+               .header_type            = PCI_HEADER_TYPE_NORMAL,
+               .revision_id            = 0,
+               .class[0]               = class & 0xff,
+               .class[1]               = (class >> 8) & 0xff,
+               .class[2]               = (class >> 16) & 0xff,
+               .subsys_vendor_id       = cpu_to_le16(PCI_SUBSYSTEM_VENDOR_ID_REDHAT_QUMRANET),
+               .subsys_id              = cpu_to_le16(subsys_id),
+               .bar[0]                 = cpu_to_le32(vpci->base_addr
+                                                       | PCI_BASE_ADDRESS_SPACE_IO),
+               .bar[1]                 = cpu_to_le32(vpci->msix_io_block
+                                                       | PCI_BASE_ADDRESS_SPACE_MEMORY),
+               .status                 = cpu_to_le16(PCI_STATUS_CAP_LIST),
+               .capabilities           = (void *)&vpci->pci_hdr.msix - (void *)&vpci->pci_hdr,
+               .bar_size[0]            = IOPORT_SIZE,
+               .bar_size[1]            = PCI_IO_SIZE,
+               .bar_size[3]            = PCI_IO_SIZE,
+       };
+
+       vpci->pci_hdr.msix.cap = PCI_CAP_ID_MSIX;
+       vpci->pci_hdr.msix.next = 0;
+       /*
+        * We have at most VIRTIO_PCI_MAX_VQ entries for the virt queues,
+        * plus VIRTIO_PCI_MAX_CONFIG entries for config.
+        *
+        * To quote the PCI spec:
+        *
+        * System software reads this field to determine the
+        * MSI-X Table Size N, which is encoded as N-1.
+        * For example, a returned value of "00000000011"
+        * indicates a table size of 4.
+        */
+       vpci->pci_hdr.msix.ctrl = cpu_to_le16(VIRTIO_PCI_MAX_VQ + VIRTIO_PCI_MAX_CONFIG - 1);
+
+       /*
+        * Both the table and the PBA could be mapped onto the same BAR,
+        * but for now we are not short of BARs
+        */
+       vpci->pci_hdr.msix.table_offset = cpu_to_le32(1); /* Use BAR 1 */
+       vpci->pci_hdr.msix.pba_offset = cpu_to_le32(1 | PCI_IO_SIZE); /* Use BAR 3 */
+       vpci->config_vector = 0;
+
+       r = irq__register_device(subsys_id, &ndev, &pin, &line);
+       if (r < 0)
+               goto free_mmio;
+
+       if (kvm__supports_extension(kvm, KVM_CAP_SIGNAL_MSI))
+               vpci->features |= VIRTIO_PCI_F_SIGNAL_MSI;
+
+       vpci->pci_hdr.irq_pin   = pin;
+       vpci->pci_hdr.irq_line  = line;
+       r = pci__register(&vpci->pci_hdr, ndev);
+       if (r < 0)
+               goto free_ioport;
+
+       return 0;
+
+free_mmio:
+       kvm__deregister_mmio(kvm, vpci->msix_io_block);
+free_ioport:
+       ioport__unregister(vpci->base_addr);
+       return r;
+}
+
+int virtio_pci__exit(struct kvm *kvm, struct virtio_device *vdev)
+{
+       struct virtio_pci *vpci = vdev->virtio;
+       int i;
+
+       kvm__deregister_mmio(kvm, vpci->msix_io_block);
+       ioport__unregister(vpci->base_addr);
+
+       for (i = 0; i < VIRTIO_PCI_MAX_VQ; i++)
+               ioeventfd__del_event(vpci->base_addr + VIRTIO_PCI_QUEUE_NOTIFY, i);
+
+       return 0;
+}
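
Interrupt delivery in the PCI transport picks, per event, the cheapest path the guest has enabled: a direct KVM_SIGNAL_MSI injection when the host supports it, an irqfd-backed GSI otherwise, and legacy INTx with ISR status as the fallback; masked vectors are parked in the pending bit array instead. The MSI-X Table Size field written above deserves a worked example, since it encodes N-1 (sketch; 0x7ff is the standard 11-bit Table Size mask from the PCI spec):

    static u16 msix_table_entries(u16 msg_ctrl)
    {
            /* Writing VIRTIO_PCI_MAX_VQ + VIRTIO_PCI_MAX_CONFIG - 1 to
             * the control register above therefore advertises exactly
             * MAX_VQ + MAX_CONFIG vectors. */
            return (msg_ctrl & 0x7ff) + 1;
    }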
diff --git a/tools/kvm/virtio/rng.c b/tools/kvm/virtio/rng.c
new file mode 100644 (file)
index 0000000..2b1ab39
--- /dev/null
@@ -0,0 +1,188 @@
+#include "kvm/virtio-rng.h"
+
+#include "kvm/virtio-pci-dev.h"
+
+#include "kvm/virtio.h"
+#include "kvm/util.h"
+#include "kvm/kvm.h"
+#include "kvm/threadpool.h"
+#include "kvm/guest_compat.h"
+
+#include <linux/virtio_ring.h>
+#include <linux/virtio_rng.h>
+
+#include <linux/list.h>
+#include <fcntl.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <pthread.h>
+#include <linux/kernel.h>
+
+#define NUM_VIRT_QUEUES                1
+#define VIRTIO_RNG_QUEUE_SIZE  128
+
+struct rng_dev_job {
+       struct virt_queue       *vq;
+       struct rng_dev          *rdev;
+       struct thread_pool__job job_id;
+};
+
+struct rng_dev {
+       struct list_head        list;
+       struct virtio_device    vdev;
+
+       int                     fd;
+
+       /* virtio queue */
+       struct virt_queue       vqs[NUM_VIRT_QUEUES];
+       struct rng_dev_job      jobs[NUM_VIRT_QUEUES];
+};
+
+static LIST_HEAD(rdevs);
+static int compat_id = -1;
+
+static u8 *get_config(struct kvm *kvm, void *dev)
+{
+       /* Unused */
+       return NULL;
+}
+
+static u32 get_host_features(struct kvm *kvm, void *dev)
+{
+       /* Unused */
+       return 0;
+}
+
+static void set_guest_features(struct kvm *kvm, void *dev, u32 features)
+{
+       /* Unused */
+}
+
+static bool virtio_rng_do_io_request(struct kvm *kvm, struct rng_dev *rdev, struct virt_queue *queue)
+{
+       struct iovec iov[VIRTIO_RNG_QUEUE_SIZE];
+       unsigned int len = 0;
+       u16 out, in, head;
+
+       head    = virt_queue__get_iov(queue, iov, &out, &in, kvm);
+       len     = readv(rdev->fd, iov, in);
+
+       virt_queue__set_used_elem(queue, head, len);
+
+       return true;
+}
+
+static void virtio_rng_do_io(struct kvm *kvm, void *param)
+{
+       struct rng_dev_job *job = param;
+       struct virt_queue *vq   = job->vq;
+       struct rng_dev *rdev    = job->rdev;
+
+       while (virt_queue__available(vq))
+               virtio_rng_do_io_request(kvm, rdev, vq);
+
+       rdev->vdev.ops->signal_vq(kvm, &rdev->vdev, vq - rdev->vqs);
+}
+
+static int init_vq(struct kvm *kvm, void *dev, u32 vq, u32 pfn)
+{
+       struct rng_dev *rdev = dev;
+       struct virt_queue *queue;
+       struct rng_dev_job *job;
+       void *p;
+
+       compat__remove_message(compat_id);
+
+       queue           = &rdev->vqs[vq];
+       queue->pfn      = pfn;
+       p               = guest_pfn_to_host(kvm, queue->pfn);
+
+       job = &rdev->jobs[vq];
+
+       vring_init(&queue->vring, VIRTIO_RNG_QUEUE_SIZE, p, VIRTIO_PCI_VRING_ALIGN);
+
+       *job = (struct rng_dev_job) {
+               .vq     = queue,
+               .rdev   = rdev,
+       };
+
+       thread_pool__init_job(&job->job_id, kvm, virtio_rng_do_io, job);
+
+       return 0;
+}
+
+static int notify_vq(struct kvm *kvm, void *dev, u32 vq)
+{
+       struct rng_dev *rdev = dev;
+
+       thread_pool__do_job(&rdev->jobs[vq].job_id);
+
+       return 0;
+}
+
+static int get_pfn_vq(struct kvm *kvm, void *dev, u32 vq)
+{
+       struct rng_dev *rdev = dev;
+
+       return rdev->vqs[vq].pfn;
+}
+
+static int get_size_vq(struct kvm *kvm, void *dev, u32 vq)
+{
+       return VIRTIO_RNG_QUEUE_SIZE;
+}
+
+static struct virtio_ops rng_dev_virtio_ops = (struct virtio_ops) {
+       .get_config             = get_config,
+       .get_host_features      = get_host_features,
+       .set_guest_features     = set_guest_features,
+       .init_vq                = init_vq,
+       .notify_vq              = notify_vq,
+       .get_pfn_vq             = get_pfn_vq,
+       .get_size_vq            = get_size_vq,
+};
+
+int virtio_rng__init(struct kvm *kvm)
+{
+       struct rng_dev *rdev;
+       int r;
+
+       rdev = malloc(sizeof(*rdev));
+       if (rdev == NULL)
+               return -ENOMEM;
+
+       rdev->fd = open("/dev/urandom", O_RDONLY);
+       if (rdev->fd < 0) {
+               r = rdev->fd;
+               goto cleanup;
+       }
+
+       r = virtio_init(kvm, rdev, &rdev->vdev, &rng_dev_virtio_ops,
+                       VIRTIO_PCI, PCI_DEVICE_ID_VIRTIO_RNG, VIRTIO_ID_RNG, PCI_CLASS_RNG);
+       if (r < 0)
+               goto cleanup;
+
+       list_add_tail(&rdev->list, &rdevs);
+
+       if (compat_id == -1)
+               compat_id = virtio_compat_add_message("virtio-rng", "CONFIG_HW_RANDOM_VIRTIO");
+       return 0;
+cleanup:
+       if (rdev->fd >= 0)
+               close(rdev->fd);
+       free(rdev);
+
+       return r;
+}
+
+int virtio_rng__exit(struct kvm *kvm)
+{
+       struct rng_dev *rdev, *tmp;
+
+       list_for_each_entry_safe(rdev, tmp, &rdevs, list) {
+               list_del(&rdev->list);
+               rdev->vdev.ops->exit(kvm, &rdev->vdev);
+               free(rdev);
+       }
+
+       return 0;
+}
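
virtio-rng is the smallest complete backend in the series: a single queue whose buffers are all guest-writable, each chain served by one readv() from the host entropy fd. A standalone sketch of that single scattered read, with local arrays standing in for guest memory:

    #include <sys/uio.h>
    #include <fcntl.h>
    #include <unistd.h>

    int main(void)
    {
            char a[16], b[32];
            struct iovec iov[2] = {
                    { .iov_base = a, .iov_len = sizeof(a) },
                    { .iov_base = b, .iov_len = sizeof(b) },
            };
            int fd = open("/dev/urandom", O_RDONLY);
            ssize_t len;

            if (fd < 0)
                    return 1;
            len = readv(fd, iov, 2);   /* fills both buffers: 48 bytes */
            close(fd);

            return len == (ssize_t)(sizeof(a) + sizeof(b)) ? 0 : 1;
    }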
diff --git a/tools/kvm/virtio/scsi.c b/tools/kvm/virtio/scsi.c
new file mode 100644 (file)
index 0000000..c445f08
--- /dev/null
@@ -0,0 +1,307 @@
+#include "kvm/virtio-scsi.h"
+#include "kvm/virtio-pci-dev.h"
+#include "kvm/disk-image.h"
+#include "kvm/kvm.h"
+#include "kvm/pci.h"
+#include "kvm/ioeventfd.h"
+#include "kvm/guest_compat.h"
+#include "kvm/virtio-pci.h"
+#include "kvm/virtio.h"
+
+#include <linux/kernel.h>
+#include <linux/virtio_scsi.h>
+#include <linux/vhost.h>
+
+#define VIRTIO_SCSI_QUEUE_SIZE         128
+#define NUM_VIRT_QUEUES                        3
+
+static LIST_HEAD(sdevs);
+static int compat_id = -1;
+
+struct scsi_dev {
+       struct virt_queue               vqs[NUM_VIRT_QUEUES];
+       struct virtio_scsi_config       config;
+       struct vhost_scsi_target        target;
+       u32                             features;
+       int                             vhost_fd;
+       struct virtio_device            vdev;
+       struct list_head                list;
+       struct kvm                      *kvm;
+};
+
+static u8 *get_config(struct kvm *kvm, void *dev)
+{
+       struct scsi_dev *sdev = dev;
+
+       return ((u8 *)(&sdev->config));
+}
+
+static u32 get_host_features(struct kvm *kvm, void *dev)
+{
+       return  1UL << VIRTIO_RING_F_EVENT_IDX |
+               1UL << VIRTIO_RING_F_INDIRECT_DESC;
+}
+
+static void set_guest_features(struct kvm *kvm, void *dev, u32 features)
+{
+       struct scsi_dev *sdev = dev;
+
+       sdev->features = features;
+}
+
+static int init_vq(struct kvm *kvm, void *dev, u32 vq, u32 pfn)
+{
+       struct vhost_vring_state state = { .index = vq };
+       struct vhost_vring_addr addr;
+       struct scsi_dev *sdev = dev;
+       struct virt_queue *queue;
+       void *p;
+       int r;
+
+       compat__remove_message(compat_id);
+
+       queue           = &sdev->vqs[vq];
+       queue->pfn      = pfn;
+       p               = guest_pfn_to_host(kvm, queue->pfn);
+
+       vring_init(&queue->vring, VIRTIO_SCSI_QUEUE_SIZE, p, VIRTIO_PCI_VRING_ALIGN);
+
+       if (sdev->vhost_fd == 0)
+               return 0;
+
+       state.num = queue->vring.num;
+       r = ioctl(sdev->vhost_fd, VHOST_SET_VRING_NUM, &state);
+       if (r < 0)
+               die_perror("VHOST_SET_VRING_NUM failed");
+       state.num = 0;
+       r = ioctl(sdev->vhost_fd, VHOST_SET_VRING_BASE, &state);
+       if (r < 0)
+               die_perror("VHOST_SET_VRING_BASE failed");
+
+       addr = (struct vhost_vring_addr) {
+               .index = vq,
+               .desc_user_addr = (u64)(unsigned long)queue->vring.desc,
+               .avail_user_addr = (u64)(unsigned long)queue->vring.avail,
+               .used_user_addr = (u64)(unsigned long)queue->vring.used,
+       };
+
+       r = ioctl(sdev->vhost_fd, VHOST_SET_VRING_ADDR, &addr);
+       if (r < 0)
+               die_perror("VHOST_SET_VRING_ADDR failed");
+
+       return 0;
+}
+
+static void notify_vq_gsi(struct kvm *kvm, void *dev, u32 vq, u32 gsi)
+{
+       struct vhost_vring_file file;
+       struct scsi_dev *sdev = dev;
+       struct kvm_irqfd irq;
+       int r;
+
+       if (sdev->vhost_fd == 0)
+               return;
+
+       irq = (struct kvm_irqfd) {
+               .gsi    = gsi,
+               .fd     = eventfd(0, 0),
+       };
+       file = (struct vhost_vring_file) {
+               .index  = vq,
+               .fd     = irq.fd,
+       };
+
+       r = ioctl(kvm->vm_fd, KVM_IRQFD, &irq);
+       if (r < 0)
+               die_perror("KVM_IRQFD failed");
+
+       r = ioctl(sdev->vhost_fd, VHOST_SET_VRING_CALL, &file);
+       if (r < 0)
+               die_perror("VHOST_SET_VRING_CALL failed");
+
+       if (vq > 0)
+               return;
+
+       r = ioctl(sdev->vhost_fd, VHOST_SCSI_SET_ENDPOINT, &sdev->target);
+       if (r != 0)
+               die("VHOST_SCSI_SET_ENDPOINT failed %d", errno);
+}
+
+static void notify_vq_eventfd(struct kvm *kvm, void *dev, u32 vq, u32 efd)
+{
+       struct scsi_dev *sdev = dev;
+       struct vhost_vring_file file = {
+               .index  = vq,
+               .fd     = efd,
+       };
+       int r;
+
+       if (sdev->vhost_fd == 0)
+               return;
+
+       r = ioctl(sdev->vhost_fd, VHOST_SET_VRING_KICK, &file);
+       if (r < 0)
+               die_perror("VHOST_SET_VRING_KICK failed");
+}
+
+static int notify_vq(struct kvm *kvm, void *dev, u32 vq)
+{
+       return 0;
+}
+
+static int get_pfn_vq(struct kvm *kvm, void *dev, u32 vq)
+{
+       struct scsi_dev *sdev = dev;
+
+       return sdev->vqs[vq].pfn;
+}
+
+static int get_size_vq(struct kvm *kvm, void *dev, u32 vq)
+{
+       return VIRTIO_SCSI_QUEUE_SIZE;
+}
+
+static int set_size_vq(struct kvm *kvm, void *dev, u32 vq, int size)
+{
+       return size;
+}
+
+static struct virtio_ops scsi_dev_virtio_ops = (struct virtio_ops) {
+       .get_config             = get_config,
+       .get_host_features      = get_host_features,
+       .set_guest_features     = set_guest_features,
+       .init_vq                = init_vq,
+       .get_pfn_vq             = get_pfn_vq,
+       .get_size_vq            = get_size_vq,
+       .set_size_vq            = set_size_vq,
+       .notify_vq              = notify_vq,
+       .notify_vq_gsi          = notify_vq_gsi,
+       .notify_vq_eventfd      = notify_vq_eventfd,
+};
+
+static void virtio_scsi_vhost_init(struct kvm *kvm, struct scsi_dev *sdev)
+{
+       struct vhost_memory *mem;
+       u64 features;
+       int r;
+
+       sdev->vhost_fd = open("/dev/vhost-scsi", O_RDWR);
+       if (sdev->vhost_fd < 0)
+               die_perror("Failed openning vhost-scsi device");
+
+       mem = calloc(1, sizeof(*mem) + sizeof(struct vhost_memory_region));
+       if (mem == NULL)
+               die("Failed allocating memory for vhost memory map");
+
+       mem->nregions = 1;
+       mem->regions[0] = (struct vhost_memory_region) {
+               .guest_phys_addr        = 0,
+               .memory_size            = kvm->ram_size,
+               .userspace_addr         = (unsigned long)kvm->ram_start,
+       };
+
+       r = ioctl(sdev->vhost_fd, VHOST_SET_OWNER);
+       if (r != 0)
+               die_perror("VHOST_SET_OWNER failed");
+
+       r = ioctl(sdev->vhost_fd, VHOST_GET_FEATURES, &features);
+       if (r != 0)
+               die_perror("VHOST_GET_FEATURES failed");
+
+       r = ioctl(sdev->vhost_fd, VHOST_SET_FEATURES, &features);
+       if (r != 0)
+               die_perror("VHOST_SET_FEATURES failed");
+       r = ioctl(sdev->vhost_fd, VHOST_SET_MEM_TABLE, mem);
+       if (r != 0)
+               die_perror("VHOST_SET_MEM_TABLE failed");
+
+       sdev->vdev.use_vhost = true;
+
+       free(mem);
+}
+
+static int virtio_scsi_init_one(struct kvm *kvm, struct disk_image *disk)
+{
+       struct scsi_dev *sdev;
+
+       if (!disk)
+               return -EINVAL;
+
+       sdev = calloc(1, sizeof(struct scsi_dev));
+       if (sdev == NULL)
+               return -ENOMEM;
+
+       *sdev = (struct scsi_dev) {
+               .config = (struct virtio_scsi_config) {
+                       .num_queues     = NUM_VIRT_QUEUES - 2,
+                       .seg_max        = VIRTIO_SCSI_CDB_SIZE - 2,
+                       .max_sectors    = 65535,
+                       .cmd_per_lun    = 128,
+                       .sense_size     = VIRTIO_SCSI_SENSE_SIZE,
+                       .cdb_size       = VIRTIO_SCSI_CDB_SIZE,
+                       .max_channel    = 0,
+                       .max_target     = 0,
+                       .max_lun        = 16383,
+                       .event_info_size = sizeof(struct virtio_scsi_event),
+               },
+               .kvm                    = kvm,
+       };
+       strncpy((char *)&sdev->target.vhost_wwpn, disk->wwpn, sizeof(sdev->target.vhost_wwpn));
+       sdev->target.vhost_tpgt = strtol(disk->tpgt, NULL, 0);
+
+       virtio_init(kvm, sdev, &sdev->vdev, &scsi_dev_virtio_ops,
+                   VIRTIO_PCI, PCI_DEVICE_ID_VIRTIO_SCSI, VIRTIO_ID_SCSI, PCI_CLASS_BLK);
+
+       list_add_tail(&sdev->list, &sdevs);
+
+       virtio_scsi_vhost_init(kvm, sdev);
+
+       if (compat_id == -1)
+               compat_id = virtio_compat_add_message("virtio-scsi", "CONFIG_VIRTIO_SCSI");
+
+       return 0;
+}
+
+static int virtio_scsi_exit_one(struct kvm *kvm, struct scsi_dev *sdev)
+{
+       int r;
+
+       r = ioctl(sdev->vhost_fd, VHOST_SCSI_CLEAR_ENDPOINT, &sdev->target);
+       if (r != 0)
+               die("VHOST_SCSI_CLEAR_ENDPOINT failed %d", errno);
+
+       list_del(&sdev->list);
+       free(sdev);
+
+       return 0;
+}
+
+int virtio_scsi_init(struct kvm *kvm)
+{
+       int i, r = 0;
+
+       for (i = 0; i < kvm->nr_disks; i++) {
+               if (!kvm->disks[i]->wwpn)
+                       continue;
+               r = virtio_scsi_init_one(kvm, kvm->disks[i]);
+               if (r < 0)
+                       goto cleanup;
+       }
+
+       return 0;
+cleanup:
+       virtio_scsi_exit(kvm);
+
+       return r;
+}
+
+int virtio_scsi_exit(struct kvm *kvm)
+{
+       while (!list_empty(&sdevs)) {
+               struct scsi_dev *sdev;
+
+               sdev = list_first_entry(&sdevs, struct scsi_dev, list);
+               virtio_scsi_exit_one(kvm, sdev);
+       }
+
+       return 0;
+}
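
Note on ordering: the vhost bring-up here is a fixed ioctl sequence -- the
device-wide calls in virtio_scsi_vhost_init() must precede the per-queue
calls in init_vq() and the CALL/KICK wiring. A minimal sketch of that
sequence (error handling elided; not part of the patch itself):

        /* assumes an open /dev/vhost-scsi fd and a filled-in memory map */
        #include <linux/vhost.h>
        #include <sys/ioctl.h>

        static void vhost_order_sketch(int fd, struct vhost_memory *mem)
        {
                __u64 features;

                ioctl(fd, VHOST_SET_OWNER);               /* claim the device */
                ioctl(fd, VHOST_GET_FEATURES, &features); /* negotiate ...    */
                ioctl(fd, VHOST_SET_FEATURES, &features); /* ... features     */
                ioctl(fd, VHOST_SET_MEM_TABLE, mem);      /* guest RAM layout */
                /*
                 * Then, per queue: VHOST_SET_VRING_NUM, VHOST_SET_VRING_BASE,
                 * VHOST_SET_VRING_ADDR, VHOST_SET_VRING_CALL (irqfd towards
                 * the guest) and VHOST_SET_VRING_KICK (guest notify eventfd).
                 */
        }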
diff --git a/tools/kvm/x86/bios.c b/tools/kvm/x86/bios.c
new file mode 100644 (file)
index 0000000..0f1bd85
--- /dev/null
@@ -0,0 +1,174 @@
+#include "kvm/kvm.h"
+#include "kvm/boot-protocol.h"
+#include "kvm/e820.h"
+#include "kvm/interrupt.h"
+#include "kvm/util.h"
+
+#include <string.h>
+#include <asm/e820.h>
+
+#include "bios/bios-rom.h"
+
+struct irq_handler {
+       unsigned long           address;
+       unsigned int            irq;
+       void                    *handler;
+       size_t                  size;
+};
+
+#define BIOS_IRQ_PA_ADDR(name) (MB_BIOS_BEGIN + BIOS_OFFSET__##name)
+#define BIOS_IRQ_FUNC(name)    ((char *)&bios_rom[BIOS_OFFSET__##name])
+#define BIOS_IRQ_SIZE(name)    (BIOS_ENTRY_SIZE(BIOS_OFFSET__##name))
+
+#define DEFINE_BIOS_IRQ_HANDLER(_irq, _handler)                        \
+       {                                                       \
+               .irq            = _irq,                         \
+               .address        = BIOS_IRQ_PA_ADDR(_handler),   \
+               .handler        = BIOS_IRQ_FUNC(_handler),      \
+               .size           = BIOS_IRQ_SIZE(_handler),      \
+       }
+
+static struct irq_handler bios_irq_handlers[] = {
+       DEFINE_BIOS_IRQ_HANDLER(0x10, bios_int10),
+       DEFINE_BIOS_IRQ_HANDLER(0x15, bios_int15),
+};
+
+static void setup_irq_handler(struct kvm *kvm, struct irq_handler *handler)
+{
+       struct real_intr_desc intr_desc;
+       void *p;
+
+       p = guest_flat_to_host(kvm, handler->address);
+       memcpy(p, handler->handler, handler->size);
+
+       intr_desc = (struct real_intr_desc) {
+               .segment        = REAL_SEGMENT(MB_BIOS_BEGIN),
+               .offset         = handler->address - MB_BIOS_BEGIN,
+       };
+
+       DIE_IF((handler->address - MB_BIOS_BEGIN) > 0xffffUL);
+
+       interrupt_table__set(&kvm->interrupt_table, &intr_desc, handler->irq);
+}
+
+/**
+ * e820_setup - set up a simple E820 memory map
+ * @kvm: guest system descriptor
+ */
+static void e820_setup(struct kvm *kvm)
+{
+       struct e820map *e820;
+       struct e820entry *mem_map;
+       unsigned int i = 0;
+
+       e820            = guest_flat_to_host(kvm, E820_MAP_START);
+       mem_map         = e820->map;
+
+       mem_map[i++]    = (struct e820entry) {
+               .addr           = REAL_MODE_IVT_BEGIN,
+               .size           = EBDA_START - REAL_MODE_IVT_BEGIN,
+               .type           = E820_RAM,
+       };
+       mem_map[i++]    = (struct e820entry) {
+               .addr           = EBDA_START,
+               .size           = VGA_RAM_BEGIN - EBDA_START,
+               .type           = E820_RESERVED,
+       };
+       mem_map[i++]    = (struct e820entry) {
+               .addr           = MB_BIOS_BEGIN,
+               .size           = MB_BIOS_END - MB_BIOS_BEGIN,
+               .type           = E820_RESERVED,
+       };
+       if (kvm->ram_size < KVM_32BIT_GAP_START) {
+               mem_map[i++]    = (struct e820entry) {
+                       .addr           = BZ_KERNEL_START,
+                       .size           = kvm->ram_size - BZ_KERNEL_START,
+                       .type           = E820_RAM,
+               };
+       } else {
+               mem_map[i++]    = (struct e820entry) {
+                       .addr           = BZ_KERNEL_START,
+                       .size           = KVM_32BIT_GAP_START - BZ_KERNEL_START,
+                       .type           = E820_RAM,
+               };
+               mem_map[i++]    = (struct e820entry) {
+                       .addr           = KVM_32BIT_MAX_MEM_SIZE,
+                       .size           = kvm->ram_size - KVM_32BIT_MAX_MEM_SIZE,
+                       .type           = E820_RAM,
+               };
+       }
+
+       BUG_ON(i > E820_X_MAX);
+
+       e820->nr_map = i;
+}
+
+static void setup_vga_rom(struct kvm *kvm)
+{
+       u16 *mode;
+       void *p;
+
+       p = guest_flat_to_host(kvm, VGA_ROM_OEM_STRING);
+       memset(p, 0, VGA_ROM_OEM_STRING_SIZE);
+       strncpy(p, "KVM VESA", VGA_ROM_OEM_STRING_SIZE);
+
+       mode = guest_flat_to_host(kvm, VGA_ROM_MODES);
+       mode[0] = 0x0112;
+       mode[1] = 0xffff;
+}
+
+/**
+ * setup_bios - inject BIOS into guest memory
+ * @kvm: guest system descriptor
+ */
+void setup_bios(struct kvm *kvm)
+{
+       unsigned long address = MB_BIOS_BEGIN;
+       struct real_intr_desc intr_desc;
+       unsigned int i;
+       void *p;
+
+       /*
+        * Before anything else -- clean some known areas;
+        * we definitely don't want any trash here
+        */
+       p = guest_flat_to_host(kvm, BDA_START);
+       memset(p, 0, BDA_END - BDA_START);
+
+       p = guest_flat_to_host(kvm, EBDA_START);
+       memset(p, 0, EBDA_END - EBDA_START);
+
+       p = guest_flat_to_host(kvm, MB_BIOS_BEGIN);
+       memset(p, 0, MB_BIOS_END - MB_BIOS_BEGIN);
+
+       p = guest_flat_to_host(kvm, VGA_ROM_BEGIN);
+       memset(p, 0, VGA_ROM_END - VGA_ROM_BEGIN);
+
+       /* just copy the BIOS ROM into place */
+       p = guest_flat_to_host(kvm, MB_BIOS_BEGIN);
+       memcpy(p, bios_rom, bios_rom_size);
+
+       /* E820 memory map must be present */
+       e820_setup(kvm);
+
+       /* VESA needs its own tricks */
+       setup_vga_rom(kvm);
+
+       /*
+        * Set up a *fake* real mode vector table; it has only
+        * one real handler, which just does iret
+        */
+       address = BIOS_IRQ_PA_ADDR(bios_intfake);
+       intr_desc = (struct real_intr_desc) {
+               .segment        = REAL_SEGMENT(MB_BIOS_BEGIN),
+               .offset         = address - MB_BIOS_BEGIN,
+       };
+       interrupt_table__setup(&kvm->interrupt_table, &intr_desc);
+
+       for (i = 0; i < ARRAY_SIZE(bios_irq_handlers); i++)
+               setup_irq_handler(kvm, &bios_irq_handlers[i]);
+
+       /* we're almost done */
+       p = guest_flat_to_host(kvm, 0);
+       interrupt_table__copy(&kvm->interrupt_table, p, REAL_INTR_SIZE);
+}
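
For concreteness, the map e820_setup() above emits when ram_size exceeds
KVM_32BIT_GAP_START looks like this (illustrative, assuming 5 GiB of RAM):

        /*
         * addr 0x000000000 size 0x0009fc00  E820_RAM       low memory
         * addr 0x00009fc00 size 0x00000400  E820_RESERVED  EBDA
         * addr 0x0000f0000 size 0x0000ffff  E820_RESERVED  motherboard BIOS
         * addr 0x000100000 size 0xcff00000  E820_RAM       up to the 768 MiB PCI gap
         * addr 0x100000000 size 0x40000000  E820_RAM       remainder above 4 GiB
         */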
diff --git a/tools/kvm/x86/bios/.gitignore b/tools/kvm/x86/bios/.gitignore
new file mode 100644 (file)
index 0000000..1f0080b
--- /dev/null
@@ -0,0 +1,3 @@
+bios-rom.bin
+bios-rom.bin.elf
+bios-rom.h
diff --git a/tools/kvm/x86/bios/bios-rom.S b/tools/kvm/x86/bios/bios-rom.S
new file mode 100644 (file)
index 0000000..3269ce9
--- /dev/null
@@ -0,0 +1,12 @@
+#include <kvm/assembly.h>
+
+       .org 0
+#ifdef CONFIG_X86_64
+       .code64
+#else
+       .code32
+#endif
+
+GLOBAL(bios_rom)
+       .incbin "x86/bios/bios.bin"
+END(bios_rom)
diff --git a/tools/kvm/x86/bios/e820.c b/tools/kvm/x86/bios/e820.c
new file mode 100644 (file)
index 0000000..a9bca29
--- /dev/null
@@ -0,0 +1,72 @@
+#include "kvm/e820.h"
+
+#include "kvm/segment.h"
+#include "kvm/bios.h"
+
+#include <asm/processor-flags.h>
+#include <asm/e820.h>
+
+static inline void set_fs(u16 seg)
+{
+       asm volatile("movw %0,%%fs" : : "rm" (seg));
+}
+
+static inline u8 rdfs8(unsigned long addr)
+{
+       u8 v;
+
+       asm volatile("addr32 movb %%fs:%1,%0" : "=q" (v) : "m" (*(u8 *)addr));
+
+       return v;
+}
+
+static inline u32 rdfs32(unsigned long addr)
+{
+       u32 v;
+
+       asm volatile("addr32 movl %%fs:%1,%0" : "=q" (v) : "m" (*(u32 *)addr));
+
+       return v;
+}
+
+bioscall void e820_query_map(struct biosregs *regs)
+{
+       struct e820map *e820;
+       u32 map_size;
+       u16 fs_seg;
+       u32 ndx;
+
+       e820            = (struct e820map *)E820_MAP_START;
+       fs_seg          = flat_to_seg16(E820_MAP_START);
+       set_fs(fs_seg);
+
+       ndx             = regs->ebx;
+
+       map_size        = rdfs32(flat_to_off16((u32)&e820->nr_map, fs_seg));
+
+       if (ndx < map_size) {
+               u32 start;
+               unsigned int i;
+               u8 *p;
+
+               start   = (u32)&e820->map[ndx];
+
+               p       = (void *) regs->edi;
+
+               for (i = 0; i < sizeof(struct e820entry); i++)
+                       *p++    = rdfs8(flat_to_off16(start + i, fs_seg));
+       }
+
+       regs->eax       = SMAP;
+       regs->ecx       = sizeof(struct e820entry);
+       regs->ebx       = ++ndx;
+
+       /* Clear CF to indicate success.  */
+       regs->eflags    &= ~X86_EFLAGS_CF;
+
+       if (ndx >= map_size)
+               regs->ebx       = 0;    /* end of map */
+}
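
The handler above implements the BIOS side of the classic INT 15h,
EAX=0xE820 interface: EBX carries the continuation index, EDI points at the
destination buffer (ES:DI in the original protocol), and on return EAX
holds the 'SMAP' signature, ECX the entry size, CF is clear on success and
EBX is the next index (0 once the map is exhausted). A guest-side sketch of
the resulting loop, where int15_e820() stands in for a hypothetical
real-mode call wrapper:

        struct e820entry buf;
        u32 ebx = 0;

        do {
                ebx = int15_e820(ebx, &buf);    /* returns the new EBX */
                add_memory_range(&buf);         /* hypothetical consumer */
        } while (ebx != 0);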
diff --git a/tools/kvm/x86/bios/entry.S b/tools/kvm/x86/bios/entry.S
new file mode 100644 (file)
index 0000000..85056e9
--- /dev/null
@@ -0,0 +1,92 @@
+/*
+ * Our pretty trivial BIOS emulation
+ */
+
+#include <kvm/bios.h>
+#include <kvm/assembly.h>
+
+       .org 0
+       .code16gcc
+
+#define EFLAGS_CF      (1 << 0)
+
+#include "macro.S"
+
+/* If you change these macros, remember to update 'struct biosregs' */
+.macro SAVE_BIOSREGS
+       pushl   %fs
+       pushl   %es
+       pushl   %ds
+       pushl   %edi
+       pushl   %esi
+       pushl   %ebp
+       pushl   %esp
+       pushl   %edx
+       pushl   %ecx
+       pushl   %ebx
+       pushl   %eax
+.endm
+
+.macro RESTORE_BIOSREGS
+       popl    %eax
+       popl    %ebx
+       popl    %ecx
+       popl    %edx
+       popl    %esp
+       popl    %ebp
+       popl    %esi
+       popl    %edi
+       popl    %ds
+       popl    %es
+       popl    %fs
+.endm
+
+/*
+ * Fake interrupt handler; nothing could ever be faster
+ */
+ENTRY(bios_intfake)
+       /*
+        * Set CF to indicate failure. We don't want callers to think that the
+        * interrupt handler succeeded and then treat the return values in
+        * registers as valid data.
+        */
+       orl     $EFLAGS_CF, 0x4(%esp)
+
+       IRET
+ENTRY_END(bios_intfake)
+
+/*
+ * int 10 - video - service
+ */
+ENTRY(bios_int10)
+       SAVE_BIOSREGS
+
+       movl            %esp, %eax
+       /* this is way easier than doing it in assembly */
+       /* just push all the regs and jump to a C handler */
+       call    int10_handler
+
+       RESTORE_BIOSREGS
+
+       /* Clear CF to indicate success.  */
+       andl    $~EFLAGS_CF, 0x4(%esp)
+
+       IRET
+ENTRY_END(bios_int10)
+
+ENTRY(bios_int15)
+       SAVE_BIOSREGS
+
+       movl    %esp, %eax
+       call    int15_handler
+
+       RESTORE_BIOSREGS
+
+       IRET
+ENTRY_END(bios_int15)
+
+GLOBAL(__locals)
+
+#include "local.S"
+
+END(__locals)
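
Layout note: SAVE_BIOSREGS pushes %fs first and %eax last, so at each
'call' the stack, read upwards from %esp, is eax, ebx, ecx, edx, esp, ebp,
esi, edi, ds, es, fs -- the leading field order of struct biosregs in
x86/include/kvm/bios.h below, followed by the CPU-pushed interrupt frame.
Since 'bioscall' is regparm(3) (first argument in %eax), a plain
'movl %esp, %eax' hands the saved register file to the C handlers:

        /* stack at 'call int10_handler', lowest address first:
         *   %esp -> eax ebx ecx edx esp ebp esi edi ds es fs | int frame
         *           \_________ struct biosregs layout ______/
         */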
diff --git a/tools/kvm/x86/bios/gen-offsets.sh b/tools/kvm/x86/bios/gen-offsets.sh
new file mode 100644 (file)
index 0000000..8771bbe
--- /dev/null
@@ -0,0 +1,14 @@
+#!/bin/sh
+
+echo "/* Autogenerated file, don't edit */"
+echo "#ifndef BIOS_OFFSETS_H"
+echo "#define BIOS_OFFSETS_H"
+
+echo ""
+echo "#define BIOS_ENTRY_SIZE(name) (name##_end - name)"
+echo ""
+
+nm bios.bin.elf | grep ' [Tt] ' | awk '{ print "#define BIOS_OFFSET__" $3 " 0x" $1; }'
+
+echo ""
+echo "#endif"
diff --git a/tools/kvm/x86/bios/int10.c b/tools/kvm/x86/bios/int10.c
new file mode 100644 (file)
index 0000000..7cc0b3f
--- /dev/null
@@ -0,0 +1,110 @@
+#include "kvm/segment.h"
+#include "kvm/bios.h"
+#include "kvm/vesa.h"
+
+#include "bios/memcpy.h"
+
+#include <boot/vesa.h>
+
+static far_ptr gen_far_ptr(unsigned int pa)
+{
+       far_ptr ptr;
+
+       ptr.seg = (pa >> 4);
+       ptr.off = pa - (ptr.seg << 4);
+
+       return ptr;
+}
+
+static inline void outb(unsigned short port, unsigned char val)
+{
+       asm volatile("outb %0, %1" : : "a"(val), "Nd"(port));
+}
+
+/*
+ * It's probably much more useful to make this print to the serial
+ * line rather than print to a non-displayed VGA memory
+ */
+static inline void int10_putchar(struct biosregs *args)
+{
+       u8 al = args->eax & 0xFF;
+
+       outb(0x3f8, al);
+}
+
+static void vbe_get_mode(struct biosregs *args)
+{
+       struct vesa_mode_info *info = (struct vesa_mode_info *) args->edi;
+
+       *info = (struct vesa_mode_info) {
+               .mode_attr              = 0xd9, /* 11011001 */
+               .logical_scan           = VESA_WIDTH*4,
+               .h_res                  = VESA_WIDTH,
+               .v_res                  = VESA_HEIGHT,
+               .bpp                    = VESA_BPP,
+               .memory_layout          = 6,
+               .memory_planes          = 1,
+               .lfb_ptr                = VESA_MEM_ADDR,
+               .rmask                  = 8,
+               .gmask                  = 8,
+               .bmask                  = 8,
+               .resv_mask              = 8,
+               .resv_pos               = 24,
+               .bpos                   = 16,
+               .gpos                   = 8,
+       };
+}
+
+static void vbe_get_info(struct biosregs *args)
+{
+       struct vesa_general_info *infop = (struct vesa_general_info *) args->edi;
+       struct vesa_general_info info;
+
+       info = (struct vesa_general_info) {
+               .signature              = VESA_MAGIC,
+               .version                = 0x102,
+               .vendor_string          = gen_far_ptr(VGA_ROM_BEGIN),
+               .capabilities           = 0x10,
+               .video_mode_ptr         = gen_far_ptr(VGA_ROM_MODES),
+               .total_memory           = (4 * VESA_WIDTH * VESA_HEIGHT) / 0x10000,
+       };
+
+       memcpy16(args->es, infop, args->ds, &info, sizeof(info));
+}
+
+#define VBE_STATUS_OK          0x004F
+
+static void int10_vesa(struct biosregs *args)
+{
+       u8 al;
+
+       al = args->eax & 0xff;
+
+       switch (al) {
+       case 0x00:
+               vbe_get_info(args);
+               break;
+       case 0x01:
+               vbe_get_mode(args);
+               break;
+       }
+
+       args->eax = VBE_STATUS_OK;
+}
+
+bioscall void int10_handler(struct biosregs *args)
+{
+       u8 ah;
+
+       ah = (args->eax & 0xff00) >> 8;
+
+       switch (ah) {
+       case 0x0e:
+               int10_putchar(args);
+               break;
+       case 0x4f:
+               int10_vesa(args);
+               break;
+       }
+}
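
gen_far_ptr() above is plain segment arithmetic: seg = pa >> 4 and
off = pa - (seg << 4), i.e. the low four bits of the address. A worked
example:

        /* pa = 0x000c7fff (VGA_ROM_END):
         *   seg = 0x000c7fff >> 4            = 0xc7ff
         *   off = 0x000c7fff - (0xc7ff << 4) = 0x000f
         * and 0xc7ff:0x000f linearizes back to 0xc7ff0 + 0xf = 0x000c7fff.
         */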
diff --git a/tools/kvm/x86/bios/int15.c b/tools/kvm/x86/bios/int15.c
new file mode 100644 (file)
index 0000000..faf5343
--- /dev/null
@@ -0,0 +1,18 @@
+#include "kvm/bios.h"
+
+#include "kvm/e820.h"
+
+#include <asm/processor-flags.h>
+
+bioscall void int15_handler(struct biosregs *regs)
+{
+       switch (regs->eax) {
+       case 0xe820:
+               e820_query_map(regs);
+               break;
+       default:
+               /* Set CF to indicate failure.  */
+               regs->eflags    |= X86_EFLAGS_CF;
+               break;
+       }
+}
diff --git a/tools/kvm/x86/bios/local.S b/tools/kvm/x86/bios/local.S
new file mode 100644 (file)
index 0000000..f2cdbf4
--- /dev/null
@@ -0,0 +1,7 @@
+/*
+ * Local variables for almost every BIOS IRQ handler.
+ * These must be placed somewhere inside the IRQ handler body.
+ */
+__CALLER_SS:           .int  0
+__CALLER_SP:           .long 0
+__CALLER_CLOBBER:      .long 0
diff --git a/tools/kvm/x86/bios/macro.S b/tools/kvm/x86/bios/macro.S
new file mode 100644 (file)
index 0000000..0d5e567
--- /dev/null
@@ -0,0 +1,25 @@
+/*
+ * handy BIOS macros
+ */
+
+/*
+ * switch to BIOS stack
+ */
+.macro stack_swap
+       movw %ss, %cs:(__CALLER_SS)
+       movl %esp, %cs:(__CALLER_SP)
+       movl %edx, %cs:(__CALLER_CLOBBER)
+       movw $MB_BIOS_SS, %dx
+       movw %dx, %ss
+       movw $MB_BIOS_SP, %sp
+       movl %cs:(__CALLER_CLOBBER), %edx
+.endm
+
+/*
+ * restore the original stack
+ */
+.macro stack_restore
+       movl %cs:(__CALLER_SP), %esp
+       movw %cs:(__CALLER_SS), %ss
+.endm
diff --git a/tools/kvm/x86/bios/memcpy.c b/tools/kvm/x86/bios/memcpy.c
new file mode 100644 (file)
index 0000000..40b9b65
--- /dev/null
@@ -0,0 +1,23 @@
+#include "bios/memcpy.h"
+
+/*
+ *  Copy memory area in 16-bit real mode.
+ */
+void memcpy16(u16 dst_seg, void *dst, u16 src_seg, const void *src, size_t len)
+{
+       __asm__ __volatile__ (
+               "pushw  %%ds                            \n"
+               "pushw  %%es                            \n"
+               "movw   %[src_seg], %%ds                \n"
+               "movw   %[dst_seg], %%es                \n"
+               "rep movsb %%ds:(%%si), %%es:(%%di)     \n"
+               "popw   %%es                            \n"
+               "popw   %%ds                            \n"
+               :
+               : "S"(src),
+                 "D"(dst),
+                 "c"(len),
+                 [src_seg] "r"(src_seg),
+                 [dst_seg] "r"(dst_seg)
+               : "cc", "memory");
+}
diff --git a/tools/kvm/x86/bios/rom.ld.S b/tools/kvm/x86/bios/rom.ld.S
new file mode 100644 (file)
index 0000000..f4f1835
--- /dev/null
@@ -0,0 +1,16 @@
+OUTPUT_FORMAT("elf32-i386", "elf32-i386", "elf32-i386")
+OUTPUT_ARCH(i386)
+
+SECTIONS {
+       .text 0 : {
+               *(.text)
+       }
+
+       /DISCARD/ : {
+               *(.debug*)
+               *(.data)
+               *(.bss)
+               *(.eh_frame*)
+       }
+}
diff --git a/tools/kvm/x86/boot.c b/tools/kvm/x86/boot.c
new file mode 100644 (file)
index 0000000..93d9677
--- /dev/null
@@ -0,0 +1,41 @@
+#include "kvm/kvm.h"
+
+#include "kvm/util.h"
+
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <stdbool.h>
+#include <fcntl.h>
+
+#define BIOS_SELECTOR  0xf000
+#define BIOS_IP                0xfff0
+#define BIOS_SP                0x8000
+
+bool kvm__load_firmware(struct kvm *kvm, const char *firmware_filename)
+{
+       struct stat st;
+       void *p;
+       int fd;
+       int nr;
+
+       fd = open(firmware_filename, O_RDONLY);
+       if (fd < 0)
+               return false;
+
+       if (fstat(fd, &st))
+               return false;
+
+       if (st.st_size > MB_FIRMWARE_BIOS_SIZE)
+               die("firmware image %s is too big to fit in memory (%Lu KB).\n", firmware_filename, (u64)(st.st_size / 1024));
+
+       p = guest_flat_to_host(kvm, MB_FIRMWARE_BIOS_BEGIN);
+
+       while ((nr = read(fd, p, st.st_size)) > 0)
+               p += nr;
+
+       close(fd);
+
+       kvm->boot_selector      = BIOS_SELECTOR;
+       kvm->boot_ip            = BIOS_IP;
+       kvm->boot_sp            = BIOS_SP;
+
+       return true;
+}
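
The boot registers above land the vcpu on the x86 reset vector:
0xf000:0xfff0 linearizes to 0xffff0. Worked out against the constants from
kvm/bios.h later in this patch:

        /* flat = (BIOS_SELECTOR << 4) + BIOS_IP
         *      = (0xf000 << 4) + 0xfff0 = 0xf0000 + 0xfff0 = 0xffff0
         * A full MB_FIRMWARE_BIOS_SIZE (128 KiB) image loaded at
         * MB_FIRMWARE_BIOS_BEGIN (0xe0000) thus ends at 0xfffff, and
         * execution starts 16 bytes below its top, as on real hardware.
         */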
diff --git a/tools/kvm/x86/cpuid.c b/tools/kvm/x86/cpuid.c
new file mode 100644 (file)
index 0000000..4c140f0
--- /dev/null
@@ -0,0 +1,60 @@
+#include "kvm/kvm-cpu.h"
+
+#include "kvm/kvm.h"
+#include "kvm/util.h"
+
+#include <sys/ioctl.h>
+#include <stdlib.h>
+
+#define CPUID_FUNC_PERFMON             0x0A
+
+#define        MAX_KVM_CPUID_ENTRIES           100
+
+static void filter_cpuid(struct kvm_cpuid2 *kvm_cpuid)
+{
+       unsigned int i;
+
+       /*
+        * Filter CPUID functions that are not supported by the hypervisor.
+        */
+       for (i = 0; i < kvm_cpuid->nent; i++) {
+               struct kvm_cpuid_entry2 *entry = &kvm_cpuid->entries[i];
+
+               switch (entry->function) {
+               case 1:
+                       /* Set X86_FEATURE_HYPERVISOR */
+                       if (entry->index == 0)
+                               entry->ecx |= (1 << 31);
+                       break;
+               case 6:
+                       /* Clear X86_FEATURE_EPB */
+                       entry->ecx = entry->ecx & ~(1 << 3);
+                       break;
+               case CPUID_FUNC_PERFMON:
+                       entry->eax = 0x00; /* disable it */
+                       break;
+               default:
+                       /* Keep the CPUID function as-is */
+                       break;
+               }
+       }
+}
+
+void kvm_cpu__setup_cpuid(struct kvm_cpu *vcpu)
+{
+       struct kvm_cpuid2 *kvm_cpuid;
+
+       kvm_cpuid = calloc(1, sizeof(*kvm_cpuid) +
+                               MAX_KVM_CPUID_ENTRIES * sizeof(*kvm_cpuid->entries));
+       if (kvm_cpuid == NULL)
+               die("out of memory");
+
+       kvm_cpuid->nent = MAX_KVM_CPUID_ENTRIES;
+       if (ioctl(vcpu->kvm->sys_fd, KVM_GET_SUPPORTED_CPUID, kvm_cpuid) < 0)
+               die_perror("KVM_GET_SUPPORTED_CPUID failed");
+
+       filter_cpuid(kvm_cpuid);
+
+       if (ioctl(vcpu->vcpu_fd, KVM_SET_CPUID2, kvm_cpuid) < 0)
+               die_perror("KVM_SET_CPUID2 failed");
+
+       free(kvm_cpuid);
+}
diff --git a/tools/kvm/x86/include/kvm/assembly.h b/tools/kvm/x86/include/kvm/assembly.h
new file mode 100644 (file)
index 0000000..e70baab
--- /dev/null
@@ -0,0 +1,24 @@
+#ifndef ASSEMBLY_H_
+#define ASSEMBLY_H_
+
+#define __ALIGN        .p2align 4, 0x90
+#define ENTRY(name)    \
+       __ALIGN;        \
+       .globl name;    \
+       name:
+
+#define GLOBAL(name)   \
+       .globl name;    \
+       name:
+
+#define ENTRY_END(name)        GLOBAL(name##_end)
+#define END(name)      GLOBAL(name##_end)
+
+/*
+ * gas produces a size override prefix that we are unhappy with,
+ * so let's hardcode the iret opcode for 16-bit mode
+ */
+#define IRET   .byte 0xcf
+
+#endif /* ASSEMBLY_H_ */
diff --git a/tools/kvm/x86/include/kvm/barrier.h b/tools/kvm/x86/include/kvm/barrier.h
new file mode 100644 (file)
index 0000000..46d14f6
--- /dev/null
@@ -0,0 +1,20 @@
+#ifndef _KVM_BARRIER_H_
+#define _KVM_BARRIER_H_
+
+#define barrier() asm volatile("": : :"memory")
+
+#define mb()   asm volatile ("mfence": : :"memory")
+#define rmb()  asm volatile ("lfence": : :"memory")
+#define wmb()  asm volatile ("sfence": : :"memory")
+
+#ifdef CONFIG_SMP
+#define smp_mb()       mb()
+#define smp_rmb()      rmb()
+#define smp_wmb()      wmb()
+#else
+#define smp_mb()       barrier()
+#define smp_rmb()      barrier()
+#define smp_wmb()      barrier()
+#endif
+
+#endif /* _KVM_BARRIER_H_ */
diff --git a/tools/kvm/x86/include/kvm/bios-export.h b/tools/kvm/x86/include/kvm/bios-export.h
new file mode 100644 (file)
index 0000000..23825aa
--- /dev/null
@@ -0,0 +1,13 @@
+#ifndef BIOS_EXPORT_H_
+#define BIOS_EXPORT_H_
+
+struct kvm;
+
+extern char bios_rom[0];
+extern char bios_rom_end[0];
+
+#define bios_rom_size          (bios_rom_end - bios_rom)
+
+extern void setup_bios(struct kvm *kvm);
+
+#endif /* BIOS_EXPORT_H_ */
diff --git a/tools/kvm/x86/include/kvm/bios.h b/tools/kvm/x86/include/kvm/bios.h
new file mode 100644 (file)
index 0000000..ec7ed71
--- /dev/null
@@ -0,0 +1,93 @@
+#ifndef BIOS_H_
+#define BIOS_H_
+
+/*
+ * X86-32 Memory Map (typical)
+ *                                     start      end
+ * Real Mode Interrupt Vector Table    0x00000000 0x000003FF
+ * BDA area                            0x00000400 0x000004FF
+ * Conventional Low Memory             0x00000500 0x0009FBFF
+ * EBDA area                           0x0009FC00 0x0009FFFF
+ * VIDEO RAM                           0x000A0000 0x000BFFFF
+ * VIDEO ROM (BIOS)                    0x000C0000 0x000C7FFF
+ * ROMs & unus. space (mapped hw & misc)0x000C8000 0x000EFFFF 160 KiB (typically)
+ * Motherboard BIOS                    0x000F0000 0x000FFFFF
+ * Extended Memory                     0x00100000 0xFEBFFFFF
+ * Reserved (configs, ACPI, PnP, etc)  0xFEC00000 0xFFFFFFFF
+ */
+
+#define REAL_MODE_IVT_BEGIN            0x00000000
+#define REAL_MODE_IVT_END              0x000003ff
+
+#define BDA_START                      0x00000400
+#define BDA_END                                0x000004ff
+
+#define EBDA_START                     0x0009fc00
+#define EBDA_END                       0x0009ffff
+
+#define E820_MAP_START                 EBDA_START
+
+#define MB_BIOS_BEGIN                  0x000f0000
+#define MB_FIRMWARE_BIOS_BEGIN         0x000e0000
+#define MB_BIOS_END                    0x000fffff
+
+#define MB_BIOS_SIZE                   (MB_BIOS_END - MB_BIOS_BEGIN + 1)
+#define MB_FIRMWARE_BIOS_SIZE          (MB_BIOS_END - MB_FIRMWARE_BIOS_BEGIN + 1)
+
+#define VGA_RAM_BEGIN                  0x000a0000
+#define VGA_RAM_END                    0x000bffff
+
+#define VGA_ROM_BEGIN                  0x000c0000
+#define VGA_ROM_OEM_STRING             VGA_ROM_BEGIN
+#define VGA_ROM_OEM_STRING_SIZE                16
+#define VGA_ROM_MODES                  (VGA_ROM_OEM_STRING + VGA_ROM_OEM_STRING_SIZE)
+#define VGA_ROM_MODES_SIZE             32
+#define VGA_ROM_END                    0x000c7fff
+
+/* we handle one page only */
+#define VGA_RAM_SEG                    (VGA_RAM_BEGIN >> 4)
+#define VGA_PAGE_SIZE                  0x007d0 /* 80x25 */
+
+/* real mode interrupt vector table */
+#define REAL_INTR_BASE                 REAL_MODE_IVT_BEGIN
+#define REAL_INTR_VECTORS              256
+
+/*
+ * The BIOS stack must be at an absolute, predefined memory address.
+ * We reserve 64 bytes for it.
+ */
+#define MB_BIOS_SS                     0xfff7
+#define MB_BIOS_SP                     0x40
+
+/*
+ * When interfacing with the assembler code we need to be sure how
+ * arguments are passed in real mode.
+ */
+#define bioscall __attribute__((regparm(3)))
+
+#ifndef __ASSEMBLER__
+
+#include <linux/types.h>
+
+struct biosregs {
+       u32                     eax;
+       u32                     ebx;
+       u32                     ecx;
+       u32                     edx;
+       u32                     esp;
+       u32                     ebp;
+       u32                     esi;
+       u32                     edi;
+       u32                     ds;
+       u32                     es;
+       u32                     fs;
+       u32                     eip;
+       u32                     eflags;
+};
+
+extern bioscall void int10_handler(struct biosregs *regs);
+extern bioscall void int15_handler(struct biosregs *regs);
+
+#endif
+
+#endif /* BIOS_H_ */
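
The MB_BIOS_SS/MB_BIOS_SP pair above places the private BIOS stack inside
the motherboard BIOS region; the arithmetic:

        /* MB_BIOS_SS:MB_BIOS_SP = 0xfff7:0x0040
         *   top of stack = (0xfff7 << 4) + 0x40 = 0xfff70 + 0x40 = 0xfffb0
         * which is inside MB_BIOS_BEGIN..MB_BIOS_END; the 64 reserved
         * bytes are 0xfff70..0xfffb0, which the stack_swap macro in
         * x86/bios/macro.S grows down into.
         */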
diff --git a/tools/kvm/x86/include/kvm/boot-protocol.h b/tools/kvm/x86/include/kvm/boot-protocol.h
new file mode 100644 (file)
index 0000000..85b637f
--- /dev/null
@@ -0,0 +1,16 @@
+/*
+ * Linux boot protocol specifics
+ */
+
+#ifndef BOOT_PROTOCOL_H_
+#define BOOT_PROTOCOL_H_
+
+/*
+ * The protected mode kernel part of a modern bzImage is loaded
+ * at 1 MB by default.
+ */
+#define BZ_DEFAULT_SETUP_SECTS         4
+#define BZ_KERNEL_START                        0x100000UL
+#define INITRD_START                   0x1000000UL
+
+#endif /* BOOT_PROTOCOL_H_ */
diff --git a/tools/kvm/x86/include/kvm/cpufeature.h b/tools/kvm/x86/include/kvm/cpufeature.h
new file mode 100644 (file)
index 0000000..bc4abbb
--- /dev/null
@@ -0,0 +1,41 @@
+#ifndef KVM__CPUFEATURE_H
+#define KVM__CPUFEATURE_H
+
+#define CPUID_VENDOR_INTEL_1 0x756e6547 /* "Genu" */
+#define CPUID_VENDOR_INTEL_2 0x49656e69 /* "ineI" */
+#define CPUID_VENDOR_INTEL_3 0x6c65746e /* "ntel" */
+
+#define CPUID_VENDOR_AMD_1   0x68747541 /* "Auth" */
+#define CPUID_VENDOR_AMD_2   0x69746e65 /* "enti" */
+#define CPUID_VENDOR_AMD_3   0x444d4163 /* "cAMD" */
+
+/*
+ * CPUID flags we need to deal with
+ */
+#define KVM__X86_FEATURE_VMX           5       /* Hardware virtualization */
+#define KVM__X86_FEATURE_SVM           2       /* Secure virtual machine */
+#define KVM__X86_FEATURE_XSAVE         26      /* XSAVE/XRSTOR/XSETBV/XGETBV */
+
+#define cpu_feature_disable(reg, feature)      \
+       ((reg) & ~(1 << (feature)))
+#define cpu_feature_enable(reg, feature)       \
+       ((reg) |  (1 << (feature)))
+
+struct cpuid_regs {
+       u32     eax;
+       u32     ebx;
+       u32     ecx;
+       u32     edx;
+};
+
+static inline void host_cpuid(struct cpuid_regs *regs)
+{
+       asm volatile("cpuid"
+               : "=a" (regs->eax),
+                 "=b" (regs->ebx),
+                 "=c" (regs->ecx),
+                 "=d" (regs->edx)
+               : "0" (regs->eax), "2" (regs->ecx));
+}
+
+#endif /* KVM__CPUFEATURE_H */
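
host_cpuid() is a plain CPUID wrapper, and the KVM__X86_FEATURE_* values
are bit positions in the leaf-1 ECX word. A minimal probe using it (a
sketch -- assumes this header and the tool's u32 typedef resolve on the
include path):

        #include <stdio.h>
        #include "kvm/cpufeature.h"

        int main(void)
        {
                struct cpuid_regs regs = { .eax = 1 }; /* leaf 1: feature bits */

                host_cpuid(&regs);
                printf("VMX: %s\n",
                       (regs.ecx >> KVM__X86_FEATURE_VMX) & 1 ? "yes" : "no");
                return 0;
        }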
diff --git a/tools/kvm/x86/include/kvm/interrupt.h b/tools/kvm/x86/include/kvm/interrupt.h
new file mode 100644 (file)
index 0000000..00c7ed7
--- /dev/null
@@ -0,0 +1,26 @@
+#ifndef KVM__INTERRUPT_H
+#define KVM__INTERRUPT_H
+
+#include <linux/types.h>
+#include "kvm/bios.h"
+#include "kvm/bios-export.h"
+
+struct real_intr_desc {
+       u16 offset;
+       u16 segment;
+} __attribute__((packed));
+
+#define REAL_SEGMENT_SHIFT     4
+#define REAL_SEGMENT(addr)     ((addr) >> REAL_SEGMENT_SHIFT)
+#define REAL_OFFSET(addr)      ((addr) & ((1 << REAL_SEGMENT_SHIFT) - 1))
+#define REAL_INTR_SIZE         (REAL_INTR_VECTORS * sizeof(struct real_intr_desc))
+
+struct interrupt_table {
+       struct real_intr_desc entries[REAL_INTR_VECTORS];
+};
+
+void interrupt_table__copy(struct interrupt_table *itable, void *dst, unsigned int size);
+void interrupt_table__setup(struct interrupt_table *itable, struct real_intr_desc *entry);
+void interrupt_table__set(struct interrupt_table *itable, struct real_intr_desc *entry, unsigned int num);
+
+#endif /* KVM__INTERRUPT_H */
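
REAL_SEGMENT()/REAL_OFFSET() above canonicalize a flat address into
16-byte-granular segment arithmetic; note that x86/bios.c deliberately
uses a different, fixed-segment split for its handlers. A worked example:

        /* flat 0x000f0100:
         *   REAL_SEGMENT(0x000f0100) = 0x000f0100 >> 4  = 0xf010
         *   REAL_OFFSET(0x000f0100)  = 0x000f0100 & 0xf = 0x0
         * x86/bios.c instead fixes the segment at REAL_SEGMENT(MB_BIOS_BEGIN)
         * (0xf000) and stores the displacement 0x0100 as the offset; both
         * 0xf010:0x0000 and 0xf000:0x0100 name the same flat address.
         */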
diff --git a/tools/kvm/x86/include/kvm/kvm-arch.h b/tools/kvm/x86/include/kvm/kvm-arch.h
new file mode 100644 (file)
index 0000000..dd385d4
--- /dev/null
@@ -0,0 +1,67 @@
+#ifndef KVM__KVM_ARCH_H
+#define KVM__KVM_ARCH_H
+
+#include "kvm/interrupt.h"
+#include "kvm/segment.h"
+
+#include <stdbool.h>
+#include <linux/types.h>
+#include <time.h>
+
+/*
+ * The hole includes VESA framebuffer and PCI memory.
+ */
+#define KVM_32BIT_MAX_MEM_SIZE  (1ULL << 32)
+#define KVM_32BIT_GAP_SIZE     (768 << 20)
+#define KVM_32BIT_GAP_START    (KVM_32BIT_MAX_MEM_SIZE - KVM_32BIT_GAP_SIZE)
+
+#define KVM_MMIO_START         KVM_32BIT_GAP_START
+
+/* This is the address that pci_get_io_space_block() starts allocating
+ * from.  Note that this is a PCI bus address (though same on x86).
+ */
+#define KVM_PCI_MMIO_AREA      (KVM_MMIO_START + 0x1000000)
+#define KVM_VIRTIO_MMIO_AREA   (KVM_MMIO_START + 0x2000000)
+
+struct kvm {
+       int                     sys_fd;         /* For system ioctls(), i.e. /dev/kvm */
+       int                     vm_fd;          /* For VM ioctls() */
+       timer_t                 timerid;        /* Posix timer for interrupts */
+
+       int                     nrcpus;         /* Number of cpus to run */
+
+       u32                     mem_slots;      /* for KVM_SET_USER_MEMORY_REGION */
+
+       u64                     ram_size;
+       void                    *ram_start;
+       u64                     ram_pagesize;
+
+       bool                    nmi_disabled;
+
+       bool                    single_step;
+
+       u16                     boot_selector;
+       u16                     boot_ip;
+       u16                     boot_sp;
+
+       struct interrupt_table  interrupt_table;
+
+       const char              *vmlinux;
+       struct disk_image       **disks;
+       int                     nr_disks;
+
+       char                    *name;
+
+       int                     vm_state;
+};
+
+static inline void *guest_flat_to_host(struct kvm *kvm, unsigned long offset); /* In kvm.h */
+
+static inline void *guest_real_to_host(struct kvm *kvm, u16 selector, u16 offset)
+{
+       unsigned long flat = segment_to_flat(selector, offset);
+
+       return guest_flat_to_host(kvm, flat);
+}
+
+#endif /* KVM__KVM_ARCH_H */
diff --git a/tools/kvm/x86/include/kvm/kvm-cpu-arch.h b/tools/kvm/x86/include/kvm/kvm-cpu-arch.h
new file mode 100644 (file)
index 0000000..198efe6
--- /dev/null
@@ -0,0 +1,49 @@
+#ifndef KVM__KVM_CPU_ARCH_H
+#define KVM__KVM_CPU_ARCH_H
+
+/* Architecture-specific kvm_cpu definitions. */
+
+#include <linux/kvm.h> /* for struct kvm_regs */
+#include "kvm/kvm.h"   /* for kvm__emulate_{mm}io() */
+#include <stdbool.h>
+#include <pthread.h>
+
+struct kvm;
+
+struct kvm_cpu {
+       pthread_t               thread;         /* VCPU thread */
+
+       unsigned long           cpu_id;
+
+       struct kvm              *kvm;           /* parent KVM */
+       int                     vcpu_fd;        /* For VCPU ioctls() */
+       struct kvm_run          *kvm_run;
+
+       struct kvm_regs         regs;
+       struct kvm_sregs        sregs;
+       struct kvm_fpu          fpu;
+
+       struct kvm_msrs         *msrs;          /* dynamically allocated */
+
+       u8                      is_running;
+       u8                      paused;
+       u8                      needs_nmi;
+
+       struct kvm_coalesced_mmio_ring  *ring;
+};
+
+/*
+ * As these are such simple wrappers, let's have them in the header so they'll
+ * be cheaper to call:
+ */
+static inline bool kvm_cpu__emulate_io(struct kvm *kvm, u16 port, void *data, int direction, int size, u32 count)
+{
+       return kvm__emulate_io(kvm, port, data, direction, size, count);
+}
+
+static inline bool kvm_cpu__emulate_mmio(struct kvm *kvm, u64 phys_addr, u8 *data, u32 len, u8 is_write)
+{
+       return kvm__emulate_mmio(kvm, phys_addr, data, len, is_write);
+}
+
+#endif /* KVM__KVM_CPU_ARCH_H */
diff --git a/tools/kvm/x86/include/kvm/mptable.h b/tools/kvm/x86/include/kvm/mptable.h
new file mode 100644 (file)
index 0000000..9e3cfa6
--- /dev/null
@@ -0,0 +1,9 @@
+#ifndef KVM_MPTABLE_H_
+#define KVM_MPTABLE_H_
+
+struct kvm;
+
+int mptable__init(struct kvm *kvm);
+int mptable__exit(struct kvm *kvm);
+
+#endif /* KVM_MPTABLE_H_ */
diff --git a/tools/kvm/x86/interrupt.c b/tools/kvm/x86/interrupt.c
new file mode 100644 (file)
index 0000000..7d47869
--- /dev/null
@@ -0,0 +1,28 @@
+#include "kvm/interrupt.h"
+
+#include "kvm/util.h"
+
+#include <string.h>
+
+void interrupt_table__copy(struct interrupt_table *itable, void *dst, unsigned int size)
+{
+       if (size < sizeof(itable->entries))
+               die("An attempt to overwrite host memory");
+
+       memcpy(dst, itable->entries, sizeof(itable->entries));
+}
+
+void interrupt_table__setup(struct interrupt_table *itable, struct real_intr_desc *entry)
+{
+       unsigned int i;
+
+       for (i = 0; i < REAL_INTR_VECTORS; i++)
+               itable->entries[i] = *entry;
+}
+
+void interrupt_table__set(struct interrupt_table *itable,
+                               struct real_intr_desc *entry, unsigned int num)
+{
+       if (num < REAL_INTR_VECTORS)
+               itable->entries[num] = *entry;
+}
diff --git a/tools/kvm/x86/ioport.c b/tools/kvm/x86/ioport.c
new file mode 100644 (file)
index 0000000..86302e6
--- /dev/null
@@ -0,0 +1,77 @@
+#include "kvm/ioport.h"
+
+#include <stdlib.h>
+#include <stdio.h>
+
+static bool debug_io_out(struct ioport *ioport, struct kvm *kvm, u16 port, void *data, int size)
+{
+       exit(EXIT_SUCCESS);
+}
+
+static struct ioport_operations debug_ops = {
+       .io_out         = debug_io_out,
+};
+
+static bool seabios_debug_io_out(struct ioport *ioport, struct kvm *kvm, u16 port, void *data, int size)
+{
+       char ch;
+
+       ch = ioport__read8(data);
+
+       putchar(ch);
+
+       return true;
+}
+
+static struct ioport_operations seabios_debug_ops = {
+       .io_out         = seabios_debug_io_out,
+};
+
+static bool dummy_io_in(struct ioport *ioport, struct kvm *kvm, u16 port, void *data, int size)
+{
+       return true;
+}
+
+static bool dummy_io_out(struct ioport *ioport, struct kvm *kvm, u16 port, void *data, int size)
+{
+       return true;
+}
+
+static struct ioport_operations dummy_read_write_ioport_ops = {
+       .io_in          = dummy_io_in,
+       .io_out         = dummy_io_out,
+};
+
+static struct ioport_operations dummy_write_only_ioport_ops = {
+       .io_out         = dummy_io_out,
+};
+
+void ioport__setup_arch(void)
+{
+       /* Legacy ioport setup */
+
+       /* 0x0020 - 0x003F - 8259A PIC 1 */
+       ioport__register(0x0020, &dummy_read_write_ioport_ops, 2, NULL);
+
+       /* PORT 0040-005F - PIT - PROGRAMMABLE INTERVAL TIMER (8253, 8254) */
+       ioport__register(0x0040, &dummy_read_write_ioport_ops, 4, NULL);
+
+       /* 0x00A0 - 0x00AF - 8259A PIC 2 */
+       ioport__register(0x00A0, &dummy_read_write_ioport_ops, 2, NULL);
+
+       /*
+        * PORT 00E0-00EF are 'motherboard specific' so we use them for
+        * our internal debugging purposes.
+        */
+       ioport__register(IOPORT_DBG, &debug_ops, 1, NULL);
+
+       /* PORT 00ED - DUMMY PORT FOR DELAY??? */
+       ioport__register(0x00ED, &dummy_write_only_ioport_ops, 1, NULL);
+
+       /* 0x00F0 - 0x00FF - Math co-processor */
+       ioport__register(0x00F0, &dummy_write_only_ioport_ops, 2, NULL);
+
+       /* PORT 03D4-03D5 - COLOR VIDEO - CRT CONTROL REGISTERS */
+       ioport__register(0x03D4, &dummy_read_write_ioport_ops, 1, NULL);
+       ioport__register(0x03D5, &dummy_write_only_ioport_ops, 1, NULL);
+
+       ioport__register(0x402, &seabios_debug_ops, 1, NULL);
+}
diff --git a/tools/kvm/x86/irq.c b/tools/kvm/x86/irq.c
new file mode 100644 (file)
index 0000000..e83df99
--- /dev/null
@@ -0,0 +1,222 @@
+#include "kvm/irq.h"
+#include "kvm/kvm.h"
+#include "kvm/util.h"
+
+#include <linux/types.h>
+#include <linux/rbtree.h>
+#include <linux/list.h>
+#include <linux/kvm.h>
+#include <sys/ioctl.h>
+
+#include <stddef.h>
+#include <stdlib.h>
+
+#define IRQ_MAX_GSI                    64
+#define IRQCHIP_MASTER                 0
+#define IRQCHIP_SLAVE                  1
+#define IRQCHIP_IOAPIC                 2
+
+static u8              next_line       = 5;
+static u8              next_dev        = 1;
+static struct rb_root  pci_tree        = RB_ROOT;
+
+/* First 24 GSIs are routed between IRQCHIPs and IOAPICs */
+static u32 gsi = 24;
+
+struct kvm_irq_routing *irq_routing;
+
+static int irq__add_routing(u32 gsi, u32 type, u32 irqchip, u32 pin)
+{
+       if (gsi >= IRQ_MAX_GSI)
+               return -ENOSPC;
+
+       irq_routing->entries[irq_routing->nr++] =
+               (struct kvm_irq_routing_entry) {
+                       .gsi = gsi,
+                       .type = type,
+                       .u.irqchip.irqchip = irqchip,
+                       .u.irqchip.pin = pin,
+               };
+
+       return 0;
+}
+
+static struct pci_dev *search(struct rb_root *root, u32 id)
+{
+       struct rb_node *node = root->rb_node;
+
+       while (node) {
+               struct pci_dev *data = rb_entry(node, struct pci_dev, node);
+               int result;
+
+               result = id - data->id;
+
+               if (result < 0)
+                       node = node->rb_left;
+               else if (result > 0)
+                       node = node->rb_right;
+               else
+                       return data;
+       }
+       return NULL;
+}
+
+static int insert(struct rb_root *root, struct pci_dev *data)
+{
+       struct rb_node **new = &(root->rb_node), *parent = NULL;
+
+       /* Figure out where to put new node */
+       while (*new) {
+               struct pci_dev *this    = container_of(*new, struct pci_dev, node);
+               int result              = data->id - this->id;
+
+               parent = *new;
+               if (result < 0)
+                       new = &((*new)->rb_left);
+               else if (result > 0)
+                       new = &((*new)->rb_right);
+               else
+                       return -EEXIST;
+       }
+
+       /* Add new node and rebalance tree. */
+       rb_link_node(&data->node, parent, new);
+       rb_insert_color(&data->node, root);
+
+       return 0;
+}
+
+int irq__register_device(u32 dev, u8 *num, u8 *pin, u8 *line)
+{
+       struct pci_dev *node;
+       struct irq_line *new;
+       int r;
+
+       node = search(&pci_tree, dev);
+
+       if (!node) {
+               /* We haven't found a node - first device of its kind */
+               node = malloc(sizeof(*node));
+               if (node == NULL)
+                       return -ENOMEM;
+
+               *node = (struct pci_dev) {
+                       .id     = dev,
+                       /*
+                        * PCI supports only INTA#,B#,C#,D# per device.
+                        * A#,B#,C#,D# are allowed for multifunctional
+                        * devices so stick with A# for our single
+                        * function devices.
+                        */
+                       .pin    = 1,
+               };
+
+               INIT_LIST_HEAD(&node->lines);
+
+               r = insert(&pci_tree, node);
+               if (r) {
+                       free(node);
+                       return r;
+               }
+       }
+
+       /* The node has a pin assigned; give out a new line and device id */
+       new = malloc(sizeof(*new));
+       if (new == NULL)
+               return -ENOMEM;
+
+       new->line       = next_line++;
+       *line           = new->line;
+       *pin            = node->pin;
+       *num            = next_dev++;
+
+       list_add(&new->node, &node->lines);
+
+       return 0;
+}
+
+int irq__init(struct kvm *kvm)
+{
+       int i, r;
+
+       irq_routing = calloc(sizeof(struct kvm_irq_routing) +
+                       IRQ_MAX_GSI * sizeof(struct kvm_irq_routing_entry), 1);
+       if (irq_routing == NULL)
+               return -ENOMEM;
+
+       /* Hook first 8 GSIs to master IRQCHIP */
+       for (i = 0; i < 8; i++)
+               if (i != 2)
+                       irq__add_routing(i, KVM_IRQ_ROUTING_IRQCHIP, IRQCHIP_MASTER, i);
+
+       /* Hook next 8 GSIs to slave IRQCHIP */
+       for (i = 8; i < 16; i++)
+               irq__add_routing(i, KVM_IRQ_ROUTING_IRQCHIP, IRQCHIP_SLAVE, i - 8);
+
+       /* Last but not least, IOAPIC */
+       for (i = 0; i < 24; i++) {
+               if (i == 0)
+                       irq__add_routing(i, KVM_IRQ_ROUTING_IRQCHIP, IRQCHIP_IOAPIC, 2);
+               else if (i != 2)
+                       irq__add_routing(i, KVM_IRQ_ROUTING_IRQCHIP, IRQCHIP_IOAPIC, i);
+       }
+
+       r = ioctl(kvm->vm_fd, KVM_SET_GSI_ROUTING, irq_routing);
+       if (r) {
+               free(irq_routing);
+               return -errno;
+       }
+
+       return 0;
+}
+
+int irq__exit(struct kvm *kvm)
+{
+       struct rb_node *ent;
+
+       free(irq_routing);
+
+       while ((ent = rb_first(&pci_tree))) {
+               struct pci_dev *dev;
+               struct irq_line *line;
+
+               dev = rb_entry(ent, struct pci_dev, node);
+               while (!list_empty(&dev->lines)) {
+                       line = list_first_entry(&dev->lines, struct irq_line, node);
+                       list_del(&line->node);
+                       free(line);
+               }
+               rb_erase(&dev->node, &pci_tree);
+               free(dev);
+       }
+
+       return 0;
+}
+
+int irq__add_msix_route(struct kvm *kvm, struct msi_msg *msg)
+{
+       int r;
+
+       irq_routing->entries[irq_routing->nr++] =
+               (struct kvm_irq_routing_entry) {
+                       .gsi = gsi,
+                       .type = KVM_IRQ_ROUTING_MSI,
+                       .u.msi.address_hi = msg->address_hi,
+                       .u.msi.address_lo = msg->address_lo,
+                       .u.msi.data = msg->data,
+               };
+
+       r = ioctl(kvm->vm_fd, KVM_SET_GSI_ROUTING, irq_routing);
+       if (r)
+               return r;
+
+       return gsi++;
+}
+
+struct rb_node *irq__get_pci_tree(void)
+{
+       return rb_first(&pci_tree);
+}
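
Putting irq__init()'s three loops together, the default routing table looks
like this (a sketch of the resulting mapping, before any MSI entries):

        /* GSI 0     -> master PIC pin 0 and IOAPIC pin 2 (ISA timer override)
         * GSI 1,3-7 -> master PIC pins 1,3-7 and IOAPIC pins 1,3-7
         * GSI 8-15  -> slave PIC pins 0-7 and IOAPIC pins 8-15
         * GSI 16-23 -> IOAPIC pins 16-23 only
         * GSI 2     -> unused (PIC cascade); GSI 24 and up are handed out
         *              by irq__add_msix_route() as KVM_IRQ_ROUTING_MSI
         *              entries
         */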
diff --git a/tools/kvm/x86/kvm-cpu.c b/tools/kvm/x86/kvm-cpu.c
new file mode 100644 (file)
index 0000000..2b3d973
--- /dev/null
@@ -0,0 +1,425 @@
+#include "kvm/kvm-cpu.h"
+
+#include "kvm/symbol.h"
+#include "kvm/util.h"
+#include "kvm/kvm.h"
+
+#include <asm/msr-index.h>
+#include <asm/apicdef.h>
+#include <linux/err.h>
+#include <sys/ioctl.h>
+#include <sys/mman.h>
+#include <signal.h>
+#include <stdlib.h>
+#include <string.h>
+#include <errno.h>
+#include <stdio.h>
+
+static int debug_fd;
+
+void kvm_cpu__set_debug_fd(int fd)
+{
+       debug_fd = fd;
+}
+
+int kvm_cpu__get_debug_fd(void)
+{
+       return debug_fd;
+}
+
+static inline bool is_in_protected_mode(struct kvm_cpu *vcpu)
+{
+       return vcpu->sregs.cr0 & 0x01;
+}
+
+static inline u64 ip_to_flat(struct kvm_cpu *vcpu, u64 ip)
+{
+       u64 cs;
+
+       /*
+        * NOTE! We should take the code segment base address into account
+        * here. Luckily it's usually zero because Linux uses a flat memory
+        * model.
+        */
+       if (is_in_protected_mode(vcpu))
+               return ip;
+
+       cs = vcpu->sregs.cs.selector;
+
+       return ip + (cs << 4);
+}
+
+static inline u32 selector_to_base(u16 selector)
+{
+       /*
+        * KVM on Intel requires 'base' to be 'selector * 16' in real mode.
+        */
+       return (u32)selector << 4;
+}
+
+static struct kvm_cpu *kvm_cpu__new(struct kvm *kvm)
+{
+       struct kvm_cpu *vcpu;
+
+       vcpu = calloc(1, sizeof(*vcpu));
+       if (!vcpu)
+               return NULL;
+
+       vcpu->kvm = kvm;
+
+       return vcpu;
+}
+
+void kvm_cpu__delete(struct kvm_cpu *vcpu)
+{
+       if (vcpu->msrs)
+               free(vcpu->msrs);
+
+       free(vcpu);
+}
+
+static int kvm_cpu__set_lint(struct kvm_cpu *vcpu)
+{
+       struct local_apic lapic;
+
+       if (ioctl(vcpu->vcpu_fd, KVM_GET_LAPIC, &lapic))
+               return -1;
+
+       lapic.lvt_lint0.delivery_mode = APIC_MODE_EXTINT;
+       lapic.lvt_lint1.delivery_mode = APIC_MODE_NMI;
+
+       return ioctl(vcpu->vcpu_fd, KVM_SET_LAPIC, &lapic);
+}
+
+struct kvm_cpu *kvm_cpu__init(struct kvm *kvm, unsigned long cpu_id)
+{
+       struct kvm_cpu *vcpu;
+       int mmap_size;
+       int coalesced_offset;
+
+       vcpu = kvm_cpu__new(kvm);
+       if (!vcpu)
+               return NULL;
+
+       vcpu->cpu_id = cpu_id;
+
+       vcpu->vcpu_fd = ioctl(vcpu->kvm->vm_fd, KVM_CREATE_VCPU, cpu_id);
+       if (vcpu->vcpu_fd < 0)
+               die_perror("KVM_CREATE_VCPU ioctl");
+
+       mmap_size = ioctl(vcpu->kvm->sys_fd, KVM_GET_VCPU_MMAP_SIZE, 0);
+       if (mmap_size < 0)
+               die_perror("KVM_GET_VCPU_MMAP_SIZE ioctl");
+
+       vcpu->kvm_run = mmap(NULL, mmap_size, PROT_RW, MAP_SHARED, vcpu->vcpu_fd, 0);
+       if (vcpu->kvm_run == MAP_FAILED)
+               die("unable to mmap vcpu fd");
+
+       coalesced_offset = ioctl(kvm->sys_fd, KVM_CHECK_EXTENSION, KVM_CAP_COALESCED_MMIO);
+       if (coalesced_offset)
+               vcpu->ring = (void *)vcpu->kvm_run + (coalesced_offset * PAGE_SIZE);
+
+       if (kvm_cpu__set_lint(vcpu))
+               die_perror("KVM_SET_LAPIC failed");
+
+       vcpu->is_running = true;
+
+       return vcpu;
+}
+
+static struct kvm_msrs *kvm_msrs__new(size_t nmsrs)
+{
+       struct kvm_msrs *vcpu = calloc(1, sizeof(*vcpu) + (sizeof(struct kvm_msr_entry) * nmsrs));
+
+       if (!vcpu)
+               die("out of memory");
+
+       return vcpu;
+}
+
+#define KVM_MSR_ENTRY(_index, _data)   \
+       (struct kvm_msr_entry) { .index = _index, .data = _data }
+
+static void kvm_cpu__setup_msrs(struct kvm_cpu *vcpu)
+{
+       unsigned long ndx = 0;
+
+       vcpu->msrs = kvm_msrs__new(100);
+
+       vcpu->msrs->entries[ndx++] = KVM_MSR_ENTRY(MSR_IA32_SYSENTER_CS,        0x0);
+       vcpu->msrs->entries[ndx++] = KVM_MSR_ENTRY(MSR_IA32_SYSENTER_ESP,       0x0);
+       vcpu->msrs->entries[ndx++] = KVM_MSR_ENTRY(MSR_IA32_SYSENTER_EIP,       0x0);
+#ifdef CONFIG_X86_64
+       vcpu->msrs->entries[ndx++] = KVM_MSR_ENTRY(MSR_STAR,                    0x0);
+       vcpu->msrs->entries[ndx++] = KVM_MSR_ENTRY(MSR_CSTAR,                   0x0);
+       vcpu->msrs->entries[ndx++] = KVM_MSR_ENTRY(MSR_KERNEL_GS_BASE,          0x0);
+       vcpu->msrs->entries[ndx++] = KVM_MSR_ENTRY(MSR_SYSCALL_MASK,            0x0);
+       vcpu->msrs->entries[ndx++] = KVM_MSR_ENTRY(MSR_LSTAR,                   0x0);
+#endif
+       vcpu->msrs->entries[ndx++] = KVM_MSR_ENTRY(MSR_IA32_TSC,                0x0);
+       vcpu->msrs->entries[ndx++] = KVM_MSR_ENTRY(MSR_IA32_MISC_ENABLE,
+                                               MSR_IA32_MISC_ENABLE_FAST_STRING);
+
+       vcpu->msrs->nmsrs = ndx;
+
+       if (ioctl(vcpu->vcpu_fd, KVM_SET_MSRS, vcpu->msrs) < 0)
+               die_perror("KVM_SET_MSRS failed");
+}
+
+static void kvm_cpu__setup_fpu(struct kvm_cpu *vcpu)
+{
+       vcpu->fpu = (struct kvm_fpu) {
+               .fcw    = 0x37f,
+               .mxcsr  = 0x1f80,
+       };
+
+       if (ioctl(vcpu->vcpu_fd, KVM_SET_FPU, &vcpu->fpu) < 0)
+               die_perror("KVM_SET_FPU failed");
+}
+
+static void kvm_cpu__setup_regs(struct kvm_cpu *vcpu)
+{
+       vcpu->regs = (struct kvm_regs) {
+               /* We start the guest in 16-bit real mode  */
+               .rflags = 0x0000000000000002ULL,
+
+               .rip    = vcpu->kvm->boot_ip,
+               .rsp    = vcpu->kvm->boot_sp,
+               .rbp    = vcpu->kvm->boot_sp,
+       };
+
+       if (vcpu->regs.rip > USHRT_MAX)
+               die("ip 0x%llx is too high for real mode", (u64)vcpu->regs.rip);
+
+       if (ioctl(vcpu->vcpu_fd, KVM_SET_REGS, &vcpu->regs) < 0)
+               die_perror("KVM_SET_REGS failed");
+}
+
+static void kvm_cpu__setup_sregs(struct kvm_cpu *vcpu)
+{
+       if (ioctl(vcpu->vcpu_fd, KVM_GET_SREGS, &vcpu->sregs) < 0)
+               die_perror("KVM_GET_SREGS failed");
+
+       vcpu->sregs.cs.selector = vcpu->kvm->boot_selector;
+       vcpu->sregs.cs.base     = selector_to_base(vcpu->kvm->boot_selector);
+       vcpu->sregs.ss.selector = vcpu->kvm->boot_selector;
+       vcpu->sregs.ss.base     = selector_to_base(vcpu->kvm->boot_selector);
+       vcpu->sregs.ds.selector = vcpu->kvm->boot_selector;
+       vcpu->sregs.ds.base     = selector_to_base(vcpu->kvm->boot_selector);
+       vcpu->sregs.es.selector = vcpu->kvm->boot_selector;
+       vcpu->sregs.es.base     = selector_to_base(vcpu->kvm->boot_selector);
+       vcpu->sregs.fs.selector = vcpu->kvm->boot_selector;
+       vcpu->sregs.fs.base     = selector_to_base(vcpu->kvm->boot_selector);
+       vcpu->sregs.gs.selector = vcpu->kvm->boot_selector;
+       vcpu->sregs.gs.base     = selector_to_base(vcpu->kvm->boot_selector);
+
+       if (ioctl(vcpu->vcpu_fd, KVM_SET_SREGS, &vcpu->sregs) < 0)
+               die_perror("KVM_SET_SREGS failed");
+}
+
+/**
+ * kvm_cpu__reset_vcpu - reset virtual CPU to a known state
+ */
+void kvm_cpu__reset_vcpu(struct kvm_cpu *vcpu)
+{
+       kvm_cpu__setup_cpuid(vcpu);
+       kvm_cpu__setup_sregs(vcpu);
+       kvm_cpu__setup_regs(vcpu);
+       kvm_cpu__setup_fpu(vcpu);
+       kvm_cpu__setup_msrs(vcpu);
+}
+
+bool kvm_cpu__handle_exit(struct kvm_cpu *vcpu)
+{
+       return false;
+}
+
+static void print_dtable(const char *name, struct kvm_dtable *dtable)
+{
+       dprintf(debug_fd, " %s                 %016llx  %08hx\n",
+               name, (u64) dtable->base, (u16) dtable->limit);
+}
+
+static void print_segment(const char *name, struct kvm_segment *seg)
+{
+       dprintf(debug_fd, " %s       %04hx      %016llx  %08x  %02hhx    %x %x   %x  %x %x %x %x\n",
+               name, (u16) seg->selector, (u64) seg->base, (u32) seg->limit,
+               (u8) seg->type, seg->present, seg->dpl, seg->db, seg->s, seg->l, seg->g, seg->avl);
+}
+
+void kvm_cpu__show_registers(struct kvm_cpu *vcpu)
+{
+       unsigned long cr0, cr2, cr3;
+       unsigned long cr4, cr8;
+       unsigned long rax, rbx, rcx;
+       unsigned long rdx, rsi, rdi;
+       unsigned long rbp,  r8,  r9;
+       unsigned long r10, r11, r12;
+       unsigned long r13, r14, r15;
+       unsigned long rip, rsp;
+       struct kvm_sregs sregs;
+       unsigned long rflags;
+       struct kvm_regs regs;
+       int i;
+
+       if (ioctl(vcpu->vcpu_fd, KVM_GET_REGS, &regs) < 0)
+               die("KVM_GET_REGS failed");
+
+       rflags = regs.rflags;
+
+       rip = regs.rip; rsp = regs.rsp;
+       rax = regs.rax; rbx = regs.rbx; rcx = regs.rcx;
+       rdx = regs.rdx; rsi = regs.rsi; rdi = regs.rdi;
+       rbp = regs.rbp; r8  = regs.r8;  r9  = regs.r9;
+       r10 = regs.r10; r11 = regs.r11; r12 = regs.r12;
+       r13 = regs.r13; r14 = regs.r14; r15 = regs.r15;
+
+       dprintf(debug_fd, "\n Registers:\n");
+       dprintf(debug_fd,   " ----------\n");
+       dprintf(debug_fd, " rip: %016lx   rsp: %016lx flags: %016lx\n", rip, rsp, rflags);
+       dprintf(debug_fd, " rax: %016lx   rbx: %016lx   rcx: %016lx\n", rax, rbx, rcx);
+       dprintf(debug_fd, " rdx: %016lx   rsi: %016lx   rdi: %016lx\n", rdx, rsi, rdi);
+       dprintf(debug_fd, " rbp: %016lx    r8: %016lx    r9: %016lx\n", rbp, r8,  r9);
+       dprintf(debug_fd, " r10: %016lx   r11: %016lx   r12: %016lx\n", r10, r11, r12);
+       dprintf(debug_fd, " r13: %016lx   r14: %016lx   r15: %016lx\n", r13, r14, r15);
+
+       if (ioctl(vcpu->vcpu_fd, KVM_GET_SREGS, &sregs) < 0)
+               die("KVM_GET_REGS failed");
+
+       cr0 = sregs.cr0; cr2 = sregs.cr2; cr3 = sregs.cr3;
+       cr4 = sregs.cr4; cr8 = sregs.cr8;
+
+       dprintf(debug_fd, " cr0: %016lx   cr2: %016lx   cr3: %016lx\n", cr0, cr2, cr3);
+       dprintf(debug_fd, " cr4: %016lx   cr8: %016lx\n", cr4, cr8);
+       dprintf(debug_fd, "\n Segment registers:\n");
+       dprintf(debug_fd,   " ------------------\n");
+       dprintf(debug_fd, " register  selector  base              limit     type  p dpl db s l g avl\n");
+       print_segment("cs ", &sregs.cs);
+       print_segment("ss ", &sregs.ss);
+       print_segment("ds ", &sregs.ds);
+       print_segment("es ", &sregs.es);
+       print_segment("fs ", &sregs.fs);
+       print_segment("gs ", &sregs.gs);
+       print_segment("tr ", &sregs.tr);
+       print_segment("ldt", &sregs.ldt);
+       print_dtable("gdt", &sregs.gdt);
+       print_dtable("idt", &sregs.idt);
+
+       dprintf(debug_fd, "\n APIC:\n");
+       dprintf(debug_fd,   " -----\n");
+       dprintf(debug_fd, " efer: %016llx  apic base: %016llx  nmi: %s\n",
+               (u64) sregs.efer, (u64) sregs.apic_base,
+               (vcpu->kvm->nmi_disabled ? "disabled" : "enabled"));
+
+       dprintf(debug_fd, "\n Interrupt bitmap:\n");
+       dprintf(debug_fd,   " -----------------\n");
+       for (i = 0; i < (KVM_NR_INTERRUPTS + 63) / 64; i++)
+               dprintf(debug_fd, " %016llx", (u64) sregs.interrupt_bitmap[i]);
+       dprintf(debug_fd, "\n");
+}
+
+#define MAX_SYM_LEN 128
+
+void kvm_cpu__show_code(struct kvm_cpu *vcpu)
+{
+       unsigned int code_bytes = 64;
+       unsigned int code_prologue = 43;
+       unsigned int code_len = code_bytes;
+       char sym[MAX_SYM_LEN] = SYMBOL_DEFAULT_UNKNOWN, *psym;
+       unsigned char c;
+       unsigned int i;
+       u8 *ip;
+
+       if (ioctl(vcpu->vcpu_fd, KVM_GET_REGS, &vcpu->regs) < 0)
+               die("KVM_GET_REGS failed");
+
+       if (ioctl(vcpu->vcpu_fd, KVM_GET_SREGS, &vcpu->sregs) < 0)
+               die("KVM_GET_SREGS failed");
+
+       ip = guest_flat_to_host(vcpu->kvm, ip_to_flat(vcpu, vcpu->regs.rip) - code_prologue);
+
+       dprintf(debug_fd, "\n Code:\n");
+       dprintf(debug_fd,   " -----\n");
+
+       psym = symbol_lookup(vcpu->kvm, vcpu->regs.rip, sym, MAX_SYM_LEN);
+       if (IS_ERR(psym))
+               dprintf(debug_fd,
+                       "Warning: symbol_lookup() failed to find symbol "
+                       "with error: %ld\n", PTR_ERR(psym));
+
+       dprintf(debug_fd, " rip: [<%016lx>] %s\n\n", (unsigned long) vcpu->regs.rip, sym);
+
+       for (i = 0; i < code_len; i++, ip++) {
+               if (!host_ptr_in_ram(vcpu->kvm, ip))
+                       break;
+
+               c = *ip;
+
+               if (ip == guest_flat_to_host(vcpu->kvm, ip_to_flat(vcpu, vcpu->regs.rip)))
+                       dprintf(debug_fd, " <%02x>", c);
+               else
+                       dprintf(debug_fd, " %02x", c);
+       }
+
+       dprintf(debug_fd, "\n");
+
+       dprintf(debug_fd, "\n Stack:\n");
+       dprintf(debug_fd,   " ------\n");
+       kvm__dump_mem(vcpu->kvm, vcpu->regs.rsp, 32);
+}
+
+void kvm_cpu__show_page_tables(struct kvm_cpu *vcpu)
+{
+       u64 *pte1;
+       u64 *pte2;
+       u64 *pte3;
+       u64 *pte4;
+
+       if (!is_in_protected_mode(vcpu))
+               return;
+
+       if (ioctl(vcpu->vcpu_fd, KVM_GET_SREGS, &vcpu->sregs) < 0)
+               die("KVM_GET_SREGS failed");
+
+       pte4 = guest_flat_to_host(vcpu->kvm, vcpu->sregs.cr3);
+       if (!host_ptr_in_ram(vcpu->kvm, pte4))
+               return;
+
+       pte3 = guest_flat_to_host(vcpu->kvm, (*pte4 & ~0xfff));
+       if (!host_ptr_in_ram(vcpu->kvm, pte3))
+               return;
+
+       pte2 = guest_flat_to_host(vcpu->kvm, (*pte3 & ~0xfff));
+       if (!host_ptr_in_ram(vcpu->kvm, pte2))
+               return;
+
+       pte1 = guest_flat_to_host(vcpu->kvm, (*pte2 & ~0xfff));
+       if (!host_ptr_in_ram(vcpu->kvm, pte1))
+               return;
+
+       dprintf(debug_fd, "Page Tables:\n");
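+       /* Bit 7 (PS) set in the level-2 entry means a 2MB page: no level 1. */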
+       if (*pte2 & (1 << 7))
+               dprintf(debug_fd, " pte4: %016llx   pte3: %016llx"
+                       "   pte2: %016llx\n",
+                       *pte4, *pte3, *pte2);
+       else
+               dprintf(debug_fd, " pte4: %016llx  pte3: %016llx   pte2: %016"
+                       "llx   pte1: %016llx\n",
+                       *pte4, *pte3, *pte2, *pte1);
+}
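+
+/*
+ * The walk above simply follows entry 0 at every level. A minimal sketch
+ * (unused here) of how a full walk would index each level from a guest
+ * virtual address, assuming 4KB-page x86-64 paging:
+ */
+static inline unsigned int pte_index(u64 vaddr, int level)
+{
+       /* Each level selects 9 bits above the 12-bit page offset. */
+       return (vaddr >> (12 + 9 * (level - 1))) & 0x1ff;
+}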
+
+void kvm_cpu__arch_nmi(struct kvm_cpu *cpu)
+{
+       struct kvm_lapic_state klapic;
+       struct local_apic *lapic = (void *)&klapic;
+
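+       /*
+        * Deliver the NMI through the local APIC's LINT1 pin: give up if we
+        * cannot read the LAPIC state, if the guest has masked LINT1, or if
+        * LINT1 is programmed for a delivery mode other than NMI.
+        */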
+       if (ioctl(cpu->vcpu_fd, KVM_GET_LAPIC, &klapic) != 0)
+               return;
+
+       if (lapic->lvt_lint1.mask)
+               return;
+
+       if (lapic->lvt_lint1.delivery_mode != APIC_MODE_NMI)
+               return;
+
+       ioctl(cpu->vcpu_fd, KVM_NMI);
+}
diff --git a/tools/kvm/x86/kvm.c b/tools/kvm/x86/kvm.c
new file mode 100644 (file)
index 0000000..0a40fd5
--- /dev/null
@@ -0,0 +1,372 @@
+#include "kvm/kvm.h"
+#include "kvm/boot-protocol.h"
+#include "kvm/cpufeature.h"
+#include "kvm/interrupt.h"
+#include "kvm/mptable.h"
+#include "kvm/util.h"
+#include "kvm/8250-serial.h"
+#include "kvm/virtio-console.h"
+
+#include <asm/bootparam.h>
+#include <linux/kvm.h>
+
+#include <sys/types.h>
+#include <sys/ioctl.h>
+#include <sys/mman.h>
+#include <sys/stat.h>
+#include <stdbool.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+#include <stdio.h>
+#include <fcntl.h>
+
+struct kvm_ext kvm_req_ext[] = {
+       { DEFINE_KVM_EXT(KVM_CAP_COALESCED_MMIO) },
+       { DEFINE_KVM_EXT(KVM_CAP_SET_TSS_ADDR) },
+       { DEFINE_KVM_EXT(KVM_CAP_PIT2) },
+       { DEFINE_KVM_EXT(KVM_CAP_USER_MEMORY) },
+       { DEFINE_KVM_EXT(KVM_CAP_IRQ_ROUTING) },
+       { DEFINE_KVM_EXT(KVM_CAP_IRQCHIP) },
+       { DEFINE_KVM_EXT(KVM_CAP_HLT) },
+       { DEFINE_KVM_EXT(KVM_CAP_IRQ_INJECT_STATUS) },
+       { DEFINE_KVM_EXT(KVM_CAP_EXT_CPUID) },
+       { 0, 0 }
+};
+
+bool kvm__arch_cpu_supports_vm(void)
+{
+       struct cpuid_regs regs;
+       u32 eax_base;
+       int feature;
+
+       regs    = (struct cpuid_regs) {
+               .eax            = 0x00,
+       };
+       host_cpuid(&regs);
+
+       switch (regs.ebx) {
+       case CPUID_VENDOR_INTEL_1:
+               eax_base        = 0x00;
+               feature         = KVM__X86_FEATURE_VMX;
+               break;
+
+       case CPUID_VENDOR_AMD_1:
+               eax_base        = 0x80000000;
+               feature         = KVM__X86_FEATURE_SVM;
+               break;
+
+       default:
+               return false;
+       }
+
+       regs    = (struct cpuid_regs) {
+               .eax            = eax_base,
+       };
+       host_cpuid(&regs);
+
+       if (regs.eax < eax_base + 0x01)
+               return false;
+
+       regs    = (struct cpuid_regs) {
+               .eax            = eax_base + 0x01
+       };
+       host_cpuid(&regs);
+
+       return regs.ecx & (1 << feature);
+}
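+
+/*
+ * For reference, the Intel path above ultimately tests CPUID.1:ECX bit 5
+ * (VMX) and the AMD path CPUID.0x80000001:ECX bit 2 (SVM); presumably
+ * KVM__X86_FEATURE_VMX/SVM encode those bit positions. A standalone
+ * sketch using GCC's <cpuid.h> (illustrative, not used by this file):
+ *
+ *     #include <cpuid.h>
+ *     static bool cpu_has_vmx(void)
+ *     {
+ *             unsigned int a, b, c, d;
+ *
+ *             if (!__get_cpuid(1, &a, &b, &c, &d))
+ *                     return false;
+ *             return c & (1u << 5);
+ *     }
+ */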
+
+/*
+ * Allocating RAM size bigger than 4GB requires us to leave a gap
+ * in the RAM which is used for PCI MMIO, hotplug, and unconfigured
+ * devices (see documentation of e820_setup_gap() for details).
+ *
+ * If we're required to initialize RAM bigger than 4GB, we will create
+ * a gap between 0xe0000000 and 0x100000000 in the guest physical address space.
+ */
+
+void kvm__init_ram(struct kvm *kvm)
+{
+       u64     phys_start, phys_size;
+       void    *host_mem;
+
+       if (kvm->ram_size < KVM_32BIT_GAP_START) {
+               /* Use a single block of RAM for 32bit RAM */
+
+               phys_start = 0;
+               phys_size  = kvm->ram_size;
+               host_mem   = kvm->ram_start;
+
+               kvm__register_mem(kvm, phys_start, phys_size, host_mem);
+       } else {
+               /* First RAM range from zero to the PCI gap: */
+
+               phys_start = 0;
+               phys_size  = KVM_32BIT_GAP_START;
+               host_mem   = kvm->ram_start;
+
+               kvm__register_mem(kvm, phys_start, phys_size, host_mem);
+
+               /* Second RAM range from 4GB to the end of RAM: */
+
+               phys_start = KVM_32BIT_MAX_MEM_SIZE;
+               phys_size  = kvm->ram_size - phys_start;
+               host_mem   = kvm->ram_start + phys_start;
+
+               kvm__register_mem(kvm, phys_start, phys_size, host_mem);
+       }
+}
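+
+/*
+ * Worked example, assuming the names mean what they say (gap start at
+ * 0xe0000000, gap size 512MB): a guest configured with 6GB of RAM has
+ * kvm->ram_size = 6.5GB here, so it gets [0, 0xe0000000) backed by the
+ * start of the host mapping and [4GB, 6.5GB) backed by the mapping at
+ * offset 4GB -- 6GB usable in total, with [0xe0000000, 4GB) left free
+ * for PCI MMIO.
+ */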
+
+/* Arch-specific commandline setup */
+void kvm__arch_set_cmdline(char *cmdline, bool video)
+{
+       strcpy(cmdline, "noapic noacpi pci=conf1 reboot=k panic=1 i8042.direct=1 "
+                               "i8042.dumbkbd=1 i8042.nopnp=1");
+       if (video)
+               strcat(cmdline, " video=vesafb console=tty0");
+       else
+               strcat(cmdline, " console=ttyS0 earlyprintk=serial i8042.noaux=1");
+}
+
+/* Architecture-specific KVM init */
+void kvm__arch_init(struct kvm *kvm, const char *hugetlbfs_path, u64 ram_size)
+{
+       struct kvm_pit_config pit_config = { .flags = 0, };
+       int ret;
+
+       ret = ioctl(kvm->vm_fd, KVM_SET_TSS_ADDR, 0xfffbd000);
+       if (ret < 0)
+               die_perror("KVM_SET_TSS_ADDR ioctl");
+
+       ret = ioctl(kvm->vm_fd, KVM_CREATE_PIT2, &pit_config);
+       if (ret < 0)
+               die_perror("KVM_CREATE_PIT2 ioctl");
+
+       if (ram_size < KVM_32BIT_GAP_START) {
+               kvm->ram_size = ram_size;
+               kvm->ram_start = mmap_anon_or_hugetlbfs(kvm, hugetlbfs_path, ram_size);
+       } else {
+               kvm->ram_start = mmap_anon_or_hugetlbfs(kvm, hugetlbfs_path, ram_size + KVM_32BIT_GAP_SIZE);
+               kvm->ram_size = ram_size + KVM_32BIT_GAP_SIZE;
+               if (kvm->ram_start != MAP_FAILED)
+                       /*
+                        * We mprotect() the gap (see kvm__init_ram() for details) with
+                        * PROT_NONE so that if we accidentally write to it, we will know.
+                        */
+                       mprotect(kvm->ram_start + KVM_32BIT_GAP_START, KVM_32BIT_GAP_SIZE, PROT_NONE);
+       }
+       if (kvm->ram_start == MAP_FAILED)
+               die("out of memory");
+
+       madvise(kvm->ram_start, kvm->ram_size, MADV_MERGEABLE);
+
+       ret = ioctl(kvm->vm_fd, KVM_CREATE_IRQCHIP);
+       if (ret < 0)
+               die_perror("KVM_CREATE_IRQCHIP ioctl");
+}
+
+void kvm__arch_delete_ram(struct kvm *kvm)
+{
+       munmap(kvm->ram_start, kvm->ram_size);
+}
+
+void kvm__irq_line(struct kvm *kvm, int irq, int level)
+{
+       struct kvm_irq_level irq_level;
+
+       irq_level       = (struct kvm_irq_level) {
+               {
+                       .irq            = irq,
+               },
+               .level          = level,
+       };
+
+       if (ioctl(kvm->vm_fd, KVM_IRQ_LINE, &irq_level) < 0)
+               die_perror("KVM_IRQ_LINE failed");
+}
+
+void kvm__irq_trigger(struct kvm *kvm, int irq)
+{
+       kvm__irq_line(kvm, irq, 1);
+       kvm__irq_line(kvm, irq, 0);
+}
+
+#define BOOT_LOADER_SELECTOR   0x1000
+#define BOOT_LOADER_IP         0x0000
+#define BOOT_LOADER_SP         0x8000
+#define BOOT_CMDLINE_OFFSET    0x20000
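+
+/*
+ * Assuming guest_real_to_host() applies the usual real-mode translation
+ * (selector << 4) + offset, these defaults load the image at flat address
+ * 0x10000, start the boot stack top at 0x18000, and place the kernel
+ * command line at flat address 0x20000.
+ */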
+
+#define BOOT_PROTOCOL_REQUIRED 0x206
+#define LOAD_HIGH              0x01
+
+int load_flat_binary(struct kvm *kvm, int fd_kernel, int fd_initrd, const char *kernel_cmdline)
+{
+       void *p;
+       int nr;
+
+       /*
+        * Some architectures may support loading an initrd alongside the flat kernel,
+        * but we do not.
+        */
+       if (fd_initrd != -1)
+               pr_warning("Loading initrd with flat binary not supported.");
+
+       if (lseek(fd_kernel, 0, SEEK_SET) < 0)
+               die_perror("lseek");
+
+       p = guest_real_to_host(kvm, BOOT_LOADER_SELECTOR, BOOT_LOADER_IP);
+
+       while ((nr = read(fd_kernel, p, 65536)) > 0)
+               p += nr;
+
+       kvm->boot_selector      = BOOT_LOADER_SELECTOR;
+       kvm->boot_ip            = BOOT_LOADER_IP;
+       kvm->boot_sp            = BOOT_LOADER_SP;
+
+       return true;
+}
+
+static const char *BZIMAGE_MAGIC = "HdrS";
+
+bool load_bzimage(struct kvm *kvm, int fd_kernel,
+                 int fd_initrd, const char *kernel_cmdline, u16 vidmode)
+{
+       struct boot_params *kern_boot;
+       unsigned long setup_sects;
+       struct boot_params boot;
+       size_t cmdline_size;
+       ssize_t setup_size;
+       void *p;
+       int nr;
+
+       /*
+        * See Documentation/x86/boot.txt for details on the bzImage on-disk and
+        * memory layout.
+        */
+
+       if (lseek(fd_kernel, 0, SEEK_SET) < 0)
+               die_perror("lseek");
+
+       if (read(fd_kernel, &boot, sizeof(boot)) != sizeof(boot))
+               return false;
+
+       if (memcmp(&boot.hdr.header, BZIMAGE_MAGIC, strlen(BZIMAGE_MAGIC)))
+               return false;
+
+       if (boot.hdr.version < BOOT_PROTOCOL_REQUIRED)
+               die("Kernel too old");
+
+       if (lseek(fd_kernel, 0, SEEK_SET) < 0)
+               die_perror("lseek");
+
+       if (!boot.hdr.setup_sects)
+               boot.hdr.setup_sects = BZ_DEFAULT_SETUP_SECTS;
+       setup_sects = boot.hdr.setup_sects + 1;
+
+       setup_size = setup_sects << 9;
+       p = guest_real_to_host(kvm, BOOT_LOADER_SELECTOR, BOOT_LOADER_IP);
+
+       /* copy setup.bin to memory */
+       if (read(fd_kernel, p, setup_size) != setup_size)
+               die_perror("read");
+
+       /* copy vmlinux.bin to BZ_KERNEL_START */
+       p = guest_flat_to_host(kvm, BZ_KERNEL_START);
+
+       while ((nr = read(fd_kernel, p, 65536)) > 0)
+               p += nr;
+
+       p = guest_flat_to_host(kvm, BOOT_CMDLINE_OFFSET);
+       if (kernel_cmdline) {
+               cmdline_size = strlen(kernel_cmdline) + 1;
+               if (cmdline_size > boot.hdr.cmdline_size)
+                       cmdline_size = boot.hdr.cmdline_size;
+
+               memset(p, 0, boot.hdr.cmdline_size);
+               memcpy(p, kernel_cmdline, cmdline_size - 1);
+       }
+
+       kern_boot       = guest_real_to_host(kvm, BOOT_LOADER_SELECTOR, 0x00);
+
+       kern_boot->hdr.cmd_line_ptr     = BOOT_CMDLINE_OFFSET;
+       kern_boot->hdr.type_of_loader   = 0xff;
+       kern_boot->hdr.heap_end_ptr     = 0xfe00;
+       kern_boot->hdr.loadflags        |= CAN_USE_HEAP;
+       kern_boot->hdr.vid_mode         = vidmode;
+
+       /*
+        * Read initrd image into guest memory
+        */
+       if (fd_initrd >= 0) {
+               struct stat initrd_stat;
+               unsigned long addr;
+
+               if (fstat(fd_initrd, &initrd_stat))
+                       die_perror("fstat");
+
+               addr = boot.hdr.initrd_addr_max & ~0xfffff;
+               for (;;) {
+                       if (addr < BZ_KERNEL_START)
+                               die("Not enough memory for initrd");
+                       else if (addr < (kvm->ram_size - initrd_stat.st_size))
+                               break;
+                       addr -= 0x100000;
+               }
+
+               p = guest_flat_to_host(kvm, addr);
+               nr = read(fd_initrd, p, initrd_stat.st_size);
+               if (nr != initrd_stat.st_size)
+                       die("Failed to read initrd");
+
+               kern_boot->hdr.ramdisk_image    = addr;
+               kern_boot->hdr.ramdisk_size     = initrd_stat.st_size;
+       }
+
+       kvm->boot_selector = BOOT_LOADER_SELECTOR;
+       /*
+        * The real-mode setup code starts at offset 0x200 of a bzImage. See
+        * Documentation/x86/boot.txt for details.
+        */
+       kvm->boot_ip = BOOT_LOADER_IP + 0x200;
+       kvm->boot_sp = BOOT_LOADER_SP;
+
+       return true;
+}
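+
+/*
+ * Layout recap, per Documentation/x86/boot.txt: setup_sects counts the
+ * 512-byte sectors that follow the boot sector, so (setup_sects + 1) << 9
+ * covers boot sector plus setup code; with the legacy default of 4 setup
+ * sectors (which BZ_DEFAULT_SETUP_SECTS presumably encodes) that is
+ * 5 * 512 = 2560 bytes. boot_ip skips the first 0x200 bytes, the boot
+ * sector, to land directly on the real-mode setup entry point.
+ */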
+
+/**
+ * kvm__arch_setup_firmware - inject BIOS into guest system memory
+ * @kvm - guest system descriptor
+ *
+ * This is the main routine where we poke guest memory
+ * and install the BIOS there.
+ */
+int kvm__arch_setup_firmware(struct kvm *kvm)
+{
+       int r;
+
+       /* standard minimal configuration */
+       setup_bios(kvm);
+
+       /* FIXME: SMP, ACPI and friends here */
+
+       /* MP table */
+       r = mptable__init(kvm);
+
+       return r;
+}
+
+int kvm__arch_free_firmware(struct kvm *kvm)
+{
+       int r;
+
+       /* MP table */
+       r = mptable__exit(kvm);
+
+       return r;
+}
+
+void kvm__arch_periodic_poll(struct kvm *kvm)
+{
+       serial8250__update_consoles(kvm);
+       virtio_console__inject_interrupt(kvm);
+}
diff --git a/tools/kvm/x86/mptable.c b/tools/kvm/x86/mptable.c
new file mode 100644 (file)
index 0000000..12bdcf8
--- /dev/null
@@ -0,0 +1,287 @@
+#include "kvm/kvm.h"
+#include "kvm/bios.h"
+#include "kvm/apic.h"
+#include "kvm/mptable.h"
+#include "kvm/util.h"
+#include "kvm/irq.h"
+
+#include <linux/kernel.h>
+#include <string.h>
+
+#include <asm/mpspec_def.h>
+#include <linux/types.h>
+
+/*
+ * FIXME: please make sure the addresses borrowed
+ * for apic/ioapic never overlap! We need a global
+ * tracker of system resources (including io, mmio,
+ * and friends).
+ */
+
+static unsigned int mpf_checksum(unsigned char *mp, int len)
+{
+       unsigned int sum = 0;
+
+       while (len--)
+               sum += *mp++;
+
+       return sum & 0xFF;
+}
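+
+/*
+ * The MP spec requires each structure's bytes, checksum field included, to
+ * sum to 0 modulo 256 -- hence the negated sums stored further below. The
+ * matching verification would be (a sketch, unused in this file):
+ */
+static inline int mpf_checksum_ok(unsigned char *mp, int len)
+{
+       return mpf_checksum(mp, len) == 0;
+}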
+
+static unsigned int gen_cpu_flag(unsigned int cpu, unsigned int ncpu)
+{
+       /* Set the enabled/disabled and BSP/AP processor flags. */
+       return ((cpu < ncpu)  ? CPU_ENABLED       : 0x00) |
+               ((cpu == 0)   ? CPU_BOOTPROCESSOR : 0x00);
+}
+
+#define MPTABLE_SIG_FLOATING   "_MP_"
+#define MPTABLE_OEM            "KVMCPU00"
+#define MPTABLE_PRODUCTID      "0.1         "
+#define MPTABLE_PCIBUSTYPE     "PCI   "
+#define MPTABLE_ISABUSTYPE     "ISA   "
+
+#define MPTABLE_STRNCPY(d, s)  memcpy(d, s, sizeof(d))
+
+/* It should be more than enough */
+#define MPTABLE_MAX_SIZE       (32 << 20)
+
+/*
+ * Too many CPUs would require x2APIC mode and
+ * proper ACPI support, so we limit the count
+ * here for now.
+ */
+#define MPTABLE_MAX_CPUS       255
+
+static void mptable_add_irq_src(struct mpc_intsrc *mpc_intsrc,
+                               u16 srcbusid,   u16 srcbusirq,
+                               u16 dstapic,    u16 dstirq)
+{
+       *mpc_intsrc = (struct mpc_intsrc) {
+               .type           = MP_INTSRC,
+               .irqtype        = mp_INT,
+               .irqflag        = MP_IRQDIR_DEFAULT,
+               .srcbus         = srcbusid,
+               .srcbusirq      = srcbusirq,
+               .dstapic        = dstapic,
+               .dstirq         = dstirq
+       };
+}
+
+/**
+ * mptable__init - create the MP table and fill guest memory with it
+ */
+int mptable__init(struct kvm *kvm)
+{
+       unsigned long real_mpc_table, real_mpf_intel, size;
+       struct mpf_intel *mpf_intel;
+       struct mpc_table *mpc_table;
+       struct mpc_cpu *mpc_cpu;
+       struct mpc_bus *mpc_bus;
+       struct mpc_ioapic *mpc_ioapic;
+       struct mpc_intsrc *mpc_intsrc;
+       struct rb_node *pci_tree;
+
+       const int pcibusid = 0;
+       const int isabusid = 1;
+
+       unsigned int i, nentries = 0, ncpus = kvm->nrcpus;
+       unsigned int ioapicid;
+       void *last_addr;
+
+       /* That is where MP table will be in guest memory */
+       real_mpc_table = ALIGN(MB_BIOS_BEGIN + bios_rom_size, 16);
+
+       if (ncpus > MPTABLE_MAX_CPUS) {
+               pr_warning("Too many cpus: %d limited to %d",
+                       ncpus, MPTABLE_MAX_CPUS);
+               ncpus = MPTABLE_MAX_CPUS;
+       }
+
+       mpc_table = calloc(1, MPTABLE_MAX_SIZE);
+       if (!mpc_table)
+               return -ENOMEM;
+
+       MPTABLE_STRNCPY(mpc_table->signature,   MPC_SIGNATURE);
+       MPTABLE_STRNCPY(mpc_table->oem,         MPTABLE_OEM);
+       MPTABLE_STRNCPY(mpc_table->productid,   MPTABLE_PRODUCTID);
+
+       mpc_table->spec         = 4;
+       mpc_table->lapic        = APIC_ADDR(0);
+       mpc_table->oemcount     = ncpus; /* will be updated again at end */
+
+       /*
+        * CPU enumeration. Technically speaking we should ask
+        * either the host or the hypervisor for the supported
+        * APIC version, but for now we simply put a default
+        * value here.
+        */
+       mpc_cpu = (void *)&mpc_table[1];
+       for (i = 0; i < ncpus; i++) {
+               mpc_cpu->type           = MP_PROCESSOR;
+               mpc_cpu->apicid         = i;
+               mpc_cpu->apicver        = KVM_APIC_VERSION;
+               mpc_cpu->cpuflag        = gen_cpu_flag(i, ncpus);
+               mpc_cpu->cpufeature     = 0x600; /* some default value */
+               mpc_cpu->featureflag    = 0x201; /* some default value */
+               mpc_cpu++;
+       }
+
+       last_addr = (void *)mpc_cpu;
+       nentries += ncpus;
+
+       /*
+        * PCI buses.
+        * FIXME: add a callback here to obtain the real number
+        * of PCI buses present in the system.
+        */
+       mpc_bus         = last_addr;
+       mpc_bus->type   = MP_BUS;
+       mpc_bus->busid  = pcibusid;
+       MPTABLE_STRNCPY(mpc_bus->bustype, MPTABLE_PCIBUSTYPE);
+
+       last_addr = (void *)&mpc_bus[1];
+       nentries++;
+
+       /*
+        * ISA bus.
+        * FIXME: Same issue as for PCI bus.
+        */
+       mpc_bus         = last_addr;
+       mpc_bus->type   = MP_BUS;
+       mpc_bus->busid  = isabusid;
+       MPTABLE_STRNCPY(mpc_bus->bustype, MPTABLE_ISABUSTYPE);
+
+       last_addr = (void *)&mpc_bus[1];
+       nentries++;
+
+       /*
+        * IO-APIC chip.
+        */
+       ioapicid                = ncpus + 1;
+       mpc_ioapic              = last_addr;
+       mpc_ioapic->type        = MP_IOAPIC;
+       mpc_ioapic->apicid      = ioapicid;
+       mpc_ioapic->apicver     = KVM_APIC_VERSION;
+       mpc_ioapic->flags       = MPC_APIC_USABLE;
+       mpc_ioapic->apicaddr    = IOAPIC_ADDR(0);
+
+       last_addr = (void *)&mpc_ioapic[1];
+       nentries++;
+
+       /*
+        * IRQ sources.
+        *
+        * FIXME: Same issue as with buses. We definitely
+        * need some kind of collector routine which enumerates
+        * the resources in use first and passes them here.
+        * At the moment we know we have only a virtio block
+        * device and a virtio console, but this is fragile.
+        *
+        * Also note we use PCI IRQs here, none for the ISA bus yet.
+        */
+
+       for (pci_tree = irq__get_pci_tree(); pci_tree; pci_tree = rb_next(pci_tree)) {
+               struct pci_dev *dev = rb_entry(pci_tree, struct pci_dev, node);
+               struct irq_line *irq_line;
+
+               list_for_each_entry(irq_line, &dev->lines, node) {
+                       unsigned char srcbusirq;
+
+                       srcbusirq = (dev->id << 2) | (dev->pin - 1);
+
+                       mpc_intsrc = last_addr;
+
+                       mptable_add_irq_src(mpc_intsrc, pcibusid, srcbusirq, ioapicid, irq_line->line);
+                       last_addr = (void *)&mpc_intsrc[1];
+                       nentries++;
+               }
+       }
+
+       /*
+        * Local IRQs assignment (LINT0, LINT1)
+        */
+       mpc_intsrc              = last_addr;
+       mpc_intsrc->type        = MP_LINTSRC;
+       mpc_intsrc->irqtype     = mp_INT;
+       mpc_intsrc->irqflag     = MP_IRQDIR_DEFAULT;
+       mpc_intsrc->srcbus      = isabusid;
+       mpc_intsrc->srcbusirq   = 0;
+       mpc_intsrc->dstapic     = 0; /* FIXME: BSP apic */
+       mpc_intsrc->dstirq      = 0; /* LINT0 */
+
+       last_addr = (void *)&mpc_intsrc[1];
+       nentries++;
+
+       mpc_intsrc              = last_addr;
+       mpc_intsrc->type        = MP_LINTSRC;
+       mpc_intsrc->irqtype     = mp_NMI;
+       mpc_intsrc->irqflag     = MP_IRQDIR_DEFAULT;
+       mpc_intsrc->srcbus      = isabusid;
+       mpc_intsrc->srcbusirq   = 0;
+       mpc_intsrc->dstapic     = 0; /* FIXME: BSP apic */
+       mpc_intsrc->dstirq      = 1; /* LINT1 */
+
+       last_addr = (void *)&mpc_intsrc[1];
+       nentries++;
+
+       /*
+        * Finally, the floating MP pointer structure.
+        */
+       real_mpf_intel  = ALIGN((unsigned long)last_addr - (unsigned long)mpc_table, 16);
+       mpf_intel       = (void *)((unsigned long)mpc_table + real_mpf_intel);
+
+       MPTABLE_STRNCPY(mpf_intel->signature, MPTABLE_SIG_FLOATING);
+       mpf_intel->length       = 1;
+       mpf_intel->specification = 4;
+       mpf_intel->physptr      = (unsigned int)real_mpc_table;
+       mpf_intel->checksum     = -mpf_checksum((unsigned char *)mpf_intel, sizeof(*mpf_intel));
+
+       /*
+        * No last_addr increment here please, we need the last
+        * active position to compute the table size.
+        */
+
+       /*
+        * Don't forget to update the header in the fixed table.
+        */
+       mpc_table->oemcount     = nentries;
+       mpc_table->length       = last_addr - (void *)mpc_table;
+       mpc_table->checksum     = -mpf_checksum((unsigned char *)mpc_table, mpc_table->length);
+
+       /*
+        * We will copy the whole table, no need to separate the
+        * floating structure from the table itself.
+        */
+       size = (unsigned long)mpf_intel + sizeof(*mpf_intel) - (unsigned long)mpc_table;
+
+       /*
+        * The final check -- never step outside the system BIOS
+        * area. Let's also check for allocated memory overrun;
+        * admittedly late at this point, but still useful.
+        */
+
+       if (size > (unsigned long)(MB_BIOS_END - bios_rom_size) ||
+           size > MPTABLE_MAX_SIZE) {
+               free(mpc_table);
+               pr_err("MP table is too big");
+
+               return -E2BIG;
+       }
+
+       /*
+        * OK, it is time to move it to guest memory.
+        */
+       memcpy(guest_flat_to_host(kvm, real_mpc_table), mpc_table, size);
+
+       free(mpc_table);
+
+       return 0;
+}
+
+int mptable__exit(struct kvm *kvm)
+{
+       return 0;
+}
index 5f34aa371b5660c503cfca05e9e0731aba013ea7..b5b4d806ffa25dd32a0a30b4207c15e5c1e22598 100644 (file)
@@ -31,6 +31,7 @@
 #include <stdarg.h>
 #include <ctype.h>
 #include <errno.h>
+#include <stdint.h>
 
 #include "event-parse.h"
 #include "event-utils.h"
@@ -3485,7 +3486,7 @@ process_defined_func(struct trace_seq *s, void *data, int size,
                        if (!string->str)
                                die("malloc str");
 
-                       args[i] = (unsigned long long)string->str;
+                       args[i] = (uintptr_t)string->str;
                        strings = string;
                        trace_seq_destroy(&str);
                        break;
@@ -4685,9 +4686,8 @@ static int find_event_handle(struct pevent *pevent, struct event_format *event)
  *
  * /sys/kernel/debug/tracing/events/.../.../format
  */
-int pevent_parse_event(struct pevent *pevent,
-                      const char *buf, unsigned long size,
-                      const char *sys)
+enum pevent_errno pevent_parse_event(struct pevent *pevent, const char *buf,
+                                    unsigned long size, const char *sys)
 {
        struct event_format *event;
        int ret;
@@ -4696,17 +4696,16 @@ int pevent_parse_event(struct pevent *pevent,
 
        event = alloc_event();
        if (!event)
-               return -ENOMEM;
+               return PEVENT_ERRNO__MEM_ALLOC_FAILED;
 
        event->name = event_read_name();
        if (!event->name) {
                /* Bad event? */
-               free(event);
-               return -1;
+               ret = PEVENT_ERRNO__MEM_ALLOC_FAILED;
+               goto event_alloc_failed;
        }
 
        if (strcmp(sys, "ftrace") == 0) {
-
                event->flags |= EVENT_FL_ISFTRACE;
 
                if (strcmp(event->name, "bprint") == 0)
@@ -4714,20 +4713,28 @@ int pevent_parse_event(struct pevent *pevent,
        }
                
        event->id = event_read_id();
-       if (event->id < 0)
-               die("failed to read event id");
+       if (event->id < 0) {
+               ret = PEVENT_ERRNO__READ_ID_FAILED;
+               /*
+                * This isn't actually an allocation error,
+                * but as the ID is critical, just bail out.
+                */
+               goto event_alloc_failed;
+       }
 
        event->system = strdup(sys);
-       if (!event->system)
-               die("failed to allocate system");
+       if (!event->system) {
+               ret = PEVENT_ERRNO__MEM_ALLOC_FAILED;
+               goto event_alloc_failed;
+       }
 
        /* Add pevent to event so that it can be referenced */
        event->pevent = pevent;
 
        ret = event_read_format(event);
        if (ret < 0) {
-               do_warning("failed to read event format for %s", event->name);
-               goto event_failed;
+               ret = PEVENT_ERRNO__READ_FORMAT_FAILED;
+               goto event_parse_failed;
        }
 
        /*
@@ -4739,10 +4746,9 @@ int pevent_parse_event(struct pevent *pevent,
 
        ret = event_read_print(event);
        if (ret < 0) {
-               do_warning("failed to read event print fmt for %s",
-                          event->name);
                show_warning = 1;
-               goto event_failed;
+               ret = PEVENT_ERRNO__READ_PRINT_FAILED;
+               goto event_parse_failed;
        }
        show_warning = 1;
 
@@ -4753,20 +4759,19 @@ int pevent_parse_event(struct pevent *pevent,
                struct print_arg *arg, **list;
 
                /* old ftrace had no args */
-
                list = &event->print_fmt.args;
                for (field = event->format.fields; field; field = field->next) {
                        arg = alloc_arg();
-                       *list = arg;
-                       list = &arg->next;
                        arg->type = PRINT_FIELD;
                        arg->field.name = strdup(field->name);
                        if (!arg->field.name) {
-                               do_warning("failed to allocate field name");
                                event->flags |= EVENT_FL_FAILED;
-                               return -1;
+                               free_arg(arg);
+                               return PEVENT_ERRNO__OLD_FTRACE_ARG_FAILED;
                        }
                        arg->field.field = field;
+                       *list = arg;
+                       list = &arg->next;
                }
                return 0;
        }
@@ -4777,11 +4782,65 @@ int pevent_parse_event(struct pevent *pevent,
 
        return 0;
 
- event_failed:
+ event_parse_failed:
        event->flags |= EVENT_FL_FAILED;
        /* still add it even if it failed */
        add_event(pevent, event);
-       return -1;
+       return ret;
+
+ event_alloc_failed:
+       free(event->system);
+       free(event->name);
+       free(event);
+       return ret;
+}
+
+#undef _PE
+#define _PE(code, str) str
+static const char * const pevent_error_str[] = {
+       PEVENT_ERRORS
+};
+#undef _PE
+
+int pevent_strerror(struct pevent *pevent, enum pevent_errno errnum,
+                   char *buf, size_t buflen)
+{
+       int idx;
+       const char *msg;
+
+       if (errnum >= 0) {
+               msg = strerror_r(errnum, buf, buflen);
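+               /*
+                * GNU strerror_r() may return a pointer to a static string
+                * instead of filling @buf; in that case copy the message over
+                * (mempcpy() returns the byte just past the copy, so the
+                * final store NUL-terminates it).
+                */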
+               if (msg != buf) {
+                       size_t len = strlen(msg);
+                       char *c = mempcpy(buf, msg, min(buflen-1, len));
+                       *c = '\0';
+               }
+               return 0;
+       }
+
+       if (errnum <= __PEVENT_ERRNO__START ||
+           errnum >= __PEVENT_ERRNO__END)
+               return -1;
+
+       idx = errnum - __PEVENT_ERRNO__START - 1;
+       msg = pevent_error_str[idx];
+
+       switch (errnum) {
+       case PEVENT_ERRNO__MEM_ALLOC_FAILED:
+       case PEVENT_ERRNO__PARSE_EVENT_FAILED:
+       case PEVENT_ERRNO__READ_ID_FAILED:
+       case PEVENT_ERRNO__READ_FORMAT_FAILED:
+       case PEVENT_ERRNO__READ_PRINT_FAILED:
+       case PEVENT_ERRNO__OLD_FTRACE_ARG_FAILED:
+               snprintf(buf, buflen, "%s", msg);
+               break;
+
+       default:
+               /* cannot reach here */
+               break;
+       }
+
+       return 0;
 }
 
 int get_field_val(struct trace_seq *s, struct format_field *field,
index 5772ad8cb38646fbd5a6375b528ad710ff217b3f..527df038a25f9010b644c47f38a1b6f867953a74 100644 (file)
@@ -345,6 +345,34 @@ enum pevent_flag {
        PEVENT_NSEC_OUTPUT              = 1,    /* output in NSECS */
 };
 
+#define PEVENT_ERRORS                                                        \
+       _PE(MEM_ALLOC_FAILED,   "failed to allocate memory"),                 \
+       _PE(PARSE_EVENT_FAILED, "failed to parse event"),                     \
+       _PE(READ_ID_FAILED,     "failed to read event id"),                   \
+       _PE(READ_FORMAT_FAILED, "failed to read event format"),               \
+       _PE(READ_PRINT_FAILED,  "failed to read event print fmt"),            \
+       _PE(OLD_FTRACE_ARG_FAILED, "failed to allocate field name for ftrace")
+
+#undef _PE
+#define _PE(__code, __str) PEVENT_ERRNO__ ## __code
+enum pevent_errno {
+       PEVENT_ERRNO__SUCCESS                   = 0,
+
+       /*
+        * Choose a big, arbitrary negative number so as not to clash with
+        * standard errno, since SUS requires errno to use distinct positive values.
+        * See 'Issue 6' in the link below.
+        *
+        * http://pubs.opengroup.org/onlinepubs/9699919799/basedefs/errno.h.html
+        */
+       __PEVENT_ERRNO__START                   = -100000,
+
+       PEVENT_ERRORS,
+
+       __PEVENT_ERRNO__END,
+};
+#undef _PE
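+
+/*
+ * The _PE() definitions above form an X-macro: the single PEVENT_ERRORS
+ * list expands once here into enum constants and once in event-parse.c
+ * into the matching string table, so the two can never drift apart.
+ * The pattern in miniature (illustrative names only):
+ *
+ *     #define ERRORS _E(ALPHA, "alpha failed"), _E(BETA, "beta failed")
+ *
+ *     #define _E(code, str) ERR_##code
+ *     enum err { ERRORS };
+ *     #undef _E
+ *
+ *     #define _E(code, str) str
+ *     static const char * const err_str[] = { ERRORS };
+ *     #undef _E
+ */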
+
 struct cmdline;
 struct cmdline_list;
 struct func_map;
@@ -509,8 +537,8 @@ void pevent_print_event(struct pevent *pevent, struct trace_seq *s,
 int pevent_parse_header_page(struct pevent *pevent, char *buf, unsigned long size,
                             int long_size);
 
-int pevent_parse_event(struct pevent *pevent, const char *buf,
-                      unsigned long size, const char *sys);
+enum pevent_errno pevent_parse_event(struct pevent *pevent, const char *buf,
+                                    unsigned long size, const char *sys);
 
 void *pevent_get_field_raw(struct trace_seq *s, struct event_format *event,
                           const char *name, struct pevent_record *record,
@@ -561,6 +589,8 @@ int pevent_data_pid(struct pevent *pevent, struct pevent_record *rec);
 const char *pevent_data_comm_from_pid(struct pevent *pevent, int pid);
 void pevent_event_info(struct trace_seq *s, struct event_format *event,
                       struct pevent_record *record);
+int pevent_strerror(struct pevent *pevent, enum pevent_errno errnum,
+                   char *buf, size_t buflen);
 
 struct event_format **pevent_list_events(struct pevent *pevent, enum event_sort_type);
 struct format_field **pevent_event_common_fields(struct event_format *event);
index 08296383d1e69d2a45d49df818309354017b96a1..bc075006966eb69c9dc8f0fcc7a6d636da413cff 100644 (file)
@@ -39,6 +39,12 @@ void __vdie(const char *fmt, ...);
 void __vwarning(const char *fmt, ...);
 void __vpr_stat(const char *fmt, ...);
 
+#define min(x, y) ({                           \
+       typeof(x) _min1 = (x);                  \
+       typeof(y) _min2 = (y);                  \
+       (void) (&_min1 == &_min2);              \
+       _min1 < _min2 ? _min1 : _min2; })
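+
+/*
+ * Kernel-style min(): the statement expression evaluates each argument
+ * exactly once, and the otherwise pointless pointer comparison makes the
+ * compiler warn when x and y have incompatible types. Typical use:
+ *
+ *     n = min(buflen - 1, len);
+ */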
+
 static inline char *strim(char *string)
 {
        char *ret;
index ca600e09c8d47f05035763fbd32932294d508c08..9f2e44f2b17a724425a829c2509985b6c8bafc95 100644 (file)
@@ -195,10 +195,10 @@ install-pdf: pdf
 #install-html: html
 #      '$(SHELL_PATH_SQ)' ./install-webdoc.sh $(DESTDIR)$(htmldir)
 
-../PERF-VERSION-FILE: .FORCE-PERF-VERSION-FILE
-       $(QUIET_SUBDIR0)../ $(QUIET_SUBDIR1) PERF-VERSION-FILE
+$(OUTPUT)PERF-VERSION-FILE: .FORCE-PERF-VERSION-FILE
+       $(QUIET_SUBDIR0)../ $(QUIET_SUBDIR1) $(OUTPUT)PERF-VERSION-FILE
 
--include ../PERF-VERSION-FILE
+-include $(OUTPUT)PERF-VERSION-FILE
 
 #
 # Determine "include::" file references in asciidoc files.
diff --git a/tools/perf/Documentation/jit-interface.txt b/tools/perf/Documentation/jit-interface.txt
new file mode 100644 (file)
index 0000000..a8656f5
--- /dev/null
@@ -0,0 +1,15 @@
+perf supports a simple JIT interface to resolve symbols for dynamic code generated
+by a JIT.
+
+The JIT has to write a /tmp/perf-%d.map file (%d = pid of the process).
+
+This is a text file.
+
+Each line has the following format, fields separated with spaces:
+
+START SIZE symbolname
+
+START and SIZE are hex numbers without 0x.
+symbolname is the rest of the line, so it could contain special characters.
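+
+For example, a JIT running as pid 1234 would write /tmp/perf-1234.map
+containing lines like (hypothetical addresses and names):
+
+40000000 3f0 jitted_frobnicate
+40000400 1e0 jitted_helper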
+
+The ownership of the file has to match the process.
index ddc22525228da27db7964a0bbfbf5bf4ced41f55..d1e39dc8c81077071cc7e6a1976c0ef7028982e5 100644 (file)
@@ -15,24 +15,43 @@ DESCRIPTION
 This command displays the symbolic event types which can be selected in the
 various perf commands with the -e option.
 
+[[EVENT_MODIFIERS]]
 EVENT MODIFIERS
 ---------------
 
 Events can optionally have a modifier by appending a colon and one or
-more modifiers.  Modifiers allow the user to restrict when events are
-counted with 'u' for user-space, 'k' for kernel, 'h' for hypervisor.
-Additional modifiers are 'G' for guest counting (in KVM guests) and 'H'
-for host counting (not in KVM guests).
+more modifiers. Modifiers allow the user to restrict the events to be
+counted. The following modifiers exist:
+
+ u - user-space counting
+ k - kernel counting
+ h - hypervisor counting
+ G - guest counting (in KVM guests)
+ H - host counting (not in KVM guests)
+ p - precise level
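+
+ Modifiers can be combined; for example, cycles:uk counts in both user
+ and kernel mode, and cycles:G counts only while running in a KVM guest.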
 
 The 'p' modifier can be used for specifying how precise the instruction
-address should be. The 'p' modifier is currently only implemented for
-Intel PEBS and can be specified multiple times:
-  0 - SAMPLE_IP can have arbitrary skid
-  1 - SAMPLE_IP must have constant skid
-  2 - SAMPLE_IP requested to have 0 skid
-  3 - SAMPLE_IP must have 0 skid
+address should be. The 'p' modifier can be specified multiple times:
+
+ 0 - SAMPLE_IP can have arbitrary skid
+ 1 - SAMPLE_IP must have constant skid
+ 2 - SAMPLE_IP requested to have 0 skid
+ 3 - SAMPLE_IP must have 0 skid
+
+For Intel systems precise event sampling is implemented with PEBS
+which supports up to precise-level 2.
 
-The PEBS implementation now supports up to 2.
+On AMD systems it is implemented using IBS (up to precise-level 2).
+The precise modifier works with event types 0x76 (cpu-cycles, CPU
+clocks not halted) and 0xC1 (micro-ops retired). Both events map to
+IBS execution sampling (IBS op) with the IBS Op Counter Control bit
+(IbsOpCntCtl) set respectively (see AMD64 Architecture Programmer’s
+Manual Volume 2: System Programming, 13.3 Instruction-Based
+Sampling). Examples to use IBS:
+
+ perf record -a -e cpu-cycles:p ...    # use ibs op counting cycles
+ perf record -a -e r076:p ...          # same as -e cpu-cycles:p
+ perf record -a -e r0C1:p ...          # use ibs op counting micro-ops
 
 RAW HARDWARE EVENT DESCRIPTOR
 -----------------------------
@@ -44,6 +63,11 @@ layout of IA32_PERFEVTSELx MSRs (see [Intel® 64 and IA-32 Architectures Softwar
 of IA32_PERFEVTSELx MSRs) or AMD's PerfEvtSeln (see [AMD64 Architecture Programmer’s Manual Volume 2: System Programming], Page 344,
 Figure 13-7 Performance Event-Select Register (PerfEvtSeln)).
 
+Note: Only the following bit fields can be set in x86 counter
+registers: event, umask, edge, inv, cmask. In particular, guest/host-only
+and OS/user mode flags must be set up using <<EVENT_MODIFIERS, EVENT
+MODIFIERS>>.
+
 Example:
 
 If the Intel docs for a QM720 Core i7 describe an event as:
@@ -91,4 +115,4 @@ SEE ALSO
 linkperf:perf-stat[1], linkperf:perf-top[1],
 linkperf:perf-record[1],
 http://www.intel.com/Assets/PDF/manual/253669.pdf[Intel® 64 and IA-32 Architectures Software Developer's Manual Volume 3B: System Programming Guide],
-http://support.amd.com/us/Processor_TechDocs/24593.pdf[AMD64 Architecture Programmer’s Manual Volume 2: System Programming]
+http://support.amd.com/us/Processor_TechDocs/24593_APM_v2.pdf[AMD64 Architecture Programmer’s Manual Volume 2: System Programming]
index 3152cca15501b19f2ee4608a418e47c082420cfc..d00bef2313407f04b8acaa851edfd80ae007c167 100644 (file)
@@ -116,8 +116,8 @@ search path and 'use'ing a few support modules (see module
 descriptions below):
 
 ----
- use lib "$ENV{'PERF_EXEC_PATH'}/scripts/perl/perf-script-Util/lib";
- use lib "./perf-script-Util/lib";
+ use lib "$ENV{'PERF_EXEC_PATH'}/scripts/perl/Perf-Trace-Util/lib";
+ use lib "./Perf-Trace-Util/lib";
  use Perf::Trace::Core;
  use Perf::Trace::Context;
  use Perf::Trace::Util;
index 47102206911921be0d956e910d9efa230a65c3f3..a4027f221a535457c672f8e6d99ccb4fe1db0925 100644 (file)
@@ -129,7 +129,7 @@ import os
 import sys
 
 sys.path.append(os.environ['PERF_EXEC_PATH'] + \
-       '/scripts/python/perf-script-Util/lib/Perf/Trace')
+       '/scripts/python/Perf-Trace-Util/lib/Perf/Trace')
 
 from perf_trace_context import *
 from Core import *
@@ -216,7 +216,7 @@ import os
 import sys
 
 sys.path.append(os.environ['PERF_EXEC_PATH'] + \
-       '/scripts/python/perf-script-Util/lib/Perf/Trace')
+       '/scripts/python/Perf-Trace-Util/lib/Perf/Trace')
 
 from perf_trace_context import *
 from Core import *
@@ -279,7 +279,7 @@ import os
 import sys
 
 sys.path.append(os.environ['PERF_EXEC_PATH'] + \
-       '/scripts/python/perf-script-Util/lib/Perf/Trace')
+       '/scripts/python/Perf-Trace-Util/lib/Perf/Trace')
 
 from perf_trace_context import *
 from Core import *
@@ -391,7 +391,7 @@ drwxr-xr-x 4 trz trz 4096 2010-01-26 22:30 .
 drwxr-xr-x 4 trz trz 4096 2010-01-26 22:29 ..
 drwxr-xr-x 2 trz trz 4096 2010-01-26 22:29 bin
 -rw-r--r-- 1 trz trz 2548 2010-01-26 22:29 check-perf-script.py
-drwxr-xr-x 3 trz trz 4096 2010-01-26 22:49 perf-script-Util
+drwxr-xr-x 3 trz trz 4096 2010-01-26 22:49 Perf-Trace-Util
 -rw-r--r-- 1 trz trz 1462 2010-01-26 22:30 syscall-counts.py
 ----
 
@@ -518,7 +518,7 @@ descriptions below):
  import sys
 
  sys.path.append(os.environ['PERF_EXEC_PATH'] + \
-             '/scripts/python/perf-script-Util/lib/Perf/Trace')
+             '/scripts/python/Perf-Trace-Util/lib/Perf/Trace')
 
  from perf_trace_context import *
  from Core import *
index 35655c3a7b7a438dd806b9d5a5fb8baa7102876a..722ddee61f9f22730c656e61a59807fedefeccdd 100644 (file)
@@ -37,7 +37,14 @@ include config/utilities.mak
 #
 # Define NO_NEWT if you do not want TUI support.
 #
+# Define NO_GTK2 if you do not want GTK+ GUI support.
+#
 # Define NO_DEMANGLE if you do not want C++ symbol demangling.
+#
+# Define NO_LIBELF if you do not want libelf dependency (e.g. cross-builds)
+#
+# Define NO_LIBUNWIND if you do not want libunwind dependency for dwarf
+# backtrace post unwind.
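+#
+# For example (illustrative invocation), a build without ELF/DWARF or GUI
+# support:
+#
+#   make NO_LIBELF=1 NO_NEWT=1 NO_GTK2=1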
 
 $(OUTPUT)PERF-VERSION-FILE: .FORCE-PERF-VERSION-FILE
        @$(SHELL_PATH) util/PERF-VERSION-GEN $(OUTPUT)
@@ -50,13 +57,16 @@ ARCH ?= $(shell echo $(uname_M) | sed -e s/i.86/i386/ -e s/sun4u/sparc64/ \
                                  -e s/s390x/s390/ -e s/parisc64/parisc/ \
                                  -e s/ppc.*/powerpc/ -e s/mips.*/mips/ \
                                  -e s/sh[234].*/sh/ )
+NO_PERF_REGS := 1
 
 CC = $(CROSS_COMPILE)gcc
 AR = $(CROSS_COMPILE)ar
 
 # Additional ARCH settings for x86
 ifeq ($(ARCH),i386)
-        ARCH := x86
+       ARCH := x86
+       NO_PERF_REGS := 0
+       LIBUNWIND_LIBS = -lunwind -lunwind-x86
 endif
 ifeq ($(ARCH),x86_64)
        ARCH := x86
@@ -69,6 +79,8 @@ ifeq ($(ARCH),x86_64)
                ARCH_CFLAGS := -DARCH_X86_64
                ARCH_INCLUDE = ../../arch/x86/lib/memcpy_64.S ../../arch/x86/lib/memset_64.S
        endif
+       NO_PERF_REGS := 0
+       LIBUNWIND_LIBS = -lunwind -lunwind-x86_64
 endif
 
 # Treat warnings as errors unless directed not to
@@ -89,7 +101,7 @@ ifdef PARSER_DEBUG
        PARSER_DEBUG_CFLAGS := -DPARSER_DEBUG
 endif
 
-CFLAGS = -fno-omit-frame-pointer -ggdb3 -Wall -Wextra -std=gnu99 $(CFLAGS_WERROR) $(CFLAGS_OPTIMIZE) $(EXTRA_WARNINGS) $(EXTRA_CFLAGS) $(PARSER_DEBUG_CFLAGS)
+CFLAGS = -fno-omit-frame-pointer -ggdb3 -funwind-tables -Wall -Wextra -std=gnu99 $(CFLAGS_WERROR) $(CFLAGS_OPTIMIZE) $(EXTRA_WARNINGS) $(EXTRA_CFLAGS) $(PARSER_DEBUG_CFLAGS)
 EXTLIBS = -lpthread -lrt -lelf -lm
 ALL_CFLAGS = $(CFLAGS) -D_LARGEFILE64_SOURCE -D_FILE_OFFSET_BITS=64 -D_GNU_SOURCE
 ALL_LDFLAGS = $(LDFLAGS)
@@ -186,10 +198,10 @@ SCRIPTS = $(patsubst %.sh,%,$(SCRIPT_SH))
 
 TRACE_EVENT_DIR = ../lib/traceevent/
 
-ifeq ("$(origin O)", "command line")
-       TE_PATH=$(OUTPUT)/
+ifneq ($(OUTPUT),)
+       TE_PATH=$(OUTPUT)
 else
-       TE_PATH=$(TRACE_EVENT_DIR)/
+       TE_PATH=$(TRACE_EVENT_DIR)
 endif
 
 LIBTRACEEVENT = $(TE_PATH)libtraceevent.a
@@ -321,6 +333,9 @@ LIB_H += $(TRACE_EVENT_DIR)event-parse.h
 LIB_H += util/target.h
 LIB_H += util/rblist.h
 LIB_H += util/intlist.h
+LIB_H += util/perf_regs.h
+LIB_H += util/unwind.h
+LIB_H += ui/helpline.h
 
 LIB_OBJS += $(OUTPUT)util/abspath.o
 LIB_OBJS += $(OUTPUT)util/alias.o
@@ -356,6 +371,7 @@ LIB_OBJS += $(OUTPUT)util/usage.o
 LIB_OBJS += $(OUTPUT)util/wrapper.o
 LIB_OBJS += $(OUTPUT)util/sigchain.o
 LIB_OBJS += $(OUTPUT)util/symbol.o
+LIB_OBJS += $(OUTPUT)util/symbol-elf.o
 LIB_OBJS += $(OUTPUT)util/dso-test-data.o
 LIB_OBJS += $(OUTPUT)util/color.o
 LIB_OBJS += $(OUTPUT)util/pager.o
@@ -387,11 +403,11 @@ LIB_OBJS += $(OUTPUT)util/cgroup.o
 LIB_OBJS += $(OUTPUT)util/target.o
 LIB_OBJS += $(OUTPUT)util/rblist.o
 LIB_OBJS += $(OUTPUT)util/intlist.o
+LIB_OBJS += $(OUTPUT)ui/helpline.o
+LIB_OBJS += $(OUTPUT)ui/stdio/hist.o
 
 BUILTIN_OBJS += $(OUTPUT)builtin-annotate.o
-
 BUILTIN_OBJS += $(OUTPUT)builtin-bench.o
-
 # Benchmark modules
 BUILTIN_OBJS += $(OUTPUT)bench/sched-messaging.o
 BUILTIN_OBJS += $(OUTPUT)bench/sched-pipe.o
@@ -449,34 +465,73 @@ PYRF_OBJS += $(OUTPUT)util/xyarray.o
 -include config.mak.autogen
 -include config.mak
 
-ifndef NO_DWARF
-FLAGS_DWARF=$(ALL_CFLAGS) -ldw -lelf $(ALL_LDFLAGS) $(EXTLIBS)
-ifneq ($(call try-cc,$(SOURCE_DWARF),$(FLAGS_DWARF)),y)
-       msg := $(warning No libdw.h found or old libdw.h found or elfutils is older than 0.138, disables dwarf support. Please install new elfutils-devel/libdw-dev);
+ifdef NO_LIBELF
        NO_DWARF := 1
-endif # Dwarf support
-endif # NO_DWARF
-
--include arch/$(ARCH)/Makefile
-
-ifneq ($(OUTPUT),)
-       BASIC_CFLAGS += -I$(OUTPUT)
-endif
-
+       NO_DEMANGLE := 1
+       NO_LIBUNWIND := 1
+else
 FLAGS_LIBELF=$(ALL_CFLAGS) $(ALL_LDFLAGS) $(EXTLIBS)
 ifneq ($(call try-cc,$(SOURCE_LIBELF),$(FLAGS_LIBELF)),y)
        FLAGS_GLIBC=$(ALL_CFLAGS) $(ALL_LDFLAGS)
        ifneq ($(call try-cc,$(SOURCE_GLIBC),$(FLAGS_GLIBC)),y)
                msg := $(error No gnu/libc-version.h found, please install glibc-dev[el]/glibc-static);
        else
-               msg := $(error No libelf.h/libelf found, please install libelf-dev/elfutils-libelf-devel);
+               NO_LIBELF := 1
+               NO_DWARF := 1
+               NO_DEMANGLE := 1
        endif
 endif
+endif # NO_LIBELF
+
+ifndef NO_LIBUNWIND
+# for linking with debug library, run like:
+# make DEBUG=1 LIBUNWIND_DIR=/opt/libunwind/
+ifdef LIBUNWIND_DIR
+       LIBUNWIND_CFLAGS  := -I$(LIBUNWIND_DIR)/include
+       LIBUNWIND_LDFLAGS := -L$(LIBUNWIND_DIR)/lib
+endif
+
+FLAGS_UNWIND=$(LIBUNWIND_CFLAGS) $(ALL_CFLAGS) $(LIBUNWIND_LDFLAGS) $(ALL_LDFLAGS) $(EXTLIBS) $(LIBUNWIND_LIBS)
+ifneq ($(call try-cc,$(SOURCE_LIBUNWIND),$(FLAGS_UNWIND)),y)
+       msg := $(warning No libunwind found, disabling post unwind support. Please install libunwind-dev[el] >= 0.99);
+       NO_LIBUNWIND := 1
+endif # Libunwind support
+endif # NO_LIBUNWIND
+
+-include arch/$(ARCH)/Makefile
+
+ifneq ($(OUTPUT),)
+       BASIC_CFLAGS += -I$(OUTPUT)
+endif
+
+ifdef NO_LIBELF
+BASIC_CFLAGS += -DNO_LIBELF_SUPPORT
+
+EXTLIBS := $(filter-out -lelf,$(EXTLIBS))
+
+# Remove ELF/DWARF dependent code
+LIB_OBJS := $(filter-out $(OUTPUT)util/symbol-elf.o,$(LIB_OBJS))
+LIB_OBJS := $(filter-out $(OUTPUT)util/dwarf-aux.o,$(LIB_OBJS))
+LIB_OBJS := $(filter-out $(OUTPUT)util/probe-event.o,$(LIB_OBJS))
+LIB_OBJS := $(filter-out $(OUTPUT)util/probe-finder.o,$(LIB_OBJS))
+
+BUILTIN_OBJS := $(filter-out $(OUTPUT)builtin-probe.o,$(BUILTIN_OBJS))
+
+# Use minimal symbol handling
+LIB_OBJS += $(OUTPUT)util/symbol-minimal.o
+
+else # NO_LIBELF
 
 ifneq ($(call try-cc,$(SOURCE_ELF_MMAP),$(FLAGS_COMMON)),y)
        BASIC_CFLAGS += -DLIBELF_NO_MMAP
 endif
 
+FLAGS_DWARF=$(ALL_CFLAGS) -ldw -lelf $(ALL_LDFLAGS) $(EXTLIBS)
+ifneq ($(call try-cc,$(SOURCE_DWARF),$(FLAGS_DWARF)),y)
+       msg := $(warning No libdw.h found or old libdw.h found or elfutils is older than 0.138, disabling dwarf support. Please install new elfutils-devel/libdw-dev);
+       NO_DWARF := 1
+endif # Dwarf support
+
 ifndef NO_DWARF
 ifeq ($(origin PERF_HAVE_DWARF_REGS), undefined)
        msg := $(warning DWARF register mappings have not been defined for architecture $(ARCH), DWARF support disabled);
@@ -487,6 +542,16 @@ else
        LIB_OBJS += $(OUTPUT)util/dwarf-aux.o
 endif # PERF_HAVE_DWARF_REGS
 endif # NO_DWARF
+endif # NO_LIBELF
+
+ifdef NO_LIBUNWIND
+       BASIC_CFLAGS += -DNO_LIBUNWIND_SUPPORT
+else
+       EXTLIBS += $(LIBUNWIND_LIBS)
+       BASIC_CFLAGS := $(LIBUNWIND_CFLAGS) $(BASIC_CFLAGS)
+       BASIC_LDFLAGS := $(LIBUNWIND_LDFLAGS) $(BASIC_LDFLAGS)
+       LIB_OBJS += $(OUTPUT)util/unwind.o
+endif
 
 ifdef NO_NEWT
        BASIC_CFLAGS += -DNO_NEWT_SUPPORT
@@ -504,14 +569,13 @@ else
                LIB_OBJS += $(OUTPUT)ui/browsers/annotate.o
                LIB_OBJS += $(OUTPUT)ui/browsers/hists.o
                LIB_OBJS += $(OUTPUT)ui/browsers/map.o
-               LIB_OBJS += $(OUTPUT)ui/helpline.o
                LIB_OBJS += $(OUTPUT)ui/progress.o
                LIB_OBJS += $(OUTPUT)ui/util.o
                LIB_OBJS += $(OUTPUT)ui/tui/setup.o
                LIB_OBJS += $(OUTPUT)ui/tui/util.o
+               LIB_OBJS += $(OUTPUT)ui/tui/helpline.o
                LIB_H += ui/browser.h
                LIB_H += ui/browsers/map.h
-               LIB_H += ui/helpline.h
                LIB_H += ui/keysyms.h
                LIB_H += ui/libslang.h
                LIB_H += ui/progress.h
@@ -523,7 +587,7 @@ endif
 ifdef NO_GTK2
        BASIC_CFLAGS += -DNO_GTK2_SUPPORT
 else
-       FLAGS_GTK2=$(ALL_CFLAGS) $(ALL_LDFLAGS) $(EXTLIBS) $(shell pkg-config --libs --cflags gtk+-2.0)
+       FLAGS_GTK2=$(ALL_CFLAGS) $(ALL_LDFLAGS) $(EXTLIBS) $(shell pkg-config --libs --cflags gtk+-2.0 2>/dev/null)
        ifneq ($(call try-cc,$(SOURCE_GTK2),$(FLAGS_GTK2)),y)
                msg := $(warning GTK2 not found, disables GTK2 support. Please install gtk2-devel or libgtk2.0-dev);
                BASIC_CFLAGS += -DNO_GTK2_SUPPORT
@@ -531,11 +595,12 @@ else
                ifeq ($(call try-cc,$(SOURCE_GTK2_INFOBAR),$(FLAGS_GTK2)),y)
                        BASIC_CFLAGS += -DHAVE_GTK_INFO_BAR
                endif
-               BASIC_CFLAGS += $(shell pkg-config --cflags gtk+-2.0)
-               EXTLIBS += $(shell pkg-config --libs gtk+-2.0)
+               BASIC_CFLAGS += $(shell pkg-config --cflags gtk+-2.0 2>/dev/null)
+               EXTLIBS += $(shell pkg-config --libs gtk+-2.0 2>/dev/null)
                LIB_OBJS += $(OUTPUT)ui/gtk/browser.o
                LIB_OBJS += $(OUTPUT)ui/gtk/setup.o
                LIB_OBJS += $(OUTPUT)ui/gtk/util.o
+               LIB_OBJS += $(OUTPUT)ui/gtk/helpline.o
                # Make sure that it'd be included only once.
                ifneq ($(findstring -DNO_NEWT_SUPPORT,$(BASIC_CFLAGS)),)
                        LIB_OBJS += $(OUTPUT)ui/setup.o
@@ -674,6 +739,13 @@ else
        endif
 endif
 
+ifeq ($(NO_PERF_REGS),0)
+       ifeq ($(ARCH),x86)
+               LIB_H += arch/x86/include/perf_regs.h
+       endif
+else
+       BASIC_CFLAGS += -DNO_PERF_REGS
+endif
 
 ifdef NO_STRLCPY
        BASIC_CFLAGS += -DNO_STRLCPY
@@ -700,6 +772,7 @@ perfexecdir_SQ = $(subst ','\'',$(perfexecdir))
 template_dir_SQ = $(subst ','\'',$(template_dir))
 htmldir_SQ = $(subst ','\'',$(htmldir))
 prefix_SQ = $(subst ','\'',$(prefix))
+sysconfdir_SQ = $(subst ','\'',$(sysconfdir))
 
 SHELL_PATH_SQ = $(subst ','\'',$(SHELL_PATH))
 
@@ -767,10 +840,10 @@ $(OUTPUT)perf.o perf.spec \
 # over the general rule for .o
 
 $(OUTPUT)util/%-flex.o: $(OUTPUT)util/%-flex.c $(OUTPUT)PERF-CFLAGS
-       $(QUIET_CC)$(CC) -o $@ -c $(ALL_CFLAGS) -Iutil/ -w $<
+       $(QUIET_CC)$(CC) -o $@ -c -Iutil/ $(ALL_CFLAGS) -w $<
 
 $(OUTPUT)util/%-bison.o: $(OUTPUT)util/%-bison.c $(OUTPUT)PERF-CFLAGS
-       $(QUIET_CC)$(CC) -o $@ -c $(ALL_CFLAGS) -DYYENABLE_NLS=0 -DYYLTYPE_IS_TRIVIAL=0 -Iutil/ -w $<
+       $(QUIET_CC)$(CC) -o $@ -c -Iutil/ $(ALL_CFLAGS) -DYYENABLE_NLS=0 -DYYLTYPE_IS_TRIVIAL=0 -w $<
 
 $(OUTPUT)%.o: %.c $(OUTPUT)PERF-CFLAGS
        $(QUIET_CC)$(CC) -o $@ -c $(ALL_CFLAGS) $<
@@ -842,7 +915,7 @@ $(LIB_FILE): $(LIB_OBJS)
 
 # libtraceevent.a
 $(LIBTRACEEVENT):
-       $(QUIET_SUBDIR0)$(TRACE_EVENT_DIR) $(QUIET_SUBDIR1) $(COMMAND_O) libtraceevent.a
+       $(QUIET_SUBDIR0)$(TRACE_EVENT_DIR) $(QUIET_SUBDIR1) O=$(OUTPUT) libtraceevent.a
 
 help:
        @echo 'Perf make targets:'
@@ -951,6 +1024,8 @@ install: all
        $(INSTALL) scripts/python/Perf-Trace-Util/lib/Perf/Trace/* -t '$(DESTDIR_SQ)$(perfexec_instdir_SQ)/scripts/python/Perf-Trace-Util/lib/Perf/Trace'
        $(INSTALL) scripts/python/*.py -t '$(DESTDIR_SQ)$(perfexec_instdir_SQ)/scripts/python'
        $(INSTALL) scripts/python/bin/* -t '$(DESTDIR_SQ)$(perfexec_instdir_SQ)/scripts/python/bin'
+       $(INSTALL) -d -m 755 '$(DESTDIR_SQ)$(sysconfdir_SQ)/bash_completion.d'
+       $(INSTALL) bash_completion '$(DESTDIR_SQ)$(sysconfdir_SQ)/bash_completion.d/perf'
 
 install-python_ext:
        $(PYTHON_WORD) util/setup.py --quiet install --root='/$(DESTDIR_SQ)'
index 744e629797be9cdbe39b7285d71893b8030795aa..815841c04eb2f9b6db3ebeb3692711559d74e655 100644 (file)
@@ -2,4 +2,7 @@ ifndef NO_DWARF
 PERF_HAVE_DWARF_REGS := 1
 LIB_OBJS += $(OUTPUT)arch/$(ARCH)/util/dwarf-regs.o
 endif
+ifndef NO_LIBUNWIND
+LIB_OBJS += $(OUTPUT)arch/$(ARCH)/util/unwind.o
+endif
 LIB_OBJS += $(OUTPUT)arch/$(ARCH)/util/header.o
diff --git a/tools/perf/arch/x86/include/perf_regs.h b/tools/perf/arch/x86/include/perf_regs.h
new file mode 100644 (file)
index 0000000..46fc9f1
--- /dev/null
@@ -0,0 +1,80 @@
+#ifndef ARCH_PERF_REGS_H
+#define ARCH_PERF_REGS_H
+
+#include <stdlib.h>
+#include "../../util/types.h"
+#include "../../../../../arch/x86/include/asm/perf_regs.h"
+
+#ifndef ARCH_X86_64
+#define PERF_REGS_MASK ((1ULL << PERF_REG_X86_32_MAX) - 1)
+#else
+#define REG_NOSUPPORT ((1ULL << PERF_REG_X86_DS) | \
+                      (1ULL << PERF_REG_X86_ES) | \
+                      (1ULL << PERF_REG_X86_FS) | \
+                      (1ULL << PERF_REG_X86_GS))
+#define PERF_REGS_MASK (((1ULL << PERF_REG_X86_64_MAX) - 1) & ~REG_NOSUPPORT)
+#endif
+#define PERF_REG_IP PERF_REG_X86_IP
+#define PERF_REG_SP PERF_REG_X86_SP
+
+static inline const char *perf_reg_name(int id)
+{
+       switch (id) {
+       case PERF_REG_X86_AX:
+               return "AX";
+       case PERF_REG_X86_BX:
+               return "BX";
+       case PERF_REG_X86_CX:
+               return "CX";
+       case PERF_REG_X86_DX:
+               return "DX";
+       case PERF_REG_X86_SI:
+               return "SI";
+       case PERF_REG_X86_DI:
+               return "DI";
+       case PERF_REG_X86_BP:
+               return "BP";
+       case PERF_REG_X86_SP:
+               return "SP";
+       case PERF_REG_X86_IP:
+               return "IP";
+       case PERF_REG_X86_FLAGS:
+               return "FLAGS";
+       case PERF_REG_X86_CS:
+               return "CS";
+       case PERF_REG_X86_SS:
+               return "SS";
+       case PERF_REG_X86_DS:
+               return "DS";
+       case PERF_REG_X86_ES:
+               return "ES";
+       case PERF_REG_X86_FS:
+               return "FS";
+       case PERF_REG_X86_GS:
+               return "GS";
+#ifdef ARCH_X86_64
+       case PERF_REG_X86_R8:
+               return "R8";
+       case PERF_REG_X86_R9:
+               return "R9";
+       case PERF_REG_X86_R10:
+               return "R10";
+       case PERF_REG_X86_R11:
+               return "R11";
+       case PERF_REG_X86_R12:
+               return "R12";
+       case PERF_REG_X86_R13:
+               return "R13";
+       case PERF_REG_X86_R14:
+               return "R14";
+       case PERF_REG_X86_R15:
+               return "R15";
+#endif /* ARCH_X86_64 */
+       default:
+               return NULL;
+       }
+
+       return NULL;
+}
+
+#endif /* ARCH_PERF_REGS_H */
diff --git a/tools/perf/arch/x86/util/unwind.c b/tools/perf/arch/x86/util/unwind.c
new file mode 100644 (file)
index 0000000..78d956e
--- /dev/null
@@ -0,0 +1,111 @@
+
+#include <errno.h>
+#include <libunwind.h>
+#include "perf_regs.h"
+#include "../../util/unwind.h"
+
+#ifdef ARCH_X86_64
+int unwind__arch_reg_id(int regnum)
+{
+       int id;
+
+       switch (regnum) {
+       case UNW_X86_64_RAX:
+               id = PERF_REG_X86_AX;
+               break;
+       case UNW_X86_64_RDX:
+               id = PERF_REG_X86_DX;
+               break;
+       case UNW_X86_64_RCX:
+               id = PERF_REG_X86_CX;
+               break;
+       case UNW_X86_64_RBX:
+               id = PERF_REG_X86_BX;
+               break;
+       case UNW_X86_64_RSI:
+               id = PERF_REG_X86_SI;
+               break;
+       case UNW_X86_64_RDI:
+               id = PERF_REG_X86_DI;
+               break;
+       case UNW_X86_64_RBP:
+               id = PERF_REG_X86_BP;
+               break;
+       case UNW_X86_64_RSP:
+               id = PERF_REG_X86_SP;
+               break;
+       case UNW_X86_64_R8:
+               id = PERF_REG_X86_R8;
+               break;
+       case UNW_X86_64_R9:
+               id = PERF_REG_X86_R9;
+               break;
+       case UNW_X86_64_R10:
+               id = PERF_REG_X86_R10;
+               break;
+       case UNW_X86_64_R11:
+               id = PERF_REG_X86_R11;
+               break;
+       case UNW_X86_64_R12:
+               id = PERF_REG_X86_R12;
+               break;
+       case UNW_X86_64_R13:
+               id = PERF_REG_X86_R13;
+               break;
+       case UNW_X86_64_R14:
+               id = PERF_REG_X86_R14;
+               break;
+       case UNW_X86_64_R15:
+               id = PERF_REG_X86_R15;
+               break;
+       case UNW_X86_64_RIP:
+               id = PERF_REG_X86_IP;
+               break;
+       default:
+               pr_err("unwind: invalid reg id %d\n", regnum);
+               return -EINVAL;
+       }
+
+       return id;
+}
+#else
+int unwind__arch_reg_id(int regnum)
+{
+       int id;
+
+       switch (regnum) {
+       case UNW_X86_EAX:
+               id = PERF_REG_X86_AX;
+               break;
+       case UNW_X86_EDX:
+               id = PERF_REG_X86_DX;
+               break;
+       case UNW_X86_ECX:
+               id = PERF_REG_X86_CX;
+               break;
+       case UNW_X86_EBX:
+               id = PERF_REG_X86_BX;
+               break;
+       case UNW_X86_ESI:
+               id = PERF_REG_X86_SI;
+               break;
+       case UNW_X86_EDI:
+               id = PERF_REG_X86_DI;
+               break;
+       case UNW_X86_EBP:
+               id = PERF_REG_X86_BP;
+               break;
+       case UNW_X86_ESP:
+               id = PERF_REG_X86_SP;
+               break;
+       case UNW_X86_EIP:
+               id = PERF_REG_X86_IP;
+               break;
+       default:
+               pr_err("unwind: invalid reg id %d\n", regnum);
+               return -EINVAL;
+       }
+
+       return id;
+}
+#endif /* ARCH_X86_64 */
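
The table above only translates register numbers; fetching a value is the caller's job. A minimal sketch of that access path (illustrative, the helper and its parameters are not part of the patch): perf saves the requested registers as a flat block packed in ascending bit order of the sample mask, so the translated id is converted to an index by counting the set bits below it.

static int access_sampled_reg(int unw_regnum, const u64 *regs,
                              u64 sample_mask, u64 *valp)
{
        int i, idx = 0;
        int id = unwind__arch_reg_id(unw_regnum);

        if (id < 0)
                return id;                      /* -EINVAL from the tables above */
        if (!(sample_mask & (1ULL << id)))
                return -EINVAL;                 /* register was not sampled */

        for (i = 0; i < id; i++)                /* count set bits below id */
                if (sample_mask & (1ULL << i))
                        idx++;

        *valp = regs[idx];
        return 0;
}
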
diff --git a/tools/perf/bash_completion b/tools/perf/bash_completion
new file mode 100644
index 0000000..1958fa5
--- /dev/null
+++ b/tools/perf/bash_completion
@@ -0,0 +1,26 @@
+# perf completion
+
+have perf &&
+_perf()
+{
+       local cur prev cmd cmds
+
+       COMPREPLY=()
+       _get_comp_words_by_ref cur prev
+
+       cmd=${COMP_WORDS[0]}
+
+       # List perf subcommands
+       if [ $COMP_CWORD -eq 1 ]; then
+               cmds=$($cmd --list-cmds)
+               COMPREPLY=( $( compgen -W '$cmds' -- "$cur" ) )
+       # List possible events for -e option
+       elif [[ $prev == "-e" && "${COMP_WORDS[1]}" == @(record|stat|top) ]]; then
+               cmds=$($cmd list --raw-dump)
+               COMPREPLY=( $( compgen -W '$cmds' -- "$cur" ) )
+       # Fall down to list regular files
+       else
+               _filedir
+       fi
+} &&
+complete -F _perf perf
diff --git a/tools/perf/builtin-buildid-list.c b/tools/perf/builtin-buildid-list.c
index 6b2bcfbde150870ce25de5d037de00246caa2730..7d6842826a0cfb230f2d9aaef5e7709e215721cf 100644
@@ -16,8 +16,6 @@
 #include "util/session.h"
 #include "util/symbol.h"
 
-#include <libelf.h>
-
 static const char *input_name;
 static bool force;
 static bool show_kernel;
@@ -71,7 +69,7 @@ static int perf_session__list_build_ids(void)
 {
        struct perf_session *session;
 
-       elf_version(EV_CURRENT);
+       symbol__elf_init();
 
        session = perf_session__new(input_name, O_RDONLY, force, false,
                                    &build_id__mark_dso_hit_ops);
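
symbol__elf_init() hides the one libelf call this builtin used to make directly, so the file no longer needs <libelf.h> and keeps building when ELF support is compiled out. A sketch of what such a wrapper boils down to (the name below is made up; the real helper lives in perf's symbol code):

#include <libelf.h>

static void elf_init_sketch(void)
{
        elf_version(EV_CURRENT);        /* mandatory libelf version handshake */
}
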
diff --git a/tools/perf/builtin-inject.c b/tools/perf/builtin-inject.c
index 3beab489afc5c69446c5ce37fefe5fa40cfde9b2..64d8ba2fb7bc34c258ace98a4e01011dbdcabc18 100644
@@ -191,10 +191,13 @@ static int perf_event__inject_buildid(struct perf_tool *tool,
                                 * If this fails, too bad, let the other side
                                 * account this as unresolved.
                                 */
-                       } else
+                       } else {
+#ifndef NO_LIBELF_SUPPORT
                                pr_warning("no symbols found in %s, maybe "
                                           "install a debug package?\n",
                                           al.map->dso->long_name);
+#endif
+                       }
                }
        }
 
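The same build-time guard in isolation (illustrative only): without libelf there can be no symbol tables, so the "install a debug package" nag is compiled out rather than tested at run time.

#include <stdio.h>

static void warn_no_symbols(const char *dso_name)
{
#ifndef NO_LIBELF_SUPPORT
        fprintf(stderr,
                "no symbols found in %s, maybe install a debug package?\n",
                dso_name);
#else
        (void)dso_name;         /* nothing sensible to say without libelf */
#endif
}
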
diff --git a/tools/perf/builtin-kmem.c b/tools/perf/builtin-kmem.c
index ce35015f2dc6423901f7d42b223754b4b67117e3..fc6607b383f276329067c5d267083027ebd358af 100644
@@ -1,6 +1,7 @@
 #include "builtin.h"
 #include "perf.h"
 
+#include "util/evsel.h"
 #include "util/util.h"
 #include "util/cache.h"
 #include "util/symbol.h"
@@ -57,11 +58,6 @@ static unsigned long nr_allocs, nr_cross_allocs;
 
 #define PATH_SYS_NODE  "/sys/devices/system/node"
 
-struct perf_kmem {
-       struct perf_tool    tool;
-       struct perf_session *session;
-};
-
 static void init_cpunode_map(void)
 {
        FILE *fp;
@@ -196,16 +192,15 @@ static void insert_caller_stat(unsigned long call_site,
        }
 }
 
-static void process_alloc_event(void *data,
-                               struct event_format *event,
-                               int cpu,
-                               u64 timestamp __used,
-                               struct thread *thread __used,
-                               int node)
+static void perf_evsel__process_alloc_event(struct perf_evsel *evsel,
+                                           struct perf_sample *sample,
+                                           int node)
 {
+       struct event_format *event = evsel->tp_format;
+       void *data = sample->raw_data;
        unsigned long call_site;
        unsigned long ptr;
-       int bytes_req;
+       int bytes_req, cpu = sample->cpu;
        int bytes_alloc;
        int node1, node2;
 
@@ -257,22 +252,18 @@ static struct alloc_stat *search_alloc_stat(unsigned long ptr,
        return NULL;
 }
 
-static void process_free_event(void *data,
-                              struct event_format *event,
-                              int cpu,
-                              u64 timestamp __used,
-                              struct thread *thread __used)
+static void perf_evsel__process_free_event(struct perf_evsel *evsel,
+                                          struct perf_sample *sample)
 {
-       unsigned long ptr;
+       unsigned long ptr = raw_field_value(evsel->tp_format, "ptr",
+                                           sample->raw_data);
        struct alloc_stat *s_alloc, *s_caller;
 
-       ptr = raw_field_value(event, "ptr", data);
-
        s_alloc = search_alloc_stat(ptr, 0, &root_alloc_stat, ptr_cmp);
        if (!s_alloc)
                return;
 
-       if (cpu != s_alloc->alloc_cpu) {
+       if ((short)sample->cpu != s_alloc->alloc_cpu) {
                s_alloc->pingpong++;
 
                s_caller = search_alloc_stat(0, s_alloc->call_site,
@@ -283,40 +274,34 @@ static void process_free_event(void *data,
        s_alloc->alloc_cpu = -1;
 }
 
-static void process_raw_event(struct perf_tool *tool,
-                             union perf_event *raw_event __used, void *data,
-                             int cpu, u64 timestamp, struct thread *thread)
+static void perf_evsel__process_kmem_event(struct perf_evsel *evsel,
+                                          struct perf_sample *sample)
 {
-       struct perf_kmem *kmem = container_of(tool, struct perf_kmem, tool);
-       struct event_format *event;
-       int type;
-
-       type = trace_parse_common_type(kmem->session->pevent, data);
-       event = pevent_find_event(kmem->session->pevent, type);
+       struct event_format *event = evsel->tp_format;
 
        if (!strcmp(event->name, "kmalloc") ||
            !strcmp(event->name, "kmem_cache_alloc")) {
-               process_alloc_event(data, event, cpu, timestamp, thread, 0);
+               perf_evsel__process_alloc_event(evsel, sample, 0);
                return;
        }
 
        if (!strcmp(event->name, "kmalloc_node") ||
            !strcmp(event->name, "kmem_cache_alloc_node")) {
-               process_alloc_event(data, event, cpu, timestamp, thread, 1);
+               perf_evsel__process_alloc_event(evsel, sample, 1);
                return;
        }
 
        if (!strcmp(event->name, "kfree") ||
            !strcmp(event->name, "kmem_cache_free")) {
-               process_free_event(data, event, cpu, timestamp, thread);
+               perf_evsel__process_free_event(evsel, sample);
                return;
        }
 }
 
-static int process_sample_event(struct perf_tool *tool,
+static int process_sample_event(struct perf_tool *tool __used,
                                union perf_event *event,
                                struct perf_sample *sample,
-                               struct perf_evsel *evsel __used,
+                               struct perf_evsel *evsel,
                                struct machine *machine)
 {
        struct thread *thread = machine__findnew_thread(machine, event->ip.pid);
@@ -329,18 +314,14 @@ static int process_sample_event(struct perf_tool *tool,
 
        dump_printf(" ... thread: %s:%d\n", thread->comm, thread->pid);
 
-       process_raw_event(tool, event, sample->raw_data, sample->cpu,
-                         sample->time, thread);
-
+       perf_evsel__process_kmem_event(evsel, sample);
        return 0;
 }
 
-static struct perf_kmem perf_kmem = {
-       .tool = {
-               .sample                 = process_sample_event,
-               .comm                   = perf_event__process_comm,
-               .ordered_samples        = true,
-       },
+static struct perf_tool perf_kmem = {
+       .sample          = process_sample_event,
+       .comm            = perf_event__process_comm,
+       .ordered_samples = true,
 };
 
 static double fragmentation(unsigned long n_req, unsigned long n_alloc)
@@ -497,13 +478,10 @@ static int __cmd_kmem(void)
        int err = -EINVAL;
        struct perf_session *session;
 
-       session = perf_session__new(input_name, O_RDONLY, 0, false,
-                                   &perf_kmem.tool);
+       session = perf_session__new(input_name, O_RDONLY, 0, false, &perf_kmem);
        if (session == NULL)
                return -ENOMEM;
 
-       perf_kmem.session = session;
-
        if (perf_session__create_kernel_maps(session) < 0)
                goto out_delete;
 
@@ -511,7 +489,7 @@ static int __cmd_kmem(void)
                goto out_delete;
 
        setup_pager();
-       err = perf_session__process_events(session, &perf_kmem.tool);
+       err = perf_session__process_events(session, &perf_kmem);
        if (err != 0)
                goto out_delete;
        sort_result();
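
The payoff of this conversion: the event format travels with the evsel as evsel->tp_format, so the per-sample trace_parse_common_type()/pevent_find_event() round trip and the session back-pointer both disappear. A sketch of the resulting accessor pattern (the helper is illustrative, not part of the patch):

static unsigned long kmem_sample_ptr(struct perf_evsel *evsel,
                                     struct perf_sample *sample)
{
        /* one call, using the format cached on the evsel */
        return (unsigned long)raw_field_value(evsel->tp_format, "ptr",
                                              sample->raw_data);
}
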
diff --git a/tools/perf/builtin-list.c b/tools/perf/builtin-list.c
index 6313b6eb3ebbff85f1527692d93416adf9845c31..bdcff81b532a0fe0af5dfa138a667a27294847bc 100644
@@ -19,15 +19,15 @@ int cmd_list(int argc, const char **argv, const char *prefix __used)
        setup_pager();
 
        if (argc == 1)
-               print_events(NULL);
+               print_events(NULL, false);
        else {
                int i;
 
                for (i = 1; i < argc; ++i) {
-                       if (i > 1)
+                       if (i > 2)
                                putchar('\n');
                        if (strncmp(argv[i], "tracepoint", 10) == 0)
-                               print_tracepoint_events(NULL, NULL);
+                               print_tracepoint_events(NULL, NULL, false);
                        else if (strcmp(argv[i], "hw") == 0 ||
                                 strcmp(argv[i], "hardware") == 0)
                                print_events_type(PERF_TYPE_HARDWARE);
@@ -36,13 +36,15 @@ int cmd_list(int argc, const char **argv, const char *prefix __used)
                                print_events_type(PERF_TYPE_SOFTWARE);
                        else if (strcmp(argv[i], "cache") == 0 ||
                                 strcmp(argv[i], "hwcache") == 0)
-                               print_hwcache_events(NULL);
+                               print_hwcache_events(NULL, false);
+                       else if (strcmp(argv[i], "--raw-dump") == 0)
+                               print_events(NULL, true);
                        else {
                                char *sep = strchr(argv[i], ':'), *s;
                                int sep_idx;
 
                                if (sep == NULL) {
-                                       print_events(argv[i]);
+                                       print_events(argv[i], false);
                                        continue;
                                }
                                sep_idx = sep - argv[i];
@@ -51,7 +53,7 @@ int cmd_list(int argc, const char **argv, const char *prefix __used)
                                        return -1;
 
                                s[sep_idx] = '\0';
-                               print_tracepoint_events(s, s + sep_idx + 1);
+                               print_tracepoint_events(s, s + sep_idx + 1, false);
                                free(s);
                        }
                }
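
The new boolean is a name-only switch: --raw-dump makes the print_* helpers emit bare event names with no decoration, which is the format the bash completion above feeds to compgen. A sketch of the convention (illustrative helper, not the patch's code):

#include <stdio.h>
#include <stdbool.h>

static void show_event(const char *name, const char *descr, bool name_only)
{
        if (name_only)
                printf("%s ", name);            /* bare list for compgen -W */
        else
                printf("  %-42s [%s]\n", name, descr);
}
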
diff --git a/tools/perf/builtin-lock.c b/tools/perf/builtin-lock.c
index b3c4285488688ba19b6073bd6c743a84e4e0e8b1..585aae2858b862f143ebcc0eb64e76c448c287b7 100644
@@ -1,6 +1,7 @@
 #include "builtin.h"
 #include "perf.h"
 
+#include "util/evsel.h"
 #include "util/util.h"
 #include "util/cache.h"
 #include "util/symbol.h"
@@ -356,28 +357,16 @@ struct trace_release_event {
 
 struct trace_lock_handler {
        void (*acquire_event)(struct trace_acquire_event *,
-                             struct event_format *,
-                             int cpu,
-                             u64 timestamp,
-                             struct thread *thread);
+                             const struct perf_sample *sample);
 
        void (*acquired_event)(struct trace_acquired_event *,
-                              struct event_format *,
-                              int cpu,
-                              u64 timestamp,
-                              struct thread *thread);
+                              const struct perf_sample *sample);
 
        void (*contended_event)(struct trace_contended_event *,
-                               struct event_format *,
-                               int cpu,
-                               u64 timestamp,
-                               struct thread *thread);
+                               const struct perf_sample *sample);
 
        void (*release_event)(struct trace_release_event *,
-                             struct event_format *,
-                             int cpu,
-                             u64 timestamp,
-                             struct thread *thread);
+                             const struct perf_sample *sample);
 };
 
 static struct lock_seq_stat *get_seq(struct thread_stat *ts, void *addr)
@@ -416,10 +405,7 @@ enum acquire_flags {
 
 static void
 report_lock_acquire_event(struct trace_acquire_event *acquire_event,
-                       struct event_format *__event __used,
-                       int cpu __used,
-                       u64 timestamp __used,
-                       struct thread *thread __used)
+                         const struct perf_sample *sample)
 {
        struct lock_stat *ls;
        struct thread_stat *ts;
@@ -429,7 +415,7 @@ report_lock_acquire_event(struct trace_acquire_event *acquire_event,
        if (ls->discard)
                return;
 
-       ts = thread_stat_findnew(thread->pid);
+       ts = thread_stat_findnew(sample->tid);
        seq = get_seq(ts, acquire_event->addr);
 
        switch (seq->state) {
@@ -473,18 +459,16 @@ broken:
        }
 
        ls->nr_acquire++;
-       seq->prev_event_time = timestamp;
+       seq->prev_event_time = sample->time;
 end:
        return;
 }
 
 static void
 report_lock_acquired_event(struct trace_acquired_event *acquired_event,
-                        struct event_format *__event __used,
-                        int cpu __used,
-                        u64 timestamp __used,
-                        struct thread *thread __used)
+                          const struct perf_sample *sample)
 {
+       u64 timestamp = sample->time;
        struct lock_stat *ls;
        struct thread_stat *ts;
        struct lock_seq_stat *seq;
@@ -494,7 +478,7 @@ report_lock_acquired_event(struct trace_acquired_event *acquired_event,
        if (ls->discard)
                return;
 
-       ts = thread_stat_findnew(thread->pid);
+       ts = thread_stat_findnew(sample->tid);
        seq = get_seq(ts, acquired_event->addr);
 
        switch (seq->state) {
@@ -536,10 +520,7 @@ end:
 
 static void
 report_lock_contended_event(struct trace_contended_event *contended_event,
-                         struct event_format *__event __used,
-                         int cpu __used,
-                         u64 timestamp __used,
-                         struct thread *thread __used)
+                           const struct perf_sample *sample)
 {
        struct lock_stat *ls;
        struct thread_stat *ts;
@@ -549,7 +530,7 @@ report_lock_contended_event(struct trace_contended_event *contended_event,
        if (ls->discard)
                return;
 
-       ts = thread_stat_findnew(thread->pid);
+       ts = thread_stat_findnew(sample->tid);
        seq = get_seq(ts, contended_event->addr);
 
        switch (seq->state) {
@@ -576,17 +557,14 @@ report_lock_contended_event(struct trace_contended_event *contended_event,
 
        seq->state = SEQ_STATE_CONTENDED;
        ls->nr_contended++;
-       seq->prev_event_time = timestamp;
+       seq->prev_event_time = sample->time;
 end:
        return;
 }
 
 static void
 report_lock_release_event(struct trace_release_event *release_event,
-                       struct event_format *__event __used,
-                       int cpu __used,
-                       u64 timestamp __used,
-                       struct thread *thread __used)
+                         const struct perf_sample *sample)
 {
        struct lock_stat *ls;
        struct thread_stat *ts;
@@ -596,7 +574,7 @@ report_lock_release_event(struct trace_release_event *release_event,
        if (ls->discard)
                return;
 
-       ts = thread_stat_findnew(thread->pid);
+       ts = thread_stat_findnew(sample->tid);
        seq = get_seq(ts, release_event->addr);
 
        switch (seq->state) {
@@ -645,14 +623,12 @@ static struct trace_lock_handler report_lock_ops  = {
 
 static struct trace_lock_handler *trace_handler;
 
-static void
-process_lock_acquire_event(void *data,
-                          struct event_format *event __used,
-                          int cpu __used,
-                          u64 timestamp __used,
-                          struct thread *thread __used)
+static void perf_evsel__process_lock_acquire(struct perf_evsel *evsel,
+                                            struct perf_sample *sample)
 {
        struct trace_acquire_event acquire_event;
+       struct event_format *event = evsel->tp_format;
+       void *data = sample->raw_data;
        u64 tmp;                /* this is required for casting... */
 
        tmp = raw_field_value(event, "lockdep_addr", data);
@@ -661,17 +637,15 @@ process_lock_acquire_event(void *data,
        acquire_event.flag = (int)raw_field_value(event, "flag", data);
 
        if (trace_handler->acquire_event)
-               trace_handler->acquire_event(&acquire_event, event, cpu, timestamp, thread);
+               trace_handler->acquire_event(&acquire_event, sample);
 }
 
-static void
-process_lock_acquired_event(void *data,
-                           struct event_format *event __used,
-                           int cpu __used,
-                           u64 timestamp __used,
-                           struct thread *thread __used)
+static void perf_evsel__process_lock_acquired(struct perf_evsel *evsel,
+                                             struct perf_sample *sample)
 {
        struct trace_acquired_event acquired_event;
+       struct event_format *event = evsel->tp_format;
+       void *data = sample->raw_data;
        u64 tmp;                /* this is required for casting... */
 
        tmp = raw_field_value(event, "lockdep_addr", data);
@@ -679,17 +653,15 @@ process_lock_acquired_event(void *data,
        acquired_event.name = (char *)raw_field_ptr(event, "name", data);
 
        if (trace_handler->acquire_event)
-               trace_handler->acquired_event(&acquired_event, event, cpu, timestamp, thread);
+               trace_handler->acquired_event(&acquired_event, sample);
 }
 
-static void
-process_lock_contended_event(void *data,
-                            struct event_format *event __used,
-                            int cpu __used,
-                            u64 timestamp __used,
-                            struct thread *thread __used)
+static void perf_evsel__process_lock_contended(struct perf_evsel *evsel,
+                                              struct perf_sample *sample)
 {
        struct trace_contended_event contended_event;
+       struct event_format *event = evsel->tp_format;
+       void *data = sample->raw_data;
        u64 tmp;                /* this is required for casting... */
 
        tmp = raw_field_value(event, "lockdep_addr", data);
@@ -697,17 +669,15 @@ process_lock_contended_event(void *data,
        contended_event.name = (char *)raw_field_ptr(event, "name", data);
 
        if (trace_handler->acquire_event)
-               trace_handler->contended_event(&contended_event, event, cpu, timestamp, thread);
+               trace_handler->contended_event(&contended_event, sample);
 }
 
-static void
-process_lock_release_event(void *data,
-                          struct event_format *event __used,
-                          int cpu __used,
-                          u64 timestamp __used,
-                          struct thread *thread __used)
+static void perf_evsel__process_lock_release(struct perf_evsel *evsel,
+                                            struct perf_sample *sample)
 {
        struct trace_release_event release_event;
+       struct event_format *event = evsel->tp_format;
+       void *data = sample->raw_data;
        u64 tmp;                /* this is required for casting... */
 
        tmp = raw_field_value(event, "lockdep_addr", data);
@@ -715,26 +685,22 @@ process_lock_release_event(void *data,
        release_event.name = (char *)raw_field_ptr(event, "name", data);
 
        if (trace_handler->acquire_event)
-               trace_handler->release_event(&release_event, event, cpu, timestamp, thread);
+               trace_handler->release_event(&release_event, sample);
 }
 
-static void
-process_raw_event(void *data, int cpu, u64 timestamp, struct thread *thread)
+static void perf_evsel__process_lock_event(struct perf_evsel *evsel,
+                                          struct perf_sample *sample)
 {
-       struct event_format *event;
-       int type;
-
-       type = trace_parse_common_type(session->pevent, data);
-       event = pevent_find_event(session->pevent, type);
+       struct event_format *event = evsel->tp_format;
 
        if (!strcmp(event->name, "lock_acquire"))
-               process_lock_acquire_event(data, event, cpu, timestamp, thread);
+               perf_evsel__process_lock_acquire(evsel, sample);
        if (!strcmp(event->name, "lock_acquired"))
-               process_lock_acquired_event(data, event, cpu, timestamp, thread);
+               perf_evsel__process_lock_acquired(evsel, sample);
        if (!strcmp(event->name, "lock_contended"))
-               process_lock_contended_event(data, event, cpu, timestamp, thread);
+               perf_evsel__process_lock_contended(evsel, sample);
        if (!strcmp(event->name, "lock_release"))
-               process_lock_release_event(data, event, cpu, timestamp, thread);
+               perf_evsel__process_lock_release(evsel, sample);
 }
 
 static void print_bad_events(int bad, int total)
@@ -849,7 +815,7 @@ static void dump_info(void)
 static int process_sample_event(struct perf_tool *tool __used,
                                union perf_event *event,
                                struct perf_sample *sample,
-                               struct perf_evsel *evsel __used,
+                               struct perf_evsel *evsel,
                                struct machine *machine)
 {
        struct thread *thread = machine__findnew_thread(machine, sample->tid);
@@ -860,8 +826,7 @@ static int process_sample_event(struct perf_tool *tool __used,
                return -1;
        }
 
-       process_raw_event(sample->raw_data, sample->cpu, sample->time, thread);
-
+       perf_evsel__process_lock_event(evsel, sample);
        return 0;
 }
 
@@ -938,16 +903,19 @@ static const struct option lock_options[] = {
        OPT_END()
 };
 
+static const char * const lock_tracepoints[] = {
+       "lock:lock_acquire",    /* CONFIG_LOCKDEP */
+       "lock:lock_acquired",   /* CONFIG_LOCKDEP, CONFIG_LOCK_STAT */
+       "lock:lock_contended",  /* CONFIG_LOCKDEP, CONFIG_LOCK_STAT */
+       "lock:lock_release",    /* CONFIG_LOCKDEP */
+};
+
 static const char *record_args[] = {
        "record",
        "-R",
        "-f",
        "-m", "1024",
        "-c", "1",
-       "-e", "lock:lock_acquire",
-       "-e", "lock:lock_acquired",
-       "-e", "lock:lock_contended",
-       "-e", "lock:lock_release",
 };
 
 static int __cmd_record(int argc, const char **argv)
@@ -955,15 +923,31 @@ static int __cmd_record(int argc, const char **argv)
        unsigned int rec_argc, i, j;
        const char **rec_argv;
 
+       for (i = 0; i < ARRAY_SIZE(lock_tracepoints); i++) {
+               if (!is_valid_tracepoint(lock_tracepoints[i])) {
+                       pr_err("tracepoint %s is not enabled. "
+                              "Are CONFIG_LOCKDEP and CONFIG_LOCK_STAT enabled?\n",
+                              lock_tracepoints[i]);
+                       return 1;
+               }
+       }
+
        rec_argc = ARRAY_SIZE(record_args) + argc - 1;
-       rec_argv = calloc(rec_argc + 1, sizeof(char *));
+       /* factor of 2 is for -e in front of each tracepoint */
+       rec_argc += 2 * ARRAY_SIZE(lock_tracepoints);
 
+       rec_argv = calloc(rec_argc + 1, sizeof(char *));
        if (rec_argv == NULL)
                return -ENOMEM;
 
        for (i = 0; i < ARRAY_SIZE(record_args); i++)
                rec_argv[i] = strdup(record_args[i]);
 
+       for (j = 0; j < ARRAY_SIZE(lock_tracepoints); j++) {
+               rec_argv[i++] = "-e";
+               rec_argv[i++] = strdup(lock_tracepoints[j]);
+       }
+
        for (j = 1; j < (unsigned int)argc; j++, i++)
                rec_argv[i] = argv[j];
 
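A standalone, runnable sketch of the argv assembly done in __cmd_record() above (array contents abbreviated): each tracepoint contributes an "-e" option pair at run time, which is why rec_argc grows by twice the table size.

#include <stdio.h>
#include <stdlib.h>

#define ARRAY_SIZE(a) (sizeof(a) / sizeof((a)[0]))

int main(void)
{
        static const char *base[] = { "record", "-R", "-m", "1024" };
        static const char *tps[]  = { "lock:lock_acquire", "lock:lock_release" };
        unsigned int argc = ARRAY_SIZE(base) + 2 * ARRAY_SIZE(tps);
        const char **argv = calloc(argc + 1, sizeof(char *));
        unsigned int i = 0, j;

        if (!argv)
                return 1;
        for (j = 0; j < ARRAY_SIZE(base); j++)
                argv[i++] = base[j];
        for (j = 0; j < ARRAY_SIZE(tps); j++) {
                argv[i++] = "-e";       /* one -e in front of each tracepoint */
                argv[i++] = tps[j];
        }
        for (j = 0; j < argc; j++)
                printf("%s ", argv[j]);
        putchar('\n');
        free(argv);
        return 0;
}
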
diff --git a/tools/perf/builtin-record.c b/tools/perf/builtin-record.c
index 4db6e1ba54e30bf780d990a8fb1d7b203e89707f..479ff2a038fccf0dd9e8bcf33dab5e0652fe79df 100644
 #include <sched.h>
 #include <sys/mman.h>
 
+#define CALLCHAIN_HELP "do call-graph (stack chain/backtrace) recording: "
+
+#ifdef NO_LIBUNWIND_SUPPORT
+static char callchain_help[] = CALLCHAIN_HELP "[fp]";
+#else
+static unsigned long default_stack_dump_size = 8192;
+static char callchain_help[] = CALLCHAIN_HELP "[fp] dwarf";
+#endif
+
 enum write_mode_t {
        WRITE_FORCE,
        WRITE_APPEND
@@ -163,12 +172,12 @@ static bool perf_evlist__equal(struct perf_evlist *evlist,
        if (evlist->nr_entries != other->nr_entries)
                return false;
 
-       pair = list_entry(other->entries.next, struct perf_evsel, node);
+       pair = perf_evlist__first(other);
 
        list_for_each_entry(pos, &evlist->entries, node) {
                if (memcmp(&pos->attr, &pair->attr, sizeof(pos->attr)) != 0)
                        return false;
-               pair = list_entry(pair->node.next, struct perf_evsel, node);
+               pair = perf_evsel__next(pair);
        }
 
        return true;
@@ -176,18 +185,18 @@ static bool perf_evlist__equal(struct perf_evlist *evlist,
 
 static void perf_record__open(struct perf_record *rec)
 {
-       struct perf_evsel *pos, *first;
+       struct perf_evsel *pos;
        struct perf_evlist *evlist = rec->evlist;
        struct perf_session *session = rec->session;
        struct perf_record_opts *opts = &rec->opts;
 
-       first = list_entry(evlist->entries.next, struct perf_evsel, node);
-
        perf_evlist__config_attrs(evlist, opts);
 
+       if (opts->group)
+               perf_evlist__set_leader(evlist);
+
        list_for_each_entry(pos, &evlist->entries, node) {
                struct perf_event_attr *attr = &pos->attr;
-               struct xyarray *group_fd = NULL;
                /*
                 * Check if parse_single_tracepoint_event has already asked for
                 * PERF_SAMPLE_TIME.
@@ -202,16 +211,13 @@ static void perf_record__open(struct perf_record *rec)
                 */
                bool time_needed = attr->sample_type & PERF_SAMPLE_TIME;
 
-               if (opts->group && pos != first)
-                       group_fd = first->fd;
 fallback_missing_features:
                if (opts->exclude_guest_missing)
                        attr->exclude_guest = attr->exclude_host = 0;
 retry_sample_id:
                attr->sample_id_all = opts->sample_id_all_missing ? 0 : 1;
 try_again:
-               if (perf_evsel__open(pos, evlist->cpus, evlist->threads,
-                                    opts->group, group_fd) < 0) {
+               if (perf_evsel__open(pos, evlist->cpus, evlist->threads) < 0) {
                        int err = errno;
 
                        if (err == EPERM || err == EACCES) {
@@ -732,6 +738,106 @@ error:
        return ret;
 }
 
+#ifndef NO_LIBUNWIND_SUPPORT
+static int get_stack_size(char *str, unsigned long *_size)
+{
+       char *endptr;
+       unsigned long size;
+       unsigned long max_size = round_down(USHRT_MAX, sizeof(u64));
+
+       size = strtoul(str, &endptr, 0);
+
+       do {
+               if (*endptr)
+                       break;
+
+               size = round_up(size, sizeof(u64));
+               if (!size || size > max_size)
+                       break;
+
+               *_size = size;
+               return 0;
+
+       } while (0);
+
+       pr_err("callchain: Incorrect stack dump size (max %ld): %s\n",
+              max_size, str);
+       return -1;
+}
+#endif /* !NO_LIBUNWIND_SUPPORT */
+
+static int
+parse_callchain_opt(const struct option *opt __used, const char *arg,
+                   int unset)
+{
+       struct perf_record *rec = (struct perf_record *)opt->value;
+       char *tok, *name, *saveptr = NULL;
+       char *buf;
+       int ret = -1;
+
+       /* --no-call-graph */
+       if (unset)
+               return 0;
+
+       /* We specified default option if none is provided. */
+       BUG_ON(!arg);
+
+       /* We need buffer that we know we can write to. */
+       buf = malloc(strlen(arg) + 1);
+       if (!buf)
+               return -ENOMEM;
+
+       strcpy(buf, arg);
+
+       tok = strtok_r((char *)buf, ",", &saveptr);
+       name = tok ? : (char *)buf;
+
+       do {
+               /* Framepointer style */
+               if (!strncmp(name, "fp", sizeof("fp"))) {
+                       if (!strtok_r(NULL, ",", &saveptr)) {
+                               rec->opts.call_graph = CALLCHAIN_FP;
+                               ret = 0;
+                       } else
+                               pr_err("callchain: No more arguments "
+                                      "needed for -g fp\n");
+                       break;
+
+#ifndef NO_LIBUNWIND_SUPPORT
+               /* Dwarf style */
+               } else if (!strncmp(name, "dwarf", sizeof("dwarf"))) {
+                       ret = 0;
+                       rec->opts.call_graph = CALLCHAIN_DWARF;
+                       rec->opts.stack_dump_size = default_stack_dump_size;
+
+                       tok = strtok_r(NULL, ",", &saveptr);
+                       if (tok) {
+                               unsigned long size = 0;
+
+                               ret = get_stack_size(tok, &size);
+                               rec->opts.stack_dump_size = size;
+                       }
+
+                       if (!ret)
+                               pr_debug("callchain: stack dump size %d\n",
+                                        rec->opts.stack_dump_size);
+#endif /* !NO_LIBUNWIND_SUPPORT */
+               } else {
+                       pr_err("callchain: Unknown -g option "
+                              "value: %s\n", arg);
+                       break;
+               }
+
+       } while (0);
+
+       free(buf);
+
+       if (!ret)
+               pr_debug("callchain: type %d\n", rec->opts.call_graph);
+
+       return ret;
+}
+
 static const char * const record_usage[] = {
        "perf record [<options>] [<command>]",
        "perf record [<options>] -- <command> [<options>]",
@@ -803,8 +909,9 @@ const struct option record_options[] = {
                     "number of mmap data pages"),
        OPT_BOOLEAN(0, "group", &record.opts.group,
                    "put the counters into a counter group"),
-       OPT_BOOLEAN('g', "call-graph", &record.opts.call_graph,
-                   "do call-graph (stack chain/backtrace) recording"),
+       OPT_CALLBACK_DEFAULT('g', "call-graph", &record, "mode[,dump_size]",
+                            callchain_help, &parse_callchain_opt,
+                            "fp"),
        OPT_INCR('v', "verbose", &verbose,
                    "be more verbose (show counter open errors, etc)"),
        OPT_BOOLEAN('q', "quiet", &quiet, "don't print any message"),
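
The callback accepts "mode[,dump_size]": -g fp keeps the frame-pointer behaviour, while -g dwarf,8192 selects dwarf unwinding with an 8 kB stack snapshot (8192 is also the built-in default above). A tiny standalone re-implementation of that grammar, for illustration only:

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

int main(void)
{
        char arg[] = "dwarf,8192";      /* as in: perf record -g dwarf,8192 */
        char *saveptr = NULL;
        char *mode = strtok_r(arg, ",", &saveptr);
        char *size = strtok_r(NULL, ",", &saveptr);

        if (mode && !strcmp(mode, "fp"))
                printf("frame-pointer unwinding\n");
        else if (mode && !strcmp(mode, "dwarf"))
                printf("dwarf unwinding, stack dump %lu bytes\n",
                       size ? strtoul(size, NULL, 0) : 8192UL);
        else
                printf("unknown mode\n");
        return 0;
}
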
diff --git a/tools/perf/builtin-report.c b/tools/perf/builtin-report.c
index 7c88a243b5db04308c9faf2e6aaf32d19bf0bb54..d61825371adc396c5f71f67c3db50118dd5f2e21 100644
@@ -69,8 +69,8 @@ static int perf_report__add_branch_hist_entry(struct perf_tool *tool,
 
        if ((sort__has_parent || symbol_conf.use_callchain)
            && sample->callchain) {
-               err = machine__resolve_callchain(machine, al->thread,
-                                                sample->callchain, &parent);
+               err = machine__resolve_callchain(machine, evsel, al->thread,
+                                                sample, &parent);
                if (err)
                        return err;
        }
@@ -140,8 +140,8 @@ static int perf_evsel__add_hist_entry(struct perf_evsel *evsel,
        struct hist_entry *he;
 
        if ((sort__has_parent || symbol_conf.use_callchain) && sample->callchain) {
-               err = machine__resolve_callchain(machine, al->thread,
-                                                sample->callchain, &parent);
+               err = machine__resolve_callchain(machine, evsel, al->thread,
+                                                sample, &parent);
                if (err)
                        return err;
        }
@@ -397,17 +397,17 @@ static int __cmd_report(struct perf_report *rep)
                desc);
        }
 
-       if (dump_trace) {
-               perf_session__fprintf_nr_events(session, stdout);
-               goto out_delete;
-       }
-
        if (verbose > 3)
                perf_session__fprintf(session, stdout);
 
        if (verbose > 2)
                perf_session__fprintf_dsos(session, stdout);
 
+       if (dump_trace) {
+               perf_session__fprintf_nr_events(session, stdout);
+               goto out_delete;
+       }
+
        nr_samples = 0;
        list_for_each_entry(pos, &session->evlist->entries, node) {
                struct hists *hists = &pos->hists;
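
machine__resolve_callchain() grows evsel and sample parameters because dwarf unwinding needs more than the fp chain: it reads the user registers and the stack snapshot attached to the sample. Roughly, in pared-down form (field names are illustrative, not perf's exact layout):

struct ip_callchain;

struct sample_sketch {
        struct ip_callchain *callchain;         /* fp-style chain, may be all there is */
        unsigned long long  *user_regs;         /* consumed by the dwarf unwinder */
        void                *user_stack;        /* stack dump to re-walk */
        unsigned long long   user_stack_size;
};
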
diff --git a/tools/perf/builtin-sched.c b/tools/perf/builtin-sched.c
index 7a9ad2b1ee7601de26ed2d97ff91fae39a46f185..a25a023965bb2751e3499132a53482a129fbdf10 100644
@@ -43,11 +43,6 @@ static u64                   sleep_measurement_overhead;
 
 static unsigned long           nr_tasks;
 
-struct perf_sched {
-       struct perf_tool    tool;
-       struct perf_session *session;
-};
-
 struct sched_atom;
 
 struct task_desc {
@@ -734,46 +729,30 @@ struct trace_sched_handler {
        void (*switch_event)(struct trace_switch_event *,
                             struct machine *,
                             struct event_format *,
-                            int cpu,
-                            u64 timestamp,
-                            struct thread *thread);
+                            struct perf_sample *sample);
 
        void (*runtime_event)(struct trace_runtime_event *,
                              struct machine *,
-                             struct event_format *,
-                             int cpu,
-                             u64 timestamp,
-                             struct thread *thread);
+                             struct perf_sample *sample);
 
        void (*wakeup_event)(struct trace_wakeup_event *,
                             struct machine *,
                             struct event_format *,
-                            int cpu,
-                            u64 timestamp,
-                            struct thread *thread);
+                            struct perf_sample *sample);
 
        void (*fork_event)(struct trace_fork_event *,
-                          struct event_format *,
-                          int cpu,
-                          u64 timestamp,
-                          struct thread *thread);
+                          struct event_format *event);
 
        void (*migrate_task_event)(struct trace_migrate_task_event *,
-                          struct machine *machine,
-                          struct event_format *,
-                          int cpu,
-                          u64 timestamp,
-                          struct thread *thread);
+                                  struct machine *machine,
+                                  struct perf_sample *sample);
 };
 
 
 static void
 replay_wakeup_event(struct trace_wakeup_event *wakeup_event,
                    struct machine *machine __used,
-                   struct event_format *event,
-                   int cpu __used,
-                   u64 timestamp __used,
-                   struct thread *thread __used)
+                   struct event_format *event, struct perf_sample *sample)
 {
        struct task_desc *waker, *wakee;
 
@@ -789,7 +768,7 @@ replay_wakeup_event(struct trace_wakeup_event *wakeup_event,
        waker = register_pid(wakeup_event->common_pid, "<unknown>");
        wakee = register_pid(wakeup_event->pid, wakeup_event->comm);
 
-       add_sched_event_wakeup(waker, timestamp, wakee);
+       add_sched_event_wakeup(waker, sample->time, wakee);
 }
 
 static u64 cpu_last_switched[MAX_CPUS];
@@ -798,12 +777,11 @@ static void
 replay_switch_event(struct trace_switch_event *switch_event,
                    struct machine *machine __used,
                    struct event_format *event,
-                   int cpu,
-                   u64 timestamp,
-                   struct thread *thread __used)
+                   struct perf_sample *sample)
 {
        struct task_desc *prev, __used *next;
-       u64 timestamp0;
+       u64 timestamp0, timestamp = sample->time;
+       int cpu = sample->cpu;
        s64 delta;
 
        if (verbose)
@@ -840,10 +818,7 @@ replay_switch_event(struct trace_switch_event *switch_event,
 
 static void
 replay_fork_event(struct trace_fork_event *fork_event,
-                 struct event_format *event,
-                 int cpu __used,
-                 u64 timestamp __used,
-                 struct thread *thread __used)
+                 struct event_format *event)
 {
        if (verbose) {
                printf("sched_fork event %p\n", event);
@@ -949,10 +924,7 @@ static void thread_atoms_insert(struct thread *thread)
 
 static void
 latency_fork_event(struct trace_fork_event *fork_event __used,
-                  struct event_format *event __used,
-                  int cpu __used,
-                  u64 timestamp __used,
-                  struct thread *thread __used)
+                  struct event_format *event __used)
 {
        /* should insert the newcomer */
 }
@@ -1032,13 +1004,12 @@ static void
 latency_switch_event(struct trace_switch_event *switch_event,
                     struct machine *machine,
                     struct event_format *event __used,
-                    int cpu,
-                    u64 timestamp,
-                    struct thread *thread __used)
+                    struct perf_sample *sample)
 {
        struct work_atoms *out_events, *in_events;
        struct thread *sched_out, *sched_in;
-       u64 timestamp0;
+       u64 timestamp0, timestamp = sample->time;
+       int cpu = sample->cpu;
        s64 delta;
 
        BUG_ON(cpu >= MAX_CPUS || cpu < 0);
@@ -1083,14 +1054,12 @@ latency_switch_event(struct trace_switch_event *switch_event,
 
 static void
 latency_runtime_event(struct trace_runtime_event *runtime_event,
-                    struct machine *machine,
-                    struct event_format *event __used,
-                    int cpu,
-                    u64 timestamp,
-                    struct thread *this_thread __used)
+                     struct machine *machine, struct perf_sample *sample)
 {
        struct thread *thread = machine__findnew_thread(machine, runtime_event->pid);
        struct work_atoms *atoms = thread_atoms_search(&atom_root, thread, &cmp_pid);
+       u64 timestamp = sample->time;
+       int cpu = sample->cpu;
 
        BUG_ON(cpu >= MAX_CPUS || cpu < 0);
        if (!atoms) {
@@ -1106,15 +1075,13 @@ latency_runtime_event(struct trace_runtime_event *runtime_event,
 
 static void
 latency_wakeup_event(struct trace_wakeup_event *wakeup_event,
-                    struct machine *machine,
-                    struct event_format *__event __used,
-                    int cpu __used,
-                    u64 timestamp,
-                    struct thread *thread __used)
+                    struct machine *machine, struct event_format *event __used,
+                    struct perf_sample *sample)
 {
        struct work_atoms *atoms;
        struct work_atom *atom;
        struct thread *wakee;
+       u64 timestamp = sample->time;
 
        /* Note for later, it may be interesting to observe the failing cases */
        if (!wakeup_event->success)
@@ -1154,12 +1121,9 @@ latency_wakeup_event(struct trace_wakeup_event *wakeup_event,
 
 static void
 latency_migrate_task_event(struct trace_migrate_task_event *migrate_task_event,
-                    struct machine *machine,
-                    struct event_format *__event __used,
-                    int cpu __used,
-                    u64 timestamp,
-                    struct thread *thread __used)
+                          struct machine *machine, struct perf_sample *sample)
 {
+       u64 timestamp = sample->time;
        struct work_atoms *atoms;
        struct work_atom *atom;
        struct thread *migrant;
@@ -1369,7 +1333,7 @@ process_sched_wakeup_event(struct perf_tool *tool __used,
                           struct event_format *event,
                           struct perf_sample *sample,
                           struct machine *machine,
-                          struct thread *thread)
+                          struct thread *thread __used)
 {
        void *data = sample->raw_data;
        struct trace_wakeup_event wakeup_event;
@@ -1383,8 +1347,7 @@ process_sched_wakeup_event(struct perf_tool *tool __used,
        FILL_FIELD(wakeup_event, cpu, event, data);
 
        if (trace_handler->wakeup_event)
-               trace_handler->wakeup_event(&wakeup_event, machine, event,
-                                           sample->cpu, sample->time, thread);
+               trace_handler->wakeup_event(&wakeup_event, machine, event, sample);
 }
 
 /*
@@ -1404,15 +1367,13 @@ static void
 map_switch_event(struct trace_switch_event *switch_event,
                 struct machine *machine,
                 struct event_format *event __used,
-                int this_cpu,
-                u64 timestamp,
-                struct thread *thread __used)
+                struct perf_sample *sample)
 {
        struct thread *sched_out __used, *sched_in;
        int new_shortname;
-       u64 timestamp0;
+       u64 timestamp0, timestamp = sample->time;
        s64 delta;
-       int cpu;
+       int cpu, this_cpu = sample->cpu;
 
        BUG_ON(this_cpu >= MAX_CPUS || this_cpu < 0);
 
@@ -1484,7 +1445,7 @@ process_sched_switch_event(struct perf_tool *tool __used,
                           struct event_format *event,
                           struct perf_sample *sample,
                           struct machine *machine,
-                          struct thread *thread)
+                          struct thread *thread __used)
 {
        int this_cpu = sample->cpu;
        void *data = sample->raw_data;
@@ -1509,8 +1470,7 @@ process_sched_switch_event(struct perf_tool *tool __used,
                        nr_context_switch_bugs++;
        }
        if (trace_handler->switch_event)
-               trace_handler->switch_event(&switch_event, machine, event,
-                                           this_cpu, sample->time, thread);
+               trace_handler->switch_event(&switch_event, machine, event, sample);
 
        curr_pid[this_cpu] = switch_event.next_pid;
 }
@@ -1520,7 +1480,7 @@ process_sched_runtime_event(struct perf_tool *tool __used,
                            struct event_format *event,
                            struct perf_sample *sample,
                            struct machine *machine,
-                           struct thread *thread)
+                           struct thread *thread __used)
 {
        void *data = sample->raw_data;
        struct trace_runtime_event runtime_event;
@@ -1531,8 +1491,7 @@ process_sched_runtime_event(struct perf_tool *tool __used,
        FILL_FIELD(runtime_event, vruntime, event, data);
 
        if (trace_handler->runtime_event)
-               trace_handler->runtime_event(&runtime_event, machine, event,
-                                            sample->cpu, sample->time, thread);
+               trace_handler->runtime_event(&runtime_event, machine, sample);
 }
 
 static void
@@ -1540,7 +1499,7 @@ process_sched_fork_event(struct perf_tool *tool __used,
                         struct event_format *event,
                         struct perf_sample *sample,
                         struct machine *machine __used,
-                        struct thread *thread)
+                        struct thread *thread __used)
 {
        void *data = sample->raw_data;
        struct trace_fork_event fork_event;
@@ -1553,8 +1512,7 @@ process_sched_fork_event(struct perf_tool *tool __used,
        FILL_FIELD(fork_event, child_pid, event, data);
 
        if (trace_handler->fork_event)
-               trace_handler->fork_event(&fork_event, event,
-                                         sample->cpu, sample->time, thread);
+               trace_handler->fork_event(&fork_event, event);
 }
 
 static void
@@ -1573,7 +1531,7 @@ process_sched_migrate_task_event(struct perf_tool *tool __used,
                                 struct event_format *event,
                                 struct perf_sample *sample,
                                 struct machine *machine,
-                                struct thread *thread)
+                                struct thread *thread __used)
 {
        void *data = sample->raw_data;
        struct trace_migrate_task_event migrate_task_event;
@@ -1586,9 +1544,7 @@ process_sched_migrate_task_event(struct perf_tool *tool __used,
        FILL_FIELD(migrate_task_event, cpu, event, data);
 
        if (trace_handler->migrate_task_event)
-               trace_handler->migrate_task_event(&migrate_task_event, machine,
-                                                 event, sample->cpu,
-                                                 sample->time, thread);
+               trace_handler->migrate_task_event(&migrate_task_event, machine, sample);
 }
 
 typedef void (*tracepoint_handler)(struct perf_tool *tool, struct event_format *event,
@@ -1596,14 +1552,12 @@ typedef void (*tracepoint_handler)(struct perf_tool *tool, struct event_format *
                                   struct machine *machine,
                                   struct thread *thread);
 
-static int perf_sched__process_tracepoint_sample(struct perf_tool *tool,
+static int perf_sched__process_tracepoint_sample(struct perf_tool *tool __used,
                                                 union perf_event *event __used,
                                                 struct perf_sample *sample,
                                                 struct perf_evsel *evsel,
                                                 struct machine *machine)
 {
-       struct perf_sched *sched = container_of(tool, struct perf_sched, tool);
-       struct pevent *pevent = sched->session->pevent;
        struct thread *thread = machine__findnew_thread(machine, sample->pid);
 
        if (thread == NULL) {
@@ -1617,25 +1571,18 @@ static int perf_sched__process_tracepoint_sample(struct perf_tool *tool,
 
        if (evsel->handler.func != NULL) {
                tracepoint_handler f = evsel->handler.func;
-
-               if (evsel->handler.data == NULL)
-                       evsel->handler.data = pevent_find_event(pevent,
-                                                         evsel->attr.config);
-
-               f(tool, evsel->handler.data, sample, machine, thread);
+               f(tool, evsel->tp_format, sample, machine, thread);
        }
 
        return 0;
 }
 
-static struct perf_sched perf_sched = {
-       .tool = {
-               .sample          = perf_sched__process_tracepoint_sample,
-               .comm            = perf_event__process_comm,
-               .lost            = perf_event__process_lost,
-               .fork            = perf_event__process_task,
-               .ordered_samples = true,
-       },
+static struct perf_tool perf_sched = {
+       .sample          = perf_sched__process_tracepoint_sample,
+       .comm            = perf_event__process_comm,
+       .lost            = perf_event__process_lost,
+       .fork            = perf_event__process_task,
+       .ordered_samples = true,
 };
 
 static void read_events(bool destroy, struct perf_session **psession)
@@ -1652,18 +1599,15 @@ static void read_events(bool destroy, struct perf_session **psession)
        };
        struct perf_session *session;
 
-       session = perf_session__new(input_name, O_RDONLY, 0, false,
-                                   &perf_sched.tool);
+       session = perf_session__new(input_name, O_RDONLY, 0, false, &perf_sched);
        if (session == NULL)
                die("No Memory");
 
-       perf_sched.session = session;
-
        err = perf_session__set_tracepoints_handlers(session, handlers);
        assert(err == 0);
 
        if (perf_session__has_traces(session, "record -R")) {
-               err = perf_session__process_events(session, &perf_sched.tool);
+               err = perf_session__process_events(session, &perf_sched);
                if (err)
                        die("Failed to process events, error %d", err);
 
diff --git a/tools/perf/builtin-script.c b/tools/perf/builtin-script.c
index 1e60ab70b2b14789b17a2a7199f5a8d175709242..2d6e3b226aad511ef04ba5d0bd20bf039c7e1ddb 100644
@@ -28,11 +28,6 @@ static bool                  system_wide;
 static const char              *cpu_list;
 static DECLARE_BITMAP(cpu_bitmap, MAX_NR_CPUS);
 
-struct perf_script {
-       struct perf_tool    tool;
-       struct perf_session *session;
-};
-
 enum perf_output_field {
        PERF_OUTPUT_COMM            = 1U << 0,
        PERF_OUTPUT_TID             = 1U << 1,
@@ -262,14 +257,11 @@ static int perf_session__check_output_opt(struct perf_session *session)
        return 0;
 }
 
-static void print_sample_start(struct pevent *pevent,
-                              struct perf_sample *sample,
+static void print_sample_start(struct perf_sample *sample,
                               struct thread *thread,
                               struct perf_evsel *evsel)
 {
-       int type;
        struct perf_event_attr *attr = &evsel->attr;
-       struct event_format *event;
        const char *evname = NULL;
        unsigned long secs;
        unsigned long usecs;
@@ -307,20 +299,7 @@ static void print_sample_start(struct pevent *pevent,
        }
 
        if (PRINT_FIELD(EVNAME)) {
-               if (attr->type == PERF_TYPE_TRACEPOINT) {
-                       /*
-                        * XXX Do we really need this here?
-                        * perf_evlist__set_tracepoint_names should have done
-                        * this already
-                        */
-                       type = trace_parse_common_type(pevent,
-                                                      sample->raw_data);
-                       event = pevent_find_event(pevent, type);
-                       if (event)
-                               evname = event->name;
-               } else
-                       evname = perf_evsel__name(evsel);
-
+               evname = perf_evsel__name(evsel);
                printf("%s: ", evname ? evname : "[unknown]");
        }
 }
@@ -401,7 +380,7 @@ static void print_sample_bts(union perf_event *event,
                        printf(" ");
                else
                        printf("\n");
-               perf_event__print_ip(event, sample, machine,
+               perf_evsel__print_ip(evsel, event, sample, machine,
                                     PRINT_FIELD(SYM), PRINT_FIELD(DSO),
                                     PRINT_FIELD(SYMOFFSET));
        }
@@ -415,19 +394,17 @@ static void print_sample_bts(union perf_event *event,
        printf("\n");
 }
 
-static void process_event(union perf_event *event __unused,
-                         struct pevent *pevent,
-                         struct perf_sample *sample,
-                         struct perf_evsel *evsel,
-                         struct machine *machine,
-                         struct thread *thread)
+static void process_event(union perf_event *event, struct perf_sample *sample,
+                         struct perf_evsel *evsel, struct machine *machine,
+                         struct addr_location *al)
 {
        struct perf_event_attr *attr = &evsel->attr;
+       struct thread *thread = al->thread;
 
        if (output[attr->type].fields == 0)
                return;
 
-       print_sample_start(pevent, sample, thread, evsel);
+       print_sample_start(sample, thread, evsel);
 
        if (is_bts_event(attr)) {
                print_sample_bts(event, sample, evsel, machine, thread);
@@ -435,9 +412,8 @@ static void process_event(union perf_event *event __unused,
        }
 
        if (PRINT_FIELD(TRACE))
-               print_trace_event(pevent, sample->cpu, sample->raw_data,
-                                 sample->raw_size);
-
+               event_format__print(evsel->tp_format, sample->cpu,
+                                   sample->raw_data, sample->raw_size);
        if (PRINT_FIELD(ADDR))
                print_sample_addr(event, sample, machine, thread, attr);
 
@@ -446,7 +422,7 @@ static void process_event(union perf_event *event __unused,
                        printf(" ");
                else
                        printf("\n");
-               perf_event__print_ip(event, sample, machine,
+               perf_evsel__print_ip(evsel, event, sample, machine,
                                     PRINT_FIELD(SYM), PRINT_FIELD(DSO),
                                     PRINT_FIELD(SYMOFFSET));
        }
@@ -505,7 +481,6 @@ static int process_sample_event(struct perf_tool *tool __used,
                                struct machine *machine)
 {
        struct addr_location al;
-       struct perf_script *scr = container_of(tool, struct perf_script, tool);
        struct thread *thread = machine__findnew_thread(machine, event->ip.tid);
 
        if (thread == NULL) {
@@ -537,27 +512,24 @@ static int process_sample_event(struct perf_tool *tool __used,
        if (cpu_list && !test_bit(sample->cpu, cpu_bitmap))
                return 0;
 
-       scripting_ops->process_event(event, scr->session->pevent,
-                                    sample, evsel, machine, thread);
+       scripting_ops->process_event(event, sample, evsel, machine, &al);
 
        evsel->hists.stats.total_period += sample->period;
        return 0;
 }
 
-static struct perf_script perf_script = {
-       .tool = {
-               .sample          = process_sample_event,
-               .mmap            = perf_event__process_mmap,
-               .comm            = perf_event__process_comm,
-               .exit            = perf_event__process_task,
-               .fork            = perf_event__process_task,
-               .attr            = perf_event__process_attr,
-               .event_type      = perf_event__process_event_type,
-               .tracing_data    = perf_event__process_tracing_data,
-               .build_id        = perf_event__process_build_id,
-               .ordered_samples = true,
-               .ordering_requires_timestamps = true,
-       },
+static struct perf_tool perf_script = {
+       .sample          = process_sample_event,
+       .mmap            = perf_event__process_mmap,
+       .comm            = perf_event__process_comm,
+       .exit            = perf_event__process_task,
+       .fork            = perf_event__process_task,
+       .attr            = perf_event__process_attr,
+       .event_type      = perf_event__process_event_type,
+       .tracing_data    = perf_event__process_tracing_data,
+       .build_id        = perf_event__process_build_id,
+       .ordered_samples = true,
+       .ordering_requires_timestamps = true,
 };
 
 extern volatile int session_done;
@@ -573,7 +545,7 @@ static int __cmd_script(struct perf_session *session)
 
        signal(SIGINT, sig_handler);
 
-       ret = perf_session__process_events(session, &perf_script.tool);
+       ret = perf_session__process_events(session, &perf_script);
 
        if (debug_mode)
                pr_err("Misordered timestamps: %" PRIu64 "\n", nr_unordered);
@@ -1356,12 +1328,10 @@ int cmd_script(int argc, const char **argv, const char *prefix __used)
                setup_pager();
 
        session = perf_session__new(input_name, O_RDONLY, 0, false,
-                                   &perf_script.tool);
+                                   &perf_script);
        if (session == NULL)
                return -ENOMEM;
 
-       perf_script.session = session;
-
        if (cpu_list) {
                if (perf_session__cpu_bitmap(session, cpu_list, cpu_bitmap))
                        return -1;
index 861f0aec77aeacf099653d87ca9c6df9b6f2787c..d53d8ab099b17f43e77a241442b2bf211ecd4d10 100644 (file)
@@ -281,13 +281,9 @@ static int create_perf_stat_counter(struct perf_evsel *evsel,
                                    struct perf_evsel *first)
 {
        struct perf_event_attr *attr = &evsel->attr;
-       struct xyarray *group_fd = NULL;
        bool exclude_guest_missing = false;
        int ret;
 
-       if (group && evsel != first)
-               group_fd = first->fd;
-
        if (scale)
                attr->read_format = PERF_FORMAT_TOTAL_TIME_ENABLED |
                                    PERF_FORMAT_TOTAL_TIME_RUNNING;
@@ -299,8 +295,7 @@ retry:
                evsel->attr.exclude_guest = evsel->attr.exclude_host = 0;
 
        if (perf_target__has_cpu(&target)) {
-               ret = perf_evsel__open_per_cpu(evsel, evsel_list->cpus,
-                                              group, group_fd);
+               ret = perf_evsel__open_per_cpu(evsel, evsel_list->cpus);
                if (ret)
                        goto check_ret;
                return 0;
@@ -311,8 +306,7 @@ retry:
                attr->enable_on_exec = 1;
        }
 
-       ret = perf_evsel__open_per_thread(evsel, evsel_list->threads,
-                                         group, group_fd);
+       ret = perf_evsel__open_per_thread(evsel, evsel_list->threads);
        if (!ret)
                return 0;
        /* fall through */
@@ -483,7 +477,10 @@ static int run_perf_stat(int argc __used, const char **argv)
                close(child_ready_pipe[0]);
        }
 
-       first = list_entry(evsel_list->entries.next, struct perf_evsel, node);
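+       /*
+        * Grouping is now configured on the evlist itself:
+        * perf_evlist__set_leader() marks the first evsel as the leader
+        * of all the others.
+        */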
+       if (group)
+               perf_evlist__set_leader(evsel_list);
+
+       first = perf_evlist__first(evsel_list);
 
        list_for_each_entry(counter, &evsel_list->entries, node) {
                if (create_perf_stat_counter(counter, first) < 0) {
index 1d592f5cbea9776fcc437e012dc20c1911fd7137..381d5ab87124caf03c0a7caec19eb78961f60958 100644 (file)
@@ -294,7 +294,7 @@ static int test__open_syscall_event(void)
                goto out_thread_map_delete;
        }
 
-       if (perf_evsel__open_per_thread(evsel, threads, false, NULL) < 0) {
+       if (perf_evsel__open_per_thread(evsel, threads) < 0) {
                pr_debug("failed to open counter: %s, "
                         "tweak /proc/sys/kernel/perf_event_paranoid?\n",
                         strerror(errno));
@@ -369,7 +369,7 @@ static int test__open_syscall_event_on_all_cpus(void)
                goto out_thread_map_delete;
        }
 
-       if (perf_evsel__open(evsel, cpus, threads, false, NULL) < 0) {
+       if (perf_evsel__open(evsel, cpus, threads) < 0) {
                pr_debug("failed to open counter: %s, "
                         "tweak /proc/sys/kernel/perf_event_paranoid?\n",
                         strerror(errno));
@@ -533,7 +533,7 @@ static int test__basic_mmap(void)
 
                perf_evlist__add(evlist, evsels[i]);
 
-               if (perf_evsel__open(evsels[i], cpus, threads, false, NULL) < 0) {
+               if (perf_evsel__open(evsels[i], cpus, threads) < 0) {
                        pr_debug("failed to open counter: %s, "
                                 "tweak /proc/sys/kernel/perf_event_paranoid?\n",
                                 strerror(errno));
@@ -710,7 +710,7 @@ static int test__PERF_RECORD(void)
        /*
         * Config the evsels, setting attr->comm on the first one, etc.
         */
-       evsel = list_entry(evlist->entries.next, struct perf_evsel, node);
+       evsel = perf_evlist__first(evlist);
        evsel->attr.sample_type |= PERF_SAMPLE_CPU;
        evsel->attr.sample_type |= PERF_SAMPLE_TID;
        evsel->attr.sample_type |= PERF_SAMPLE_TIME;
@@ -737,7 +737,7 @@ static int test__PERF_RECORD(void)
         * Call sys_perf_event_open on all the fds on all the evsels,
         * grouping them if asked to.
         */
-       err = perf_evlist__open(evlist, opts.group);
+       err = perf_evlist__open(evlist);
        if (err < 0) {
                pr_debug("perf_evlist__open: %s\n", strerror(errno));
                goto out_delete_evlist;
index 68cd61ef6ac5440c57579e42a935261d440e20f2..0513aaa659f90010f91f2364baf7c853006e7035 100644 (file)
@@ -509,7 +509,7 @@ static void perf_top__handle_keypress(struct perf_top *top, int c)
                                prompt_integer(&counter, "Enter details event counter");
 
                                if (counter >= top->evlist->nr_entries) {
-                                       top->sym_evsel = list_entry(top->evlist->entries.next, struct perf_evsel, node);
+                                       top->sym_evsel = perf_evlist__first(top->evlist);
                                        fprintf(stderr, "Sorry, no such event, using %s.\n", perf_evsel__name(top->sym_evsel));
                                        sleep(1);
                                        break;
@@ -518,7 +518,7 @@ static void perf_top__handle_keypress(struct perf_top *top, int c)
                                        if (top->sym_evsel->idx == counter)
                                                break;
                        } else
-                               top->sym_evsel = list_entry(top->evlist->entries.next, struct perf_evsel, node);
+                               top->sym_evsel = perf_evlist__first(top->evlist);
                        break;
                case 'f':
                        prompt_integer(&top->count_filter, "Enter display event count filter");
@@ -783,8 +783,10 @@ static void perf_event__process_sample(struct perf_tool *tool,
 
                if ((sort__has_parent || symbol_conf.use_callchain) &&
                    sample->callchain) {
-                       err = machine__resolve_callchain(machine, al.thread,
-                                                        sample->callchain, &parent);
+                       err = machine__resolve_callchain(machine, evsel,
+                                                        al.thread, sample,
+                                                        &parent);
+
                        if (err)
                                return;
                }
@@ -884,17 +886,14 @@ static void perf_top__mmap_read(struct perf_top *top)
 
 static void perf_top__start_counters(struct perf_top *top)
 {
-       struct perf_evsel *counter, *first;
+       struct perf_evsel *counter;
        struct perf_evlist *evlist = top->evlist;
 
-       first = list_entry(evlist->entries.next, struct perf_evsel, node);
+       if (top->group)
+               perf_evlist__set_leader(evlist);
 
        list_for_each_entry(counter, &evlist->entries, node) {
                struct perf_event_attr *attr = &counter->attr;
-               struct xyarray *group_fd = NULL;
-
-               if (top->group && counter != first)
-                       group_fd = first->fd;
 
                attr->sample_type = PERF_SAMPLE_IP | PERF_SAMPLE_TID;
 
@@ -925,8 +924,7 @@ retry_sample_id:
                attr->sample_id_all = top->sample_id_all_missing ? 0 : 1;
 try_again:
                if (perf_evsel__open(counter, top->evlist->cpus,
-                                    top->evlist->threads, top->group,
-                                    group_fd) < 0) {
+                                    top->evlist->threads) < 0) {
                        int err = errno;
 
                        if (err == EPERM || err == EACCES) {
@@ -1328,7 +1326,7 @@ int cmd_top(int argc, const char **argv, const char *prefix __used)
                        pos->attr.sample_period = top.default_interval;
        }
 
-       top.sym_evsel = list_entry(top.evlist->entries.next, struct perf_evsel, node);
+       top.sym_evsel = perf_evlist__first(top.evlist);
 
        symbol_conf.priv_size = sizeof(struct annotation);
 
index d695fe40fbff3ff3f442d6a8ced39746fa668695..0303ec69227490ae630c8265be5001b8a51a1422 100644 (file)
@@ -18,7 +18,7 @@ perf-stat                     mainporcelain common
 perf-timechart                 mainporcelain common
 perf-top                       mainporcelain common
 perf-script                    mainporcelain common
-perf-probe                     mainporcelain common
+perf-probe                     mainporcelain full
 perf-kmem                      mainporcelain common
 perf-lock                      mainporcelain common
 perf-kvm                       mainporcelain common
index 6c18785a6417986dabe47f775b0fd07067447012..2f1156a62ab7c1cf97aadc8631257ba88032e916 100644 (file)
@@ -154,3 +154,28 @@ int main(void)
        return 0;
 }
 endef
+
+ifndef NO_LIBUNWIND
+define SOURCE_LIBUNWIND
+#include <libunwind.h>
+#include <stdlib.h>
+
+extern int UNW_OBJ(dwarf_search_unwind_table) (unw_addr_space_t as,
+                                      unw_word_t ip,
+                                      unw_dyn_info_t *di,
+                                      unw_proc_info_t *pi,
+                                      int need_unwind_info, void *arg);
+
+
+#define dwarf_search_unwind_table UNW_OBJ(dwarf_search_unwind_table)
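+
+/*
+ * This snippet is never run; it only needs to compile and link against
+ * libunwind. If it does, the build can enable DWARF-based call-graph
+ * support.
+ */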
+
+int main(void)
+{
+       unw_addr_space_t addr_space;
+       addr_space = unw_create_addr_space(NULL, 0);
+       unw_init_remote(NULL, addr_space, NULL);
+       dwarf_search_unwind_table(addr_space, 0, NULL, NULL, 0, NULL);
+       return 0;
+}
+endef
+endif
index 2b2e225a4d4c6a655690b30b622669a597521570..e7840e500715cbeda2ec50ff1d875310b57a2226 100644 (file)
@@ -24,6 +24,39 @@ const char perf_more_info_string[] =
 int use_browser = -1;
 static int use_pager = -1;
 
+struct cmd_struct {
+       const char *cmd;
+       int (*fn)(int, const char **, const char *);
+       int option;
+};
+
+static struct cmd_struct commands[] = {
+       { "buildid-cache", cmd_buildid_cache, 0 },
+       { "buildid-list", cmd_buildid_list, 0 },
+       { "diff",       cmd_diff,       0 },
+       { "evlist",     cmd_evlist,     0 },
+       { "help",       cmd_help,       0 },
+       { "list",       cmd_list,       0 },
+       { "record",     cmd_record,     0 },
+       { "report",     cmd_report,     0 },
+       { "bench",      cmd_bench,      0 },
+       { "stat",       cmd_stat,       0 },
+       { "timechart",  cmd_timechart,  0 },
+       { "top",        cmd_top,        0 },
+       { "annotate",   cmd_annotate,   0 },
+       { "version",    cmd_version,    0 },
+       { "script",     cmd_script,     0 },
+       { "sched",      cmd_sched,      0 },
+#ifndef NO_LIBELF_SUPPORT
+       { "probe",      cmd_probe,      0 },
+#endif
+       { "kmem",       cmd_kmem,       0 },
+       { "lock",       cmd_lock,       0 },
+       { "kvm",        cmd_kvm,        0 },
+       { "test",       cmd_test,       0 },
+       { "inject",     cmd_inject,     0 },
+};
+
 struct pager_config {
        const char *cmd;
        int val;
@@ -160,6 +193,14 @@ static int handle_options(const char ***argv, int *argc, int *envchanged)
                        fprintf(stderr, "dir: %s\n", debugfs_mountpoint);
                        if (envchanged)
                                *envchanged = 1;
+               } else if (!strcmp(cmd, "--list-cmds")) {
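+                       /*
+                        * Print the known subcommands on one line, e.g. for
+                        * shell completion scripts to consume.
+                        */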
+                       unsigned int i;
+
+                       for (i = 0; i < ARRAY_SIZE(commands); i++) {
+                               struct cmd_struct *p = commands+i;
+                               printf("%s ", p->cmd);
+                       }
+                       exit(0);
                } else {
                        fprintf(stderr, "Unknown option: %s\n", cmd);
                        usage(perf_usage_string);
@@ -245,12 +286,6 @@ const char perf_version_string[] = PERF_VERSION;
  */
 #define NEED_WORK_TREE (1<<2)
 
-struct cmd_struct {
-       const char *cmd;
-       int (*fn)(int, const char **, const char *);
-       int option;
-};
-
 static int run_builtin(struct cmd_struct *p, int argc, const char **argv)
 {
        int status;
@@ -296,30 +331,6 @@ static int run_builtin(struct cmd_struct *p, int argc, const char **argv)
 static void handle_internal_command(int argc, const char **argv)
 {
        const char *cmd = argv[0];
-       static struct cmd_struct commands[] = {
-               { "buildid-cache", cmd_buildid_cache, 0 },
-               { "buildid-list", cmd_buildid_list, 0 },
-               { "diff",       cmd_diff,       0 },
-               { "evlist",     cmd_evlist,     0 },
-               { "help",       cmd_help,       0 },
-               { "list",       cmd_list,       0 },
-               { "record",     cmd_record,     0 },
-               { "report",     cmd_report,     0 },
-               { "bench",      cmd_bench,      0 },
-               { "stat",       cmd_stat,       0 },
-               { "timechart",  cmd_timechart,  0 },
-               { "top",        cmd_top,        0 },
-               { "annotate",   cmd_annotate,   0 },
-               { "version",    cmd_version,    0 },
-               { "script",     cmd_script,     0 },
-               { "sched",      cmd_sched,      0 },
-               { "probe",      cmd_probe,      0 },
-               { "kmem",       cmd_kmem,       0 },
-               { "lock",       cmd_lock,       0 },
-               { "kvm",        cmd_kvm,        0 },
-               { "test",       cmd_test,       0 },
-               { "inject",     cmd_inject,     0 },
-       };
        unsigned int i;
        static const char ext[] = STRIP_EXTENSION;
 
index f960ccb2edc6f38f8a7b7a3f0f1b740cbd351c2b..87f4ec6d1f367d76791855c58f25b432d0b769f2 100644 (file)
@@ -209,9 +209,15 @@ void pthread__unblock_sigwinch(void);
 
 #include "util/target.h"
 
+enum perf_call_graph_mode {
+       CALLCHAIN_NONE,
+       CALLCHAIN_FP,
+       CALLCHAIN_DWARF
+};
+
 struct perf_record_opts {
        struct perf_target target;
-       bool         call_graph;
+       int          call_graph;
        bool         group;
        bool         inherit_stat;
        bool         no_delay;
@@ -230,6 +236,7 @@ struct perf_record_opts {
        u64          branch_stack;
        u64          default_interval;
        u64          user_interval;
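+       /* Size of the user stack dump saved for DWARF-based unwinding. */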
+       u16          stack_dump_size;
 };
 
 #endif
diff --git a/tools/perf/scripts/python/Perf-Trace-Util/lib/Perf/Trace/EventClass.py b/tools/perf/scripts/python/Perf-Trace-Util/lib/Perf/Trace/EventClass.py
new file mode 100755 (executable)
index 0000000..9e09857
--- /dev/null
@@ -0,0 +1,94 @@
+# EventClass.py
+#
+# This is a library defining some event type classes, which can be
+# used by other scripts to analyze perf samples.
+#
+# Currently just a few example classes are defined: PerfEvent is the
+# base class for all perf event samples, PebsEvent is a hardware-based
+# Intel x86 PEBS event, and users can add more SW/HW event classes as
+# required.
+
+import struct
+
+# Event types; users can add more here
+EVTYPE_GENERIC  = 0
+EVTYPE_PEBS     = 1     # Basic PEBS event
+EVTYPE_PEBS_LL  = 2     # PEBS event with load latency info
+EVTYPE_IBS      = 3
+
+#
+# Currently there is no good way to tell the event type other than by
+# the size of the raw buffer: a raw PEBS event with load latency data
+# is 176 bytes, while a plain PEBS event is 144 bytes.
+#
+def create_event(name, comm, dso, symbol, raw_buf):
+        if (len(raw_buf) == 144):
+                event = PebsEvent(name, comm, dso, symbol, raw_buf)
+        elif (len(raw_buf) == 176):
+                event = PebsNHM(name, comm, dso, symbol, raw_buf)
+        else:
+                event = PerfEvent(name, comm, dso, symbol, raw_buf)
+
+        return event
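+
+# A minimal usage sketch (hypothetical values, just to illustrate the
+# size-based dispatch above):
+#     raw = '\0' * 144              # plain PEBS record
+#     ev  = create_event("cycles", "bash", "[kernel]", "sym", raw)
+#     assert isinstance(ev, PebsEvent)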
+
+class PerfEvent(object):
+        event_num = 0
+        def __init__(self, name, comm, dso, symbol, raw_buf, ev_type=EVTYPE_GENERIC):
+                self.name       = name
+                self.comm       = comm
+                self.dso        = dso
+                self.symbol     = symbol
+                self.raw_buf    = raw_buf
+                self.ev_type    = ev_type
+                PerfEvent.event_num += 1
+
+        def show(self):
+                print "PMU event: name=%12s, symbol=%24s, comm=%8s, dso=%12s" % (self.name, self.symbol, self.comm, self.dso)
+
+#
+# Basic Intel PEBS (Precise Event-based Sampling) event, whose raw buffer
+# contains the context info from when the event happened: the EFLAGS and
+# linear IP info, as well as all the registers.
+#
+class PebsEvent(PerfEvent):
+        pebs_num = 0
+        def __init__(self, name, comm, dso, symbol, raw_buf, ev_type=EVTYPE_PEBS):
+                tmp_buf=raw_buf[0:80]
+                flags, ip, ax, bx, cx, dx, si, di, bp, sp = struct.unpack('QQQQQQQQQQ', tmp_buf)
+                self.flags = flags
+                self.ip    = ip
+                self.ax    = ax
+                self.bx    = bx
+                self.cx    = cx
+                self.dx    = dx
+                self.si    = si
+                self.di    = di
+                self.bp    = bp
+                self.sp    = sp
+
+                PerfEvent.__init__(self, name, comm, dso, symbol, raw_buf, ev_type)
+                PebsEvent.pebs_num += 1
+                del tmp_buf
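+
+# A synthetic record can be built with the same format string (a sketch,
+# not used by the library itself):
+#     regs = struct.pack('QQQQQQQQQQ', *range(10))   # flags, ip, ax..sp
+#     ev = PebsEvent("cycles", "bash", "[kernel]", "sym", regs + '\0' * 64)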
+
+#
+# Intel Nehalem and Westmere support PEBS plus Load Latency info, which
+# lies in the four 64-bit words written after the PEBS data:
+#       Status: records the IA32_PERF_GLOBAL_STATUS register value
+#       DLA:    Data Linear Address (EIP)
+#       DSE:    Data Source Encoding, where the latency happens, hit or miss
+#               in L1/L2/L3 or IO operations
+#       LAT:    the actual latency in cycles
+#
+class PebsNHM(PebsEvent):
+        pebs_nhm_num = 0
+        def __init__(self, name, comm, dso, symbol, raw_buf, ev_type=EVTYPE_PEBS_LL):
+                tmp_buf=raw_buf[144:176]
+                status, dla, dse, lat = struct.unpack('QQQQ', tmp_buf)
+                self.status = status
+                self.dla = dla
+                self.dse = dse
+                self.lat = lat
+
+                PebsEvent.__init__(self, name, comm, dso, symbol, raw_buf, ev_type)
+                PebsNHM.pebs_nhm_num += 1
+                del tmp_buf
diff --git a/tools/perf/scripts/python/event_analyzing_sample.py b/tools/perf/scripts/python/event_analyzing_sample.py
new file mode 100644 (file)
index 0000000..163c39f
--- /dev/null
@@ -0,0 +1,189 @@
+# event_analyzing_sample.py: general event handler in python
+#
+# perf report is already very powerful, with annotation integrated, and
+# this script does not try to match it; instead it gives end users and
+# developers a flexible way to analyze events other than trace points.
+#
+# The two database-related functions in this script just show how to gather
+# the basic information; users can modify them or write their own functions
+# according to their specific requirements.
+#
+# The first function, "show_general_events", does a basic grouping of all
+# generic events with the help of sqlite, and the second, "show_pebs_ll",
+# handles an x86 HW PMU event: PEBS with load latency data.
+#
+
+import os
+import sys
+import math
+import struct
+import sqlite3
+
+sys.path.append(os.environ['PERF_EXEC_PATH'] + \
+        '/scripts/python/Perf-Trace-Util/lib/Perf/Trace')
+
+from perf_trace_context import *
+from EventClass import *
+
+#
+# If perf.data contains a large number of samples, the insert operations
+# become very time consuming (10+ minutes for 10000 samples) when the
+# .db database is on disk. Moving the .db file to a RAM-based FS speeds
+# up the handling and cuts the time down to a few seconds.
+#
+con = sqlite3.connect("/dev/shm/perf.db")
+con.isolation_level = None
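+
+# Alternatively, sqlite3.connect(":memory:") would skip the filesystem
+# entirely; /dev/shm is used here so the .db file survives the run for
+# ad-hoc queries.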
+
+def trace_begin():
+       print "In trace_begin:\n"
+
+        #
+        # Create several tables at the start: pebs_ll is for PEBS data with
+        # load latency info, while gen_events is for general events.
+        #
+        con.execute("""
+                create table if not exists gen_events (
+                        name text,
+                        symbol text,
+                        comm text,
+                        dso text
+                );""")
+        con.execute("""
+                create table if not exists pebs_ll (
+                        name text,
+                        symbol text,
+                        comm text,
+                        dso text,
+                        flags integer,
+                        ip integer,
+                        status integer,
+                        dse integer,
+                        dla integer,
+                        lat integer
+                );""")
+
+#
+# Create an event object and insert it into the database, so that users
+# can do further analysis with simple database commands.
+#
+def process_event(param_dict):
+        event_attr = param_dict["attr"]
+        sample     = param_dict["sample"]
+        raw_buf    = param_dict["raw_buf"]
+        comm       = param_dict["comm"]
+        name       = param_dict["ev_name"]
+
+        # Symbol and dso info are not always resolved
+        if (param_dict.has_key("dso")):
+                dso = param_dict["dso"]
+        else:
+                dso = "Unknown_dso"
+
+        if (param_dict.has_key("symbol")):
+                symbol = param_dict["symbol"]
+        else:
+                symbol = "Unknown_symbol"
+
+        # Create the event object and insert it into the right table in the database
+        event = create_event(name, comm, dso, symbol, raw_buf)
+        insert_db(event)
+
+def insert_db(event):
+        if event.ev_type == EVTYPE_GENERIC:
+                con.execute("insert into gen_events values(?, ?, ?, ?)",
+                                (event.name, event.symbol, event.comm, event.dso))
+        elif event.ev_type == EVTYPE_PEBS_LL:
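+                # Mask off the top bit: Python's sqlite3 module binds
+                # integers as signed 64-bit, and raw kernel addresses
+                # would overflow that range.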
+                event.ip &= 0x7fffffffffffffff
+                event.dla &= 0x7fffffffffffffff
+                con.execute("insert into pebs_ll values (?, ?, ?, ?, ?, ?, ?, ?, ?, ?)",
+                        (event.name, event.symbol, event.comm, event.dso, event.flags,
+                                event.ip, event.status, event.dse, event.dla, event.lat))
+
+def trace_end():
+       print "In trace_end:\n"
+        # Show the basic info for the two types of event classes
+        show_general_events()
+        show_pebs_ll()
+        con.close()
+
+#
+# As the number of events may be very large, the histogram is not shown
+# on a linear scale but with a log2 algorithm.
+#
+
+def num2sym(num):
+        # Each number will have at least one '#'
+        snum = '#' * (int)(math.log(num, 2) + 1)
+        return snum
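+
+# For example: num2sym(1) -> '#', num2sym(8) -> '####',
+# num2sym(1024) -> '#' * 11.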
+
+def show_general_events():
+
+        # Check the total record number in the table
+        count = con.execute("select count(*) from gen_events")
+        for t in count:
+                print "There is %d records in gen_events table" % t[0]
+                if t[0] == 0:
+                        return
+
+        print "Statistics about the general events grouped by thread/symbol/dso: \n"
+
+        # Group by thread
+        commq = con.execute("select comm, count(comm) from gen_events group by comm order by -count(comm)")
+        print "\n%16s %8s %16s\n%s" % ("comm", "number", "histogram", "="*42)
+        for row in commq:
+             print "%16s %8d     %s" % (row[0], row[1], num2sym(row[1]))
+
+        # Group by symbol
+        print "\n%32s %8s %16s\n%s" % ("symbol", "number", "histogram", "="*58)
+        symbolq = con.execute("select symbol, count(symbol) from gen_events group by symbol order by -count(symbol)")
+        for row in symbolq:
+             print "%32s %8d     %s" % (row[0], row[1], num2sym(row[1]))
+
+        # Group by dso
+        print "\n%40s %8s %16s\n%s" % ("dso", "number", "histogram", "="*74)
+        dsoq = con.execute("select dso, count(dso) from gen_events group by dso order by -count(dso)")
+        for row in dsoq:
+             print "%40s %8d     %s" % (row[0], row[1], num2sym(row[1]))
+
+#
+# This function just shows the basic info, and we could do more with the
+# data in the tables, like checking the function parameters when some
+# big latency events happen.
+#
+def show_pebs_ll():
+
+        count = con.execute("select count(*) from pebs_ll")
+        for t in count:
+                print "There is %d records in pebs_ll table" % t[0]
+                if t[0] == 0:
+                        return
+
+        print "Statistics about the PEBS Load Latency events grouped by thread/symbol/dse/latency: \n"
+
+        # Group by thread
+        commq = con.execute("select comm, count(comm) from pebs_ll group by comm order by -count(comm)")
+        print "\n%16s %8s %16s\n%s" % ("comm", "number", "histogram", "="*42)
+        for row in commq:
+             print "%16s %8d     %s" % (row[0], row[1], num2sym(row[1]))
+
+        # Group by symbol
+        print "\n%32s %8s %16s\n%s" % ("symbol", "number", "histogram", "="*58)
+        symbolq = con.execute("select symbol, count(symbol) from pebs_ll group by symbol order by -count(symbol)")
+        for row in symbolq:
+             print "%32s %8d     %s" % (row[0], row[1], num2sym(row[1]))
+
+        # Group by dse
+        dseq = con.execute("select dse, count(dse) from pebs_ll group by dse order by -count(dse)")
+        print "\n%32s %8s %16s\n%s" % ("dse", "number", "histogram", "="*58)
+        for row in dseq:
+             print "%32s %8d     %s" % (row[0], row[1], num2sym(row[1]))
+
+        # Group by latency
+        latq = con.execute("select lat, count(lat) from pebs_ll group by lat order by lat")
+        print "\n%32s %8s %16s\n%s" % ("latency", "number", "histogram", "="*58)
+        for row in latq:
+             print "%32s %8d     %s" % (row[0], row[1], num2sym(row[1]))
+
+def trace_unhandled(event_name, context, event_fields_dict):
+               print ' '.join(['%s=%s' % (k, str(v)) for k, v in sorted(event_fields_dict.items())])
index 413bd62eedb14ab0a2a023b6ef8f654235e59cda..81bd8c2af7302948a7353022ccb28810733c73a6 100644 (file)
@@ -24,6 +24,7 @@ struct hist_browser {
        struct hist_entry   *he_selection;
        struct map_symbol   *selection;
        int                  print_seq;
+       bool                 show_dso;
        bool                 has_symbols;
 };
 
@@ -376,12 +377,19 @@ out:
 }
 
 static char *callchain_list__sym_name(struct callchain_list *cl,
-                                     char *bf, size_t bfsize)
+                                     char *bf, size_t bfsize, bool show_dso)
 {
+       int printed;
+
        if (cl->ms.sym)
-               return cl->ms.sym->name;
+               printed = scnprintf(bf, bfsize, "%s", cl->ms.sym->name);
+       else
+               printed = scnprintf(bf, bfsize, "%#" PRIx64, cl->ip);
+
+       if (show_dso)
+               scnprintf(bf + printed, bfsize - printed, " %s",
+                         cl->ms.map ? cl->ms.map->dso->short_name : "unknown");
 
-       snprintf(bf, bfsize, "%#" PRIx64, cl->ip);
        return bf;
 }
 
@@ -417,7 +425,7 @@ static int hist_browser__show_callchain_node_rb_tree(struct hist_browser *browse
                remaining -= cumul;
 
                list_for_each_entry(chain, &child->val, list) {
-                       char ipstr[BITS_PER_LONG / 4 + 1], *alloc_str;
+                       char bf[1024], *alloc_str;
                        const char *str;
                        int color;
                        bool was_first = first;
@@ -434,7 +442,8 @@ static int hist_browser__show_callchain_node_rb_tree(struct hist_browser *browse
                        }
 
                        alloc_str = NULL;
-                       str = callchain_list__sym_name(chain, ipstr, sizeof(ipstr));
+                       str = callchain_list__sym_name(chain, bf, sizeof(bf),
+                                                      browser->show_dso);
                        if (was_first) {
                                double percent = cumul * 100.0 / new_total;
 
@@ -493,7 +502,7 @@ static int hist_browser__show_callchain_node(struct hist_browser *browser,
        char folded_sign = ' ';
 
        list_for_each_entry(chain, &node->val, list) {
-               char ipstr[BITS_PER_LONG / 4 + 1], *s;
+               char bf[1024], *s;
                int color;
 
                folded_sign = callchain_list__folded(chain);
@@ -510,7 +519,8 @@ static int hist_browser__show_callchain_node(struct hist_browser *browser,
                        *is_current_entry = true;
                }
 
-               s = callchain_list__sym_name(chain, ipstr, sizeof(ipstr));
+               s = callchain_list__sym_name(chain, bf, sizeof(bf),
+                                            browser->show_dso);
                ui_browser__gotorc(&browser->b, row, 0);
                ui_browser__set_color(&browser->b, color);
                slsmg_write_nstring(" ", offset);
@@ -576,7 +586,7 @@ static int hist_browser__show_entry(struct hist_browser *browser,
        }
 
        if (row_offset == 0) {
-               hist_entry__snprintf(entry, s, sizeof(s), browser->hists);
+               hist_entry__sort_snprintf(entry, s, sizeof(s), browser->hists);
                percent = (entry->period * 100.0) / browser->hists->stats.total_period;
 
                ui_browser__set_percent_color(&browser->b, percent, current_entry);
@@ -830,7 +840,7 @@ static int hist_browser__fprintf_callchain_node_rb_tree(struct hist_browser *bro
                remaining -= cumul;
 
                list_for_each_entry(chain, &child->val, list) {
-                       char ipstr[BITS_PER_LONG / 4 + 1], *alloc_str;
+                       char bf[1024], *alloc_str;
                        const char *str;
                        bool was_first = first;
 
@@ -842,7 +852,8 @@ static int hist_browser__fprintf_callchain_node_rb_tree(struct hist_browser *bro
                        folded_sign = callchain_list__folded(chain);
 
                        alloc_str = NULL;
-                       str = callchain_list__sym_name(chain, ipstr, sizeof(ipstr));
+                       str = callchain_list__sym_name(chain, bf, sizeof(bf),
+                                                      browser->show_dso);
                        if (was_first) {
                                double percent = cumul * 100.0 / new_total;
 
@@ -880,10 +891,10 @@ static int hist_browser__fprintf_callchain_node(struct hist_browser *browser,
        int printed = 0;
 
        list_for_each_entry(chain, &node->val, list) {
-               char ipstr[BITS_PER_LONG / 4 + 1], *s;
+               char bf[1024], *s;
 
                folded_sign = callchain_list__folded(chain);
-               s = callchain_list__sym_name(chain, ipstr, sizeof(ipstr));
+               s = callchain_list__sym_name(chain, bf, sizeof(bf), browser->show_dso);
                printed += fprintf(fp, "%*s%c %s\n", offset, " ", folded_sign, s);
        }
 
@@ -920,7 +931,7 @@ static int hist_browser__fprintf_entry(struct hist_browser *browser,
        if (symbol_conf.use_callchain)
                folded_sign = hist_entry__folded(he);
 
-       hist_entry__snprintf(he, s, sizeof(s), browser->hists);
+       hist_entry__sort_snprintf(he, s, sizeof(s), browser->hists);
        percent = (he->period * 100.0) / browser->hists->stats.total_period;
 
        if (symbol_conf.use_callchain)
@@ -1133,6 +1144,9 @@ static int perf_evsel__hists_browse(struct perf_evsel *evsel, int nr_events,
                        continue;
                case 'd':
                        goto zoom_dso;
+               case 'V':
+                       browser->show_dso = !browser->show_dso;
+                       continue;
                case 't':
                        goto zoom_thread;
                case '/':
@@ -1164,6 +1178,7 @@ static int perf_evsel__hists_browse(struct perf_evsel *evsel, int nr_events,
                                        "d             Zoom into current DSO\n"
                                        "t             Zoom into current Thread\n"
                                        "P             Print histograms to perf.hist.N\n"
+                                       "V             Verbose (DSO names in callchains, etc)\n"
                                        "/             Filter symbol by name");
                        continue;
                case K_ENTER:
index ec12e0b4ded6567d1c673049cf39cea0f93891f0..26b5b652a8cded852c1e51d2ba49324a88fa3ae0 100644 (file)
@@ -3,6 +3,7 @@
 #include "../evsel.h"
 #include "../sort.h"
 #include "../hist.h"
+#include "../helpline.h"
 #include "gtk.h"
 
 #include <signal.h>
@@ -166,7 +167,7 @@ static GtkWidget *perf_gtk__setup_statusbar(void)
 }
 
 int perf_evlist__gtk_browse_hists(struct perf_evlist *evlist,
-                                 const char *help __used,
+                                 const char *help,
                                  void (*timer) (void *arg)__used,
                                  void *arg __used, int delay_secs __used)
 {
@@ -233,6 +234,8 @@ int perf_evlist__gtk_browse_hists(struct perf_evlist *evlist,
 
        gtk_window_set_position(GTK_WINDOW(window), GTK_WIN_POS_CENTER);
 
+       ui_helpline__push(help);
+
        gtk_main();
 
        perf_gtk__deactivate_context(&pgctx);
index a4d0f2b4a2dcf4c4b6bfb9b21a52aabc4b9376ca..793cb6116ddf275132a8d5950e744648d33141f6 100644 (file)
@@ -29,6 +29,8 @@ static inline bool perf_gtk__is_active_context(struct perf_gtk_context *ctx)
 struct perf_gtk_context *perf_gtk__activate_context(GtkWidget *window);
 int perf_gtk__deactivate_context(struct perf_gtk_context **ctx);
 
+void perf_gtk__init_helpline(void);
+
 #ifndef HAVE_GTK_INFO_BAR
 static inline GtkWidget *perf_gtk__setup_info_bar(void)
 {
diff --git a/tools/perf/ui/gtk/helpline.c b/tools/perf/ui/gtk/helpline.c
new file mode 100644 (file)
index 0000000..5db4432
--- /dev/null
@@ -0,0 +1,56 @@
+#include <stdio.h>
+#include <string.h>
+
+#include "gtk.h"
+#include "../ui.h"
+#include "../helpline.h"
+#include "../../util/debug.h"
+
+static void gtk_helpline_pop(void)
+{
+       if (!perf_gtk__is_active_context(pgctx))
+               return;
+
+       gtk_statusbar_pop(GTK_STATUSBAR(pgctx->statbar),
+                         pgctx->statbar_ctx_id);
+}
+
+static void gtk_helpline_push(const char *msg)
+{
+       if (!perf_gtk__is_active_context(pgctx))
+               return;
+
+       gtk_statusbar_push(GTK_STATUSBAR(pgctx->statbar),
+                          pgctx->statbar_ctx_id, msg);
+}
+
+static struct ui_helpline gtk_helpline_fns = {
+       .pop    = gtk_helpline_pop,
+       .push   = gtk_helpline_push,
+};
+
+void perf_gtk__init_helpline(void)
+{
+       helpline_fns = &gtk_helpline_fns;
+}
+
+int perf_gtk__show_helpline(const char *fmt, va_list ap)
+{
+       int ret;
+       char *ptr;
+       static int backlog;
+
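+       /*
+        * Messages may arrive a fragment at a time; buffer them in
+        * ui_helpline__current until a complete line (ending in '\n')
+        * can be shown.
+        */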
+       ret = vscnprintf(ui_helpline__current + backlog,
+                        sizeof(ui_helpline__current) - backlog, fmt, ap);
+       backlog += ret;
+
+       /* only first line can be displayed */
+       ptr = strchr(ui_helpline__current, '\n');
+       if (ptr && (ptr - ui_helpline__current) <= backlog) {
+               *ptr = '\0';
+               ui_helpline__puts(ui_helpline__current);
+               backlog = 0;
+       }
+
+       return ret;
+}
index 92879ce61e2fc0059f6827c38b9ff8eaf114adba..ec1ee26b485a1491f4ce47737072fb325adbad30 100644 (file)
@@ -7,11 +7,14 @@ extern struct perf_error_ops perf_gtk_eops;
 int perf_gtk__init(void)
 {
        perf_error__register(&perf_gtk_eops);
+       perf_gtk__init_helpline();
        return gtk_init_check(NULL, NULL) ? 0 : -1;
 }
 
 void perf_gtk__exit(bool wait_for_ok __used)
 {
+       if (!perf_gtk__is_active_context(pgctx))
+               return;
        perf_error__unregister(&perf_gtk_eops);
        gtk_main_quit();
 }
index 0ead373c0dfb929495544263730a2c4c51a0d13f..b8efb966f94c0a1b8675a7453fcc8fdb410c9866 100644 (file)
@@ -117,11 +117,6 @@ struct perf_error_ops perf_gtk_eops = {
  *        For now, just add stubs for NO_NEWT=1 build.
  */
 #ifdef NO_NEWT_SUPPORT
-int ui_helpline__show_help(const char *format __used, va_list ap __used)
-{
-       return 0;
-}
-
 void ui_progress__update(u64 curr __used, u64 total __used,
                         const char *title __used)
 {
index 2f950c2641c8bf8ad73ffbb7376850acc647e8c9..78ba28ac7a2ceb3fc756c49e06b16618281bf93e 100644 (file)
@@ -5,23 +5,32 @@
 #include "../debug.h"
 #include "helpline.h"
 #include "ui.h"
-#include "libslang.h"
 
-void ui_helpline__pop(void)
+char ui_helpline__current[512];
+
+static void nop_helpline__pop(void)
 {
 }
 
-char ui_helpline__current[512];
+static void nop_helpline__push(const char *msg __used)
+{
+}
 
-void ui_helpline__push(const char *msg)
+static struct ui_helpline default_helpline_fns = {
+       .pop    = nop_helpline__pop,
+       .push   = nop_helpline__push,
+};
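+
+/*
+ * UI back ends install their own implementations here (for example,
+ * perf_gtk__init_helpline() points helpline_fns at gtk_helpline_fns);
+ * callers of ui_helpline__push()/pop() are unaffected.
+ */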
+
+struct ui_helpline *helpline_fns = &default_helpline_fns;
+
+void ui_helpline__pop(void)
 {
-       const size_t sz = sizeof(ui_helpline__current);
+       helpline_fns->pop();
+}
 
-       SLsmg_gotorc(SLtt_Screen_Rows - 1, 0);
-       SLsmg_set_color(0);
-       SLsmg_write_nstring((char *)msg, SLtt_Screen_Cols);
-       SLsmg_refresh();
-       strncpy(ui_helpline__current, msg, sz)[sz - 1] = '\0';
+void ui_helpline__push(const char *msg)
+{
+       helpline_fns->push(msg);
 }
 
 void ui_helpline__vpush(const char *fmt, va_list ap)
@@ -50,30 +59,3 @@ void ui_helpline__puts(const char *msg)
        ui_helpline__pop();
        ui_helpline__push(msg);
 }
-
-void ui_helpline__init(void)
-{
-       ui_helpline__puts(" ");
-}
-
-char ui_helpline__last_msg[1024];
-
-int ui_helpline__show_help(const char *format, va_list ap)
-{
-       int ret;
-       static int backlog;
-
-       pthread_mutex_lock(&ui__lock);
-       ret = vscnprintf(ui_helpline__last_msg + backlog,
-                       sizeof(ui_helpline__last_msg) - backlog, format, ap);
-       backlog += ret;
-
-       if (ui_helpline__last_msg[backlog - 1] == '\n') {
-               ui_helpline__puts(ui_helpline__last_msg);
-               SLsmg_refresh();
-               backlog = 0;
-       }
-       pthread_mutex_unlock(&ui__lock);
-
-       return ret;
-}
index 7bab6b34e35ee7b9869c372373926e2f89d4dd57..a2487f93aa48cc4bc53610c5c704c931ddf09c76 100644 (file)
@@ -4,13 +4,44 @@
 #include <stdio.h>
 #include <stdarg.h>
 
+#include "../util/cache.h"
+
+struct ui_helpline {
+       void (*pop)(void);
+       void (*push)(const char *msg);
+};
+
+extern struct ui_helpline *helpline_fns;
+
 void ui_helpline__init(void);
+
 void ui_helpline__pop(void);
 void ui_helpline__push(const char *msg);
 void ui_helpline__vpush(const char *fmt, va_list ap);
 void ui_helpline__fpush(const char *fmt, ...);
 void ui_helpline__puts(const char *msg);
 
-extern char ui_helpline__current[];
+extern char ui_helpline__current[512];
+
+#ifdef NO_NEWT_SUPPORT
+static inline int ui_helpline__show_help(const char *format __used,
+                                        va_list ap __used)
+{
+       return 0;
+}
+#else
+extern char ui_helpline__last_msg[];
+int ui_helpline__show_help(const char *format, va_list ap);
+#endif /* NO_NEWT_SUPPORT */
+
+#ifdef NO_GTK2_SUPPORT
+static inline int perf_gtk__show_helpline(const char *format __used,
+                                         va_list ap __used)
+{
+       return 0;
+}
+#else
+int perf_gtk__show_helpline(const char *format, va_list ap);
+#endif /* NO_GTK2_SUPPORT */
 
 #endif /* _PERF_UI_HELPLINE_H_ */
index 791fb15ce3507c2d42d695be12c1f87a16affa0f..c7820e56966014fa106e4cd2309651df6a97fe5f 100644 (file)
@@ -1,7 +1,11 @@
+#include <pthread.h>
+
 #include "../cache.h"
 #include "../debug.h"
 
 
+pthread_mutex_t ui__lock = PTHREAD_MUTEX_INITIALIZER;
+
 void setup_browser(bool fallback_to_pager)
 {
        if (!isatty(1) || dump_trace)
diff --git a/tools/perf/ui/stdio/hist.c b/tools/perf/ui/stdio/hist.c
new file mode 100644 (file)
index 0000000..9bf7e9e
--- /dev/null
@@ -0,0 +1,653 @@
+#include <stdio.h>
+#include <math.h>
+
+#include "../../util/util.h"
+#include "../../util/hist.h"
+#include "../../util/sort.h"
+
+
+static size_t callchain__fprintf_left_margin(FILE *fp, int left_margin)
+{
+       int i;
+       int ret = fprintf(fp, "            ");
+
+       for (i = 0; i < left_margin; i++)
+               ret += fprintf(fp, " ");
+
+       return ret;
+}
+
+static size_t ipchain__fprintf_graph_line(FILE *fp, int depth, int depth_mask,
+                                         int left_margin)
+{
+       int i;
+       size_t ret = callchain__fprintf_left_margin(fp, left_margin);
+
+       for (i = 0; i < depth; i++)
+               if (depth_mask & (1 << i))
+                       ret += fprintf(fp, "|          ");
+               else
+                       ret += fprintf(fp, "           ");
+
+       ret += fprintf(fp, "\n");
+
+       return ret;
+}
+
+static size_t ipchain__fprintf_graph(FILE *fp, struct callchain_list *chain,
+                                    int depth, int depth_mask, int period,
+                                    u64 total_samples, u64 hits,
+                                    int left_margin)
+{
+       int i;
+       size_t ret = 0;
+
+       ret += callchain__fprintf_left_margin(fp, left_margin);
+       for (i = 0; i < depth; i++) {
+               if (depth_mask & (1 << i))
+                       ret += fprintf(fp, "|");
+               else
+                       ret += fprintf(fp, " ");
+               if (!period && i == depth - 1) {
+                       double percent;
+
+                       percent = hits * 100.0 / total_samples;
+                       ret += percent_color_fprintf(fp, "--%2.2f%%-- ", percent);
+               } else
+                       ret += fprintf(fp, "%s", "          ");
+       }
+       if (chain->ms.sym)
+               ret += fprintf(fp, "%s\n", chain->ms.sym->name);
+       else
+               ret += fprintf(fp, "0x%0" PRIx64 "\n", chain->ip);
+
+       return ret;
+}
+
+static struct symbol *rem_sq_bracket;
+static struct callchain_list rem_hits;
+
+static void init_rem_hits(void)
+{
+       rem_sq_bracket = malloc(sizeof(*rem_sq_bracket) + 6);
+       if (!rem_sq_bracket) {
+               fprintf(stderr, "Not enough memory to display remaining hits\n");
+               return;
+       }
+
+       strcpy(rem_sq_bracket->name, "[...]");
+       rem_hits.ms.sym = rem_sq_bracket;
+}
+
+static size_t __callchain__fprintf_graph(FILE *fp, struct rb_root *root,
+                                        u64 total_samples, int depth,
+                                        int depth_mask, int left_margin)
+{
+       struct rb_node *node, *next;
+       struct callchain_node *child;
+       struct callchain_list *chain;
+       int new_depth_mask = depth_mask;
+       u64 remaining;
+       size_t ret = 0;
+       int i;
+       uint entries_printed = 0;
+
+       remaining = total_samples;
+
+       node = rb_first(root);
+       while (node) {
+               u64 new_total;
+               u64 cumul;
+
+               child = rb_entry(node, struct callchain_node, rb_node);
+               cumul = callchain_cumul_hits(child);
+               remaining -= cumul;
+
+               /*
+                * The depth mask manages the output of the pipes that show
+                * the depth. We don't want to keep the pipes of the current
+                * level for the last child of this depth, except when there
+                * are remaining filtered hits: they will supersede the last
+                * child.
+                */
+               next = rb_next(node);
+               if (!next && (callchain_param.mode != CHAIN_GRAPH_REL || !remaining))
+                       new_depth_mask &= ~(1 << (depth - 1));
+
+               /*
+                * But keep the old depth mask for the line separator,
+                * to preserve the level link until we reach the last child.
+                */
+               ret += ipchain__fprintf_graph_line(fp, depth, depth_mask,
+                                                  left_margin);
+               i = 0;
+               list_for_each_entry(chain, &child->val, list) {
+                       ret += ipchain__fprintf_graph(fp, chain, depth,
+                                                     new_depth_mask, i++,
+                                                     total_samples,
+                                                     cumul,
+                                                     left_margin);
+               }
+
+               if (callchain_param.mode == CHAIN_GRAPH_REL)
+                       new_total = child->children_hit;
+               else
+                       new_total = total_samples;
+
+               ret += __callchain__fprintf_graph(fp, &child->rb_root, new_total,
+                                                 depth + 1,
+                                                 new_depth_mask | (1 << depth),
+                                                 left_margin);
+               node = next;
+               if (++entries_printed == callchain_param.print_limit)
+                       break;
+       }
+
+       if (callchain_param.mode == CHAIN_GRAPH_REL &&
+               remaining && remaining != total_samples) {
+
+               if (!rem_sq_bracket)
+                       return ret;
+
+               new_depth_mask &= ~(1 << (depth - 1));
+               ret += ipchain__fprintf_graph(fp, &rem_hits, depth,
+                                             new_depth_mask, 0, total_samples,
+                                             remaining, left_margin);
+       }
+
+       return ret;
+}
+
+static size_t callchain__fprintf_graph(FILE *fp, struct rb_root *root,
+                                      u64 total_samples, int left_margin)
+{
+       struct callchain_node *cnode;
+       struct callchain_list *chain;
+       u32 entries_printed = 0;
+       bool printed = false;
+       struct rb_node *node;
+       int i = 0;
+       int ret = 0;
+
+       /*
+        * If there is a single callchain root, don't bother printing
+        * its percentage (100% in fractal mode, and the same percentage
+        * as the hist in graph mode). This also avoids one level of column.
+        */
+       node = rb_first(root);
+       if (node && !rb_next(node)) {
+               cnode = rb_entry(node, struct callchain_node, rb_node);
+               list_for_each_entry(chain, &cnode->val, list) {
+                       /*
+                        * If we sort by symbol, the first entry is the same as
+                        * the symbol. No need to print it, otherwise it appears
+                        * twice.
+                        */
+                       if (!i++ && sort__first_dimension == SORT_SYM)
+                               continue;
+                       if (!printed) {
+                               ret += callchain__fprintf_left_margin(fp, left_margin);
+                               ret += fprintf(fp, "|\n");
+                               ret += callchain__fprintf_left_margin(fp, left_margin);
+                               ret += fprintf(fp, "---");
+                               left_margin += 3;
+                               printed = true;
+                       } else
+                               ret += callchain__fprintf_left_margin(fp, left_margin);
+
+                       if (chain->ms.sym)
+                               ret += fprintf(fp, " %s\n", chain->ms.sym->name);
+                       else
+                               ret += fprintf(fp, " %p\n", (void *)(long)chain->ip);
+
+                       if (++entries_printed == callchain_param.print_limit)
+                               break;
+               }
+               root = &cnode->rb_root;
+       }
+
+       ret += __callchain__fprintf_graph(fp, root, total_samples,
+                                         1, 1, left_margin);
+       ret += fprintf(fp, "\n");
+
+       return ret;
+}
+
+static size_t __callchain__fprintf_flat(FILE *fp,
+                                       struct callchain_node *self,
+                                       u64 total_samples)
+{
+       struct callchain_list *chain;
+       size_t ret = 0;
+
+       if (!self)
+               return 0;
+
+       ret += __callchain__fprintf_flat(fp, self->parent, total_samples);
+
+
+       list_for_each_entry(chain, &self->val, list) {
+               if (chain->ip >= PERF_CONTEXT_MAX)
+                       continue;
+               if (chain->ms.sym)
+                       ret += fprintf(fp, "                %s\n", chain->ms.sym->name);
+               else
+                       ret += fprintf(fp, "                %p\n",
+                                       (void *)(long)chain->ip);
+       }
+
+       return ret;
+}
+
+static size_t callchain__fprintf_flat(FILE *fp, struct rb_root *self,
+                                     u64 total_samples)
+{
+       size_t ret = 0;
+       u32 entries_printed = 0;
+       struct rb_node *rb_node;
+       struct callchain_node *chain;
+
+       rb_node = rb_first(self);
+       while (rb_node) {
+               double percent;
+
+               chain = rb_entry(rb_node, struct callchain_node, rb_node);
+               percent = chain->hit * 100.0 / total_samples;
+
+               ret = percent_color_fprintf(fp, "           %6.2f%%\n", percent);
+               ret += __callchain__fprintf_flat(fp, chain, total_samples);
+               ret += fprintf(fp, "\n");
+               if (++entries_printed == callchain_param.print_limit)
+                       break;
+
+               rb_node = rb_next(rb_node);
+       }
+
+       return ret;
+}
+
+static size_t hist_entry_callchain__fprintf(struct hist_entry *he,
+                                           u64 total_samples, int left_margin,
+                                           FILE *fp)
+{
+       switch (callchain_param.mode) {
+       case CHAIN_GRAPH_REL:
+               return callchain__fprintf_graph(fp, &he->sorted_chain, he->period,
+                                               left_margin);
+               break;
+       case CHAIN_GRAPH_ABS:
+               return callchain__fprintf_graph(fp, &he->sorted_chain, total_samples,
+                                               left_margin);
+               break;
+       case CHAIN_FLAT:
+               return callchain__fprintf_flat(fp, &he->sorted_chain, total_samples);
+               break;
+       case CHAIN_NONE:
+               break;
+       default:
+               pr_err("Bad callchain mode\n");
+       }
+
+       return 0;
+}
+
+static int hist_entry__period_snprintf(struct hist_entry *he, char *s,
+                                    size_t size, struct hists *pair_hists,
+                                    bool show_displacement, long displacement,
+                                    bool color, u64 total_period)
+{
+       u64 period, total, period_sys, period_us, period_guest_sys, period_guest_us;
+       u64 nr_events;
+       const char *sep = symbol_conf.field_sep;
+       int ret;
+
+       if (symbol_conf.exclude_other && !he->parent)
+               return 0;
+
+       if (pair_hists) {
+               period = he->pair ? he->pair->period : 0;
+               nr_events = he->pair ? he->pair->nr_events : 0;
+               total = pair_hists->stats.total_period;
+               period_sys = he->pair ? he->pair->period_sys : 0;
+               period_us = he->pair ? he->pair->period_us : 0;
+               period_guest_sys = he->pair ? he->pair->period_guest_sys : 0;
+               period_guest_us = he->pair ? he->pair->period_guest_us : 0;
+       } else {
+               period = he->period;
+               nr_events = he->nr_events;
+               total = total_period;
+               period_sys = he->period_sys;
+               period_us = he->period_us;
+               period_guest_sys = he->period_guest_sys;
+               period_guest_us = he->period_guest_us;
+       }
+
+       if (total) {
+               if (color)
+                       ret = percent_color_snprintf(s, size,
+                                                    sep ? "%.2f" : "   %6.2f%%",
+                                                    (period * 100.0) / total);
+               else
+                       ret = scnprintf(s, size, sep ? "%.2f" : "   %6.2f%%",
+                                      (period * 100.0) / total);
+               if (symbol_conf.show_cpu_utilization) {
+                       ret += percent_color_snprintf(s + ret, size - ret,
+                                       sep ? "%.2f" : "   %6.2f%%",
+                                       (period_sys * 100.0) / total);
+                       ret += percent_color_snprintf(s + ret, size - ret,
+                                       sep ? "%.2f" : "   %6.2f%%",
+                                       (period_us * 100.0) / total);
+                       if (perf_guest) {
+                               ret += percent_color_snprintf(s + ret,
+                                               size - ret,
+                                               sep ? "%.2f" : "   %6.2f%%",
+                                               (period_guest_sys * 100.0) /
+                                                               total);
+                               ret += percent_color_snprintf(s + ret,
+                                               size - ret,
+                                               sep ? "%.2f" : "   %6.2f%%",
+                                               (period_guest_us * 100.0) /
+                                                               total);
+                       }
+               }
+       } else
+               ret = scnprintf(s, size, sep ? "%" PRIu64 : "%12" PRIu64 " ", period);
+
+       if (symbol_conf.show_nr_samples) {
+               if (sep)
+                       ret += scnprintf(s + ret, size - ret, "%c%" PRIu64, *sep, nr_events);
+               else
+                       ret += scnprintf(s + ret, size - ret, "%11" PRIu64, nr_events);
+       }
+
+       if (symbol_conf.show_total_period) {
+               if (sep)
+                       ret += scnprintf(s + ret, size - ret, "%c%" PRIu64, *sep, period);
+               else
+                       ret += scnprintf(s + ret, size - ret, " %12" PRIu64, period);
+       }
+
+       if (pair_hists) {
+               char bf[32];
+               double old_percent = 0, new_percent = 0, diff;
+
+               if (total > 0)
+                       old_percent = (period * 100.0) / total;
+               if (total_period > 0)
+                       new_percent = (he->period * 100.0) / total_period;
+
+               diff = new_percent - old_percent;
+
+               if (fabs(diff) >= 0.01)
+                       scnprintf(bf, sizeof(bf), "%+4.2F%%", diff);
+               else
+                       scnprintf(bf, sizeof(bf), " ");
+
+               if (sep)
+                       ret += scnprintf(s + ret, size - ret, "%c%s", *sep, bf);
+               else
+                       ret += scnprintf(s + ret, size - ret, "%11.11s", bf);
+
+               if (show_displacement) {
+                       if (displacement)
+                               scnprintf(bf, sizeof(bf), "%+4ld", displacement);
+                       else
+                               scnprintf(bf, sizeof(bf), " ");
+
+                       if (sep)
+                               ret += scnprintf(s + ret, size - ret, "%c%s", *sep, bf);
+                       else
+                               ret += scnprintf(s + ret, size - ret, "%6.6s", bf);
+               }
+       }
+
+       return ret;
+}
+
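
The Delta column computed above is plain percentage-point arithmetic: the entry's share of the current total period minus its pair's share of the baseline total, with differences under 0.01 suppressed as noise. A minimal, self-contained sketch with made-up periods (not part of the patch):

#include <math.h>
#include <stdio.h>

int main(void)
{
        /* hypothetical numbers: baseline 200/1000, new run 300/1200 */
        double old_percent = 200 * 100.0 / 1000;        /* 20.00% */
        double new_percent = 300 * 100.0 / 1200;        /* 25.00% */
        double diff = new_percent - old_percent;

        if (fabs(diff) >= 0.01)
                printf("%+4.2f%%\n", diff);             /* prints "+5.00%" */
        else
                printf("   \n");                        /* too small to show */
        return 0;
}
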
+int hist_entry__sort_snprintf(struct hist_entry *he, char *s, size_t size,
+                             struct hists *hists)
+{
+       const char *sep = symbol_conf.field_sep;
+       struct sort_entry *se;
+       int ret = 0;
+
+       list_for_each_entry(se, &hist_entry__sort_list, list) {
+               if (se->elide)
+                       continue;
+
+               ret += scnprintf(s + ret, size - ret, "%s", sep ?: "  ");
+               ret += se->se_snprintf(he, s + ret, size - ret,
+                                      hists__col_len(hists, se->se_width_idx));
+       }
+
+       return ret;
+}
+
+static size_t hist_entry__callchain_fprintf(struct hist_entry *he,
+                                           struct hists *hists,
+                                           u64 total_period, FILE *fp)
+{
+       int left_margin = 0;
+
+       if (sort__first_dimension == SORT_COMM) {
+               struct sort_entry *se = list_first_entry(&hist_entry__sort_list,
+                                                        typeof(*se), list);
+               left_margin = hists__col_len(hists, se->se_width_idx);
+               left_margin -= thread__comm_len(he->thread);
+       }
+
+       return hist_entry_callchain__fprintf(he, total_period, left_margin, fp);
+}
+
+static int hist_entry__fprintf(struct hist_entry *he, size_t size,
+                              struct hists *hists, struct hists *pair_hists,
+                              bool show_displacement, long displacement,
+                              u64 total_period, FILE *fp)
+{
+       char bf[512];
+       int ret;
+
+       if (size == 0 || size > sizeof(bf))
+               size = sizeof(bf);
+
+       ret = hist_entry__period_snprintf(he, bf, size, pair_hists,
+                                         show_displacement, displacement,
+                                         true, total_period);
+       hist_entry__sort_snprintf(he, bf + ret, size - ret, hists);
+
+       ret = fprintf(fp, "%s\n", bf);
+
+       if (symbol_conf.use_callchain)
+               ret += hist_entry__callchain_fprintf(he, hists,
+                                                    total_period, fp);
+
+       return ret;
+}
+
+size_t hists__fprintf(struct hists *hists, struct hists *pair,
+                     bool show_displacement, bool show_header, int max_rows,
+                     int max_cols, FILE *fp)
+{
+       struct sort_entry *se;
+       struct rb_node *nd;
+       size_t ret = 0;
+       u64 total_period;
+       unsigned long position = 1;
+       long displacement = 0;
+       unsigned int width;
+       const char *sep = symbol_conf.field_sep;
+       const char *col_width = symbol_conf.col_width_list_str;
+       int nr_rows = 0;
+
+       init_rem_hits();
+
+       if (!show_header)
+               goto print_entries;
+
+       fprintf(fp, "# %s", pair ? "Baseline" : "Overhead");
+
+       if (symbol_conf.show_cpu_utilization) {
+               if (sep) {
+                       ret += fprintf(fp, "%csys", *sep);
+                       ret += fprintf(fp, "%cus", *sep);
+                       if (perf_guest) {
+                               ret += fprintf(fp, "%cguest sys", *sep);
+                               ret += fprintf(fp, "%cguest us", *sep);
+                       }
+               } else {
+                       ret += fprintf(fp, "     sys  ");
+                       ret += fprintf(fp, "      us  ");
+                       if (perf_guest) {
+                               ret += fprintf(fp, "  guest sys  ");
+                               ret += fprintf(fp, "  guest us  ");
+                       }
+               }
+       }
+
+       if (symbol_conf.show_nr_samples) {
+               if (sep)
+                       fprintf(fp, "%cSamples", *sep);
+               else
+                       fputs("  Samples  ", fp);
+       }
+
+       if (symbol_conf.show_total_period) {
+               if (sep)
+                       ret += fprintf(fp, "%cPeriod", *sep);
+               else
+                       ret += fprintf(fp, "   Period    ");
+       }
+
+       if (pair) {
+               if (sep)
+                       ret += fprintf(fp, "%cDelta", *sep);
+               else
+                       ret += fprintf(fp, "  Delta    ");
+
+               if (show_displacement) {
+                       if (sep)
+                               ret += fprintf(fp, "%cDisplacement", *sep);
+                       else
+                               ret += fprintf(fp, " Displ");
+               }
+       }
+
+       list_for_each_entry(se, &hist_entry__sort_list, list) {
+               if (se->elide)
+                       continue;
+               if (sep) {
+                       fprintf(fp, "%c%s", *sep, se->se_header);
+                       continue;
+               }
+               width = strlen(se->se_header);
+               if (symbol_conf.col_width_list_str) {
+                       if (col_width) {
+                               hists__set_col_len(hists, se->se_width_idx,
+                                                  atoi(col_width));
+                               col_width = strchr(col_width, ',');
+                               if (col_width)
+                                       ++col_width;
+                       }
+               }
+               if (!hists__new_col_len(hists, se->se_width_idx, width))
+                       width = hists__col_len(hists, se->se_width_idx);
+               fprintf(fp, "  %*s", width, se->se_header);
+       }
+
+       fprintf(fp, "\n");
+       if (max_rows && ++nr_rows >= max_rows)
+               goto out;
+
+       if (sep)
+               goto print_entries;
+
+       fprintf(fp, "# ........");
+       if (symbol_conf.show_cpu_utilization)
+               fprintf(fp, "   .......   .......");
+       if (symbol_conf.show_nr_samples)
+               fprintf(fp, " ..........");
+       if (symbol_conf.show_total_period)
+               fprintf(fp, " ............");
+       if (pair) {
+               fprintf(fp, " ..........");
+               if (show_displacement)
+                       fprintf(fp, " .....");
+       }
+       list_for_each_entry(se, &hist_entry__sort_list, list) {
+               unsigned int i;
+
+               if (se->elide)
+                       continue;
+
+               fprintf(fp, "  ");
+               width = hists__col_len(hists, se->se_width_idx);
+               if (width == 0)
+                       width = strlen(se->se_header);
+               for (i = 0; i < width; i++)
+                       fprintf(fp, ".");
+       }
+
+       fprintf(fp, "\n");
+       if (max_rows && ++nr_rows >= max_rows)
+               goto out;
+
+       fprintf(fp, "#\n");
+       if (max_rows && ++nr_rows >= max_rows)
+               goto out;
+
+print_entries:
+       total_period = hists->stats.total_period;
+
+       for (nd = rb_first(&hists->entries); nd; nd = rb_next(nd)) {
+               struct hist_entry *h = rb_entry(nd, struct hist_entry, rb_node);
+
+               if (h->filtered)
+                       continue;
+
+               if (show_displacement) {
+                       if (h->pair != NULL)
+                               displacement = ((long)h->pair->position -
+                                               (long)position);
+                       else
+                               displacement = 0;
+                       ++position;
+               }
+               ret += hist_entry__fprintf(h, max_cols, hists, pair, show_displacement,
+                                          displacement, total_period, fp);
+
+               if (max_rows && ++nr_rows >= max_rows)
+                       goto out;
+
+               if (h->ms.map == NULL && verbose > 1) {
+                       __map_groups__fprintf_maps(&h->thread->mg,
+                                                  MAP__FUNCTION, verbose, fp);
+                       fprintf(fp, "%.10s end\n", graph_dotted_line);
+               }
+       }
+out:
+       free(rem_sq_bracket);
+
+       return ret;
+}
+
+size_t hists__fprintf_nr_events(struct hists *hists, FILE *fp)
+{
+       int i;
+       size_t ret = 0;
+
+       for (i = 0; i < PERF_RECORD_HEADER_MAX; ++i) {
+               const char *name;
+
+               if (hists->stats.nr_events[i] == 0)
+                       continue;
+
+               name = perf_event__name(i);
+               if (!strcmp(name, "UNKNOWN"))
+                       continue;
+
+               ret += fprintf(fp, "%16s events: %10d\n", name,
+                              hists->stats.nr_events[i]);
+       }
+
+       return ret;
+}
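
Together, hists__fprintf() and hists__fprintf_nr_events() form the whole stdio report path. A hedged usage sketch, assuming a struct hists *hists that has already been collapsed and resorted (the argument values are illustrative, not mandated by the patch):

/* no baseline, print the header, no row/column limits, to stdout */
hists__fprintf(hists, NULL, false, true, 0, 0, stdout);

/* then the per-PERF_RECORD_* event counts */
hists__fprintf_nr_events(hists, stdout);
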
diff --git a/tools/perf/ui/tui/helpline.c b/tools/perf/ui/tui/helpline.c
new file mode 100644 (file)
index 0000000..2884d2f
--- /dev/null
+++ b/tools/perf/ui/tui/helpline.c
@@ -0,0 +1,57 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <pthread.h>
+
+#include "../../util/debug.h"
+#include "../helpline.h"
+#include "../ui.h"
+#include "../libslang.h"
+
+static void tui_helpline__pop(void)
+{
+}
+
+static void tui_helpline__push(const char *msg)
+{
+       const size_t sz = sizeof(ui_helpline__current);
+
+       SLsmg_gotorc(SLtt_Screen_Rows - 1, 0);
+       SLsmg_set_color(0);
+       SLsmg_write_nstring((char *)msg, SLtt_Screen_Cols);
+       SLsmg_refresh();
+       strncpy(ui_helpline__current, msg, sz)[sz - 1] = '\0';
+}
+
+struct ui_helpline tui_helpline_fns = {
+       .pop    = tui_helpline__pop,
+       .push   = tui_helpline__push,
+};
+
+void ui_helpline__init(void)
+{
+       helpline_fns = &tui_helpline_fns;
+       ui_helpline__puts(" ");
+}
+
+char ui_helpline__last_msg[1024];
+
+int ui_helpline__show_help(const char *format, va_list ap)
+{
+       int ret;
+       static int backlog;
+
+       pthread_mutex_lock(&ui__lock);
+       ret = vscnprintf(ui_helpline__last_msg + backlog,
+                       sizeof(ui_helpline__last_msg) - backlog, format, ap);
+       backlog += ret;
+
+       if (ui_helpline__last_msg[backlog - 1] == '\n') {
+               ui_helpline__puts(ui_helpline__last_msg);
+               SLsmg_refresh();
+               backlog = 0;
+       }
+       pthread_mutex_unlock(&ui__lock);
+
+       return ret;
+}
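
ui_helpline__show_help() accumulates partial messages in ui_helpline__last_msg and only flushes once a trailing newline arrives, so one helpline can be assembled from several eprintf() calls. The same accumulate-until-newline pattern, reduced to a self-contained form (plain stdio standing in for the slang screen; locking and most error handling trimmed):

#include <stdio.h>

static char msgbuf[1024];
static int backlog;

static void helpline_append(const char *chunk)
{
        int n, room;

        if (backlog >= (int)sizeof(msgbuf) - 1)
                backlog = 0;                    /* drop an over-long message */
        room = (int)sizeof(msgbuf) - 1 - backlog;
        n = snprintf(msgbuf + backlog, room + 1, "%s", chunk);
        if (n < 0)
                return;                         /* encoding error: ignore */
        if (n > room)
                n = room;                       /* snprintf reports the untruncated length */
        backlog += n;
        if (backlog > 0 && msgbuf[backlog - 1] == '\n') {
                fputs(msgbuf, stdout);          /* stands in for ui_helpline__puts() */
                backlog = 0;
        }
}

int main(void)
{
        helpline_append("loading ");
        helpline_append("symbols...");
        helpline_append(" done\n");             /* only now does the line appear */
        return 0;
}
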
index e813c1d173463564a496bdd3917f3079089a3101..4c936e09931c65812e6e7d0f3c32f32b5cc90982 100644 (file)
@@ -11,8 +11,6 @@
 #include "../libslang.h"
 #include "../keysyms.h"
 
-pthread_mutex_t ui__lock = PTHREAD_MUTEX_INITIALIZER;
-
 static volatile int ui__need_resize;
 
 extern struct perf_error_ops perf_tui_eops;
index 4dfe0bb3c3225455e4f0ff3b1cb74a386352db3c..66eb3828ceb5ec4b2df31147274cc767be09856a 100644 (file)
@@ -23,8 +23,10 @@ int eprintf(int level, const char *fmt, ...)
 
        if (verbose >= level) {
                va_start(args, fmt);
-               if (use_browser > 0)
+               if (use_browser == 1)
                        ret = ui_helpline__show_help(fmt, args);
+               else if (use_browser == 2)
+                       ret = perf_gtk__show_helpline(fmt, args);
                else
                        ret = vfprintf(stderr, fmt, args);
                va_end(args);
index 015c91dbc096310554ba6f7bcec51c16d7cabfac..05e660cbf7e2f90f9c237031bf4057e87c52cb47 100644 (file)
@@ -4,6 +4,7 @@
 
 #include <stdbool.h>
 #include "event.h"
+#include "../ui/helpline.h"
 
 extern int verbose;
 extern bool quiet, dump_trace;
@@ -15,11 +16,6 @@ struct ui_progress;
 struct perf_error_ops;
 
 #if defined(NO_NEWT_SUPPORT) && defined(NO_GTK2_SUPPORT)
-static inline int ui_helpline__show_help(const char *format __used, va_list ap __used)
-{
-       return 0;
-}
-
 static inline void ui_progress__update(u64 curr __used, u64 total __used,
                                       const char *title __used) {}
 
@@ -39,8 +35,6 @@ perf_error__unregister(struct perf_error_ops *eops __used)
 
 #else /* NO_NEWT_SUPPORT && NO_GTK2_SUPPORT */
 
-extern char ui_helpline__last_msg[];
-int ui_helpline__show_help(const char *format, va_list ap);
 #include "../ui/progress.h"
 int ui__error(const char *format, ...) __attribute__((format(printf, 1, 2)));
 #include "../ui/util.h"
index 2a6f33cd888ca7d7e9dbe7d27f6999a552dec7f1..3a0f1a5da91cc7a43c108d74b47b519493fa9394 100644 (file)
@@ -412,7 +412,7 @@ struct process_symbol_args {
 };
 
 static int find_symbol_cb(void *arg, const char *name, char type,
-                         u64 start, u64 end __used)
+                         u64 start)
 {
        struct process_symbol_args *args = arg;
 
index d84870b0642627b11b81b5b5dc1c0abade1a09e4..0e088d046e5666fdee405060ebb75276486440fe 100644 (file)
@@ -69,6 +69,16 @@ struct sample_event {
        u64 array[];
 };
 
+struct regs_dump {
+       u64 *regs;
+};
+
+struct stack_dump {
+       u16 offset;
+       u64 size;
+       char *data;
+};
+
 struct perf_sample {
        u64 ip;
        u32 pid, tid;
@@ -82,6 +92,8 @@ struct perf_sample {
        void *raw_data;
        struct ip_callchain *callchain;
        struct branch_stack *branch_stack;
+       struct regs_dump  user_regs;
+       struct stack_dump user_stack;
 };
 
 #define BUILD_ID_SIZE 20
index 9b38681add9e2d46e3a02b8ca0ffe48363a3e3c5..4774ac1e3d5f3934f78feaa6f70ad02801b1837d 100644 (file)
@@ -57,7 +57,7 @@ void perf_evlist__config_attrs(struct perf_evlist *evlist,
        if (evlist->cpus->map[0] < 0)
                opts->no_inherit = true;
 
-       first = list_entry(evlist->entries.next, struct perf_evsel, node);
+       first = perf_evlist__first(evlist);
 
        list_for_each_entry(evsel, &evlist->entries, node) {
                perf_evsel__config(evsel, opts, first);
@@ -108,6 +108,25 @@ void perf_evlist__splice_list_tail(struct perf_evlist *evlist,
        evlist->nr_entries += nr_entries;
 }
 
+void __perf_evlist__set_leader(struct list_head *list)
+{
+       struct perf_evsel *evsel, *leader;
+
+       leader = list_entry(list->next, struct perf_evsel, node);
+       leader->leader = NULL;
+
+       list_for_each_entry(evsel, list, node) {
+               if (evsel != leader)
+                       evsel->leader = leader;
+       }
+}
+
+void perf_evlist__set_leader(struct perf_evlist *evlist)
+{
+       if (evlist->nr_entries)
+               __perf_evlist__set_leader(&evlist->entries);
+}
+
 int perf_evlist__add_default(struct perf_evlist *evlist)
 {
        struct perf_event_attr attr = {
@@ -357,7 +376,7 @@ struct perf_evsel *perf_evlist__id2evsel(struct perf_evlist *evlist, u64 id)
        int hash;
 
        if (evlist->nr_entries == 1)
-               return list_entry(evlist->entries.next, struct perf_evsel, node);
+               return perf_evlist__first(evlist);
 
        hash = hash_64(id, PERF_EVLIST__HLIST_BITS);
        head = &evlist->heads[hash];
@@ -367,7 +386,7 @@ struct perf_evsel *perf_evlist__id2evsel(struct perf_evlist *evlist, u64 id)
                        return sid->evsel;
 
        if (!perf_evlist__sample_id_all(evlist))
-               return list_entry(evlist->entries.next, struct perf_evsel, node);
+               return perf_evlist__first(evlist);
 
        return NULL;
 }
@@ -675,11 +694,9 @@ int perf_evlist__set_filters(struct perf_evlist *evlist)
        return 0;
 }
 
-bool perf_evlist__valid_sample_type(const struct perf_evlist *evlist)
+bool perf_evlist__valid_sample_type(struct perf_evlist *evlist)
 {
-       struct perf_evsel *pos, *first;
-
-       pos = first = list_entry(evlist->entries.next, struct perf_evsel, node);
+       struct perf_evsel *first = perf_evlist__first(evlist), *pos = first;
 
        list_for_each_entry_continue(pos, &evlist->entries, node) {
                if (first->attr.sample_type != pos->attr.sample_type)
@@ -689,23 +706,19 @@ bool perf_evlist__valid_sample_type(const struct perf_evlist *evlist)
        return true;
 }
 
-u64 perf_evlist__sample_type(const struct perf_evlist *evlist)
+u64 perf_evlist__sample_type(struct perf_evlist *evlist)
 {
-       struct perf_evsel *first;
-
-       first = list_entry(evlist->entries.next, struct perf_evsel, node);
+       struct perf_evsel *first = perf_evlist__first(evlist);
        return first->attr.sample_type;
 }
 
-u16 perf_evlist__id_hdr_size(const struct perf_evlist *evlist)
+u16 perf_evlist__id_hdr_size(struct perf_evlist *evlist)
 {
-       struct perf_evsel *first;
+       struct perf_evsel *first = perf_evlist__first(evlist);
        struct perf_sample *data;
        u64 sample_type;
        u16 size = 0;
 
-       first = list_entry(evlist->entries.next, struct perf_evsel, node);
-
        if (!first->attr.sample_id_all)
                goto out;
 
@@ -729,11 +742,9 @@ out:
        return size;
 }
 
-bool perf_evlist__valid_sample_id_all(const struct perf_evlist *evlist)
+bool perf_evlist__valid_sample_id_all(struct perf_evlist *evlist)
 {
-       struct perf_evsel *pos, *first;
-
-       pos = first = list_entry(evlist->entries.next, struct perf_evsel, node);
+       struct perf_evsel *first = perf_evlist__first(evlist), *pos = first;
 
        list_for_each_entry_continue(pos, &evlist->entries, node) {
                if (first->attr.sample_id_all != pos->attr.sample_id_all)
@@ -743,11 +754,9 @@ bool perf_evlist__valid_sample_id_all(const struct perf_evlist *evlist)
        return true;
 }
 
-bool perf_evlist__sample_id_all(const struct perf_evlist *evlist)
+bool perf_evlist__sample_id_all(struct perf_evlist *evlist)
 {
-       struct perf_evsel *first;
-
-       first = list_entry(evlist->entries.next, struct perf_evsel, node);
+       struct perf_evsel *first = perf_evlist__first(evlist);
        return first->attr.sample_id_all;
 }
 
@@ -757,21 +766,13 @@ void perf_evlist__set_selected(struct perf_evlist *evlist,
        evlist->selected = evsel;
 }
 
-int perf_evlist__open(struct perf_evlist *evlist, bool group)
+int perf_evlist__open(struct perf_evlist *evlist)
 {
-       struct perf_evsel *evsel, *first;
+       struct perf_evsel *evsel;
        int err, ncpus, nthreads;
 
-       first = list_entry(evlist->entries.next, struct perf_evsel, node);
-
        list_for_each_entry(evsel, &evlist->entries, node) {
-               struct xyarray *group_fd = NULL;
-
-               if (group && evsel != first)
-                       group_fd = first->fd;
-
-               err = perf_evsel__open(evsel, evlist->cpus, evlist->threads,
-                                      group, group_fd);
+               err = perf_evsel__open(evsel, evlist->cpus, evlist->threads);
                if (err < 0)
                        goto out_err;
        }
@@ -885,6 +886,6 @@ int perf_evlist__start_workload(struct perf_evlist *evlist)
 int perf_evlist__parse_sample(struct perf_evlist *evlist, union perf_event *event,
                              struct perf_sample *sample, bool swapped)
 {
-       struct perf_evsel *e = list_entry(evlist->entries.next, struct perf_evsel, node);
-       return perf_evsel__parse_sample(e, event, sample, swapped);
+       struct perf_evsel *evsel = perf_evlist__first(evlist);
+       return perf_evsel__parse_sample(evsel, event, sample, swapped);
 }
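
With __perf_evlist__set_leader() above, the first evsel on the list becomes the group leader (its ->leader stays NULL) and every other member points at it; get_group_fd(), added to evsel.c later in this patch, turns that into the group_fd argument for sys_perf_event_open(). A self-contained model of just that invariant (toy struct, hypothetical fd value):

#include <assert.h>
#include <stddef.h>

struct evsel { struct evsel *leader; int fd; };

/* mirrors get_group_fd(): -1 starts a new group, otherwise join the leader's */
static int group_fd(const struct evsel *e)
{
        return e->leader ? e->leader->fd : -1;
}

int main(void)
{
        struct evsel leader = { NULL, -1 };
        struct evsel member = { &leader, -1 };

        assert(group_fd(&leader) == -1);   /* leader opens first, creating the group */
        leader.fd = 3;                     /* pretend perf_event_open() returned 3   */
        assert(group_fd(&member) == 3);    /* members attach to the leader's fd      */
        return 0;
}
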
index 528c1acd9298443ce37afee8a2f3be515f49adfe..2ed255792c6b3af3e3ec9c86e6e4e63b4893f6cc 100644 (file)
@@ -5,6 +5,7 @@
 #include <stdio.h>
 #include "../perf.h"
 #include "event.h"
+#include "evsel.h"
 #include "util.h"
 #include <unistd.h>
 
@@ -41,8 +42,6 @@ struct perf_evsel_str_handler {
        void       *handler;
 };
 
-struct perf_evsel;
-
 struct perf_evlist *perf_evlist__new(struct cpu_map *cpus,
                                     struct thread_map *threads);
 void perf_evlist__init(struct perf_evlist *evlist, struct cpu_map *cpus,
@@ -85,7 +84,7 @@ struct perf_evsel *perf_evlist__id2evsel(struct perf_evlist *evlist, u64 id);
 
 union perf_event *perf_evlist__mmap_read(struct perf_evlist *self, int idx);
 
-int perf_evlist__open(struct perf_evlist *evlist, bool group);
+int perf_evlist__open(struct perf_evlist *evlist);
 
 void perf_evlist__config_attrs(struct perf_evlist *evlist,
                               struct perf_record_opts *opts);
@@ -118,18 +117,30 @@ int perf_evlist__create_maps(struct perf_evlist *evlist,
 void perf_evlist__delete_maps(struct perf_evlist *evlist);
 int perf_evlist__set_filters(struct perf_evlist *evlist);
 
-u64 perf_evlist__sample_type(const struct perf_evlist *evlist);
-bool perf_evlist__sample_id_all(const const struct perf_evlist *evlist);
-u16 perf_evlist__id_hdr_size(const struct perf_evlist *evlist);
+void __perf_evlist__set_leader(struct list_head *list);
+void perf_evlist__set_leader(struct perf_evlist *evlist);
+
+u64 perf_evlist__sample_type(struct perf_evlist *evlist);
+bool perf_evlist__sample_id_all(struct perf_evlist *evlist);
+u16 perf_evlist__id_hdr_size(struct perf_evlist *evlist);
 
 int perf_evlist__parse_sample(struct perf_evlist *evlist, union perf_event *event,
                              struct perf_sample *sample, bool swapped);
 
-bool perf_evlist__valid_sample_type(const struct perf_evlist *evlist);
-bool perf_evlist__valid_sample_id_all(const struct perf_evlist *evlist);
+bool perf_evlist__valid_sample_type(struct perf_evlist *evlist);
+bool perf_evlist__valid_sample_id_all(struct perf_evlist *evlist);
 
 void perf_evlist__splice_list_tail(struct perf_evlist *evlist,
                                   struct list_head *list,
                                   int nr_entries);
 
+static inline struct perf_evsel *perf_evlist__first(struct perf_evlist *evlist)
+{
+       return list_entry(evlist->entries.next, struct perf_evsel, node);
+}
+
+static inline struct perf_evsel *perf_evlist__last(struct perf_evlist *evlist)
+{
+       return list_entry(evlist->entries.prev, struct perf_evsel, node);
+}
 #endif /* __PERF_EVLIST_H */
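
perf_evlist__first()/__last() are just list_entry() applied to the head's next/prev pointers of the kernel-style circular list. A standalone illustration of the pointer arithmetic they rely on (a minimal re-implementation of the list macros, not perf's actual headers):

#include <assert.h>
#include <stddef.h>

struct list_head { struct list_head *next, *prev; };

#define container_of(ptr, type, member) \
        ((type *)((char *)(ptr) - offsetof(type, member)))
#define list_entry(ptr, type, member) container_of(ptr, type, member)

struct evsel { int idx; struct list_head node; };

int main(void)
{
        struct evsel a = { .idx = 0 }, b = { .idx = 1 };
        struct list_head head = { .next = &a.node, .prev = &b.node };

        a.node.prev = &head;   a.node.next = &b.node;
        b.node.prev = &a.node; b.node.next = &head;

        assert(list_entry(head.next, struct evsel, node)->idx == 0);  /* first */
        assert(list_entry(head.prev, struct evsel, node)->idx == 1);  /* last  */
        return 0;
}
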
index 2eaae140def26b5ef952d814e47a1a3179d8abbb..7ff3c8fb736cd3706144fea653140b98550dc8bc 100644 (file)
@@ -8,6 +8,7 @@
  */
 
 #include <byteswap.h>
+#include <linux/bitops.h>
 #include "asm/bug.h"
 #include "evsel.h"
 #include "evlist.h"
 #include "thread_map.h"
 #include "target.h"
 #include "../../../include/linux/hw_breakpoint.h"
+#include "../../include/linux/perf_event.h"
+#include "perf_regs.h"
 
 #define FD(e, x, y) (*(int *)xyarray__entry(e->fd, x, y))
-#define GROUP_FD(group_fd, cpu) (*(int *)xyarray__entry(group_fd, cpu, 0))
 
 static int __perf_evsel__sample_size(u64 sample_type)
 {
@@ -317,7 +319,8 @@ const char *perf_evsel__name(struct perf_evsel *evsel)
                break;
 
        default:
-               scnprintf(bf, sizeof(bf), "%s", "unknown attr type");
+               scnprintf(bf, sizeof(bf), "unknown attr type: %d",
+                         evsel->attr.type);
                break;
        }
 
@@ -367,9 +370,18 @@ void perf_evsel__config(struct perf_evsel *evsel, struct perf_record_opts *opts,
                attr->mmap_data = track;
        }
 
-       if (opts->call_graph)
+       if (opts->call_graph) {
                attr->sample_type       |= PERF_SAMPLE_CALLCHAIN;
 
+               if (opts->call_graph == CALLCHAIN_DWARF) {
+                       attr->sample_type |= PERF_SAMPLE_REGS_USER |
+                                            PERF_SAMPLE_STACK_USER;
+                       attr->sample_regs_user = PERF_REGS_MASK;
+                       attr->sample_stack_user = opts->stack_dump_size;
+                       attr->exclude_callchain_user = 1;
+               }
+       }
+
        if (perf_target__has_cpu(&opts->target))
                attr->sample_type       |= PERF_SAMPLE_CPU;
 
@@ -481,6 +493,7 @@ void perf_evsel__delete(struct perf_evsel *evsel)
 {
        perf_evsel__exit(evsel);
        close_cgroup(evsel->cgrp);
+       free(evsel->group_name);
        free(evsel->name);
        free(evsel);
 }
@@ -556,9 +569,28 @@ int __perf_evsel__read(struct perf_evsel *evsel,
        return 0;
 }
 
+static int get_group_fd(struct perf_evsel *evsel, int cpu, int thread)
+{
+       struct perf_evsel *leader = evsel->leader;
+       int fd;
+
+       if (!leader)
+               return -1;
+
+       /*
+        * The leader must already have been processed/opened;
+        * if not, it's a bug.
+        */
+       BUG_ON(!leader->fd);
+
+       fd = FD(leader, cpu, thread);
+       BUG_ON(fd == -1);
+
+       return fd;
+}
+
 static int __perf_evsel__open(struct perf_evsel *evsel, struct cpu_map *cpus,
-                             struct thread_map *threads, bool group,
-                             struct xyarray *group_fds)
+                             struct thread_map *threads)
 {
        int cpu, thread;
        unsigned long flags = 0;
@@ -574,13 +606,15 @@ static int __perf_evsel__open(struct perf_evsel *evsel, struct cpu_map *cpus,
        }
 
        for (cpu = 0; cpu < cpus->nr; cpu++) {
-               int group_fd = group_fds ? GROUP_FD(group_fds, cpu) : -1;
 
                for (thread = 0; thread < threads->nr; thread++) {
+                       int group_fd;
 
                        if (!evsel->cgrp)
                                pid = threads->map[thread];
 
+                       group_fd = get_group_fd(evsel, cpu, thread);
+
                        FD(evsel, cpu, thread) = sys_perf_event_open(&evsel->attr,
                                                                     pid,
                                                                     cpus->map[cpu],
@@ -589,9 +623,6 @@ static int __perf_evsel__open(struct perf_evsel *evsel, struct cpu_map *cpus,
                                err = -errno;
                                goto out_close;
                        }
-
-                       if (group && group_fd == -1)
-                               group_fd = FD(evsel, cpu, thread);
                }
        }
 
@@ -635,8 +666,7 @@ static struct {
 };
 
 int perf_evsel__open(struct perf_evsel *evsel, struct cpu_map *cpus,
-                    struct thread_map *threads, bool group,
-                    struct xyarray *group_fd)
+                    struct thread_map *threads)
 {
        if (cpus == NULL) {
                /* Work around old compiler warnings about strict aliasing */
@@ -646,23 +676,19 @@ int perf_evsel__open(struct perf_evsel *evsel, struct cpu_map *cpus,
        if (threads == NULL)
                threads = &empty_thread_map.map;
 
-       return __perf_evsel__open(evsel, cpus, threads, group, group_fd);
+       return __perf_evsel__open(evsel, cpus, threads);
 }
 
 int perf_evsel__open_per_cpu(struct perf_evsel *evsel,
-                            struct cpu_map *cpus, bool group,
-                            struct xyarray *group_fd)
+                            struct cpu_map *cpus)
 {
-       return __perf_evsel__open(evsel, cpus, &empty_thread_map.map, group,
-                                 group_fd);
+       return __perf_evsel__open(evsel, cpus, &empty_thread_map.map);
 }
 
 int perf_evsel__open_per_thread(struct perf_evsel *evsel,
-                               struct thread_map *threads, bool group,
-                               struct xyarray *group_fd)
+                               struct thread_map *threads)
 {
-       return __perf_evsel__open(evsel, &empty_cpu_map.map, threads, group,
-                                 group_fd);
+       return __perf_evsel__open(evsel, &empty_cpu_map.map, threads);
 }
 
 static int perf_event__parse_id_sample(const union perf_event *event, u64 type,
@@ -733,6 +759,7 @@ int perf_evsel__parse_sample(struct perf_evsel *evsel, union perf_event *event,
                             struct perf_sample *data, bool swapped)
 {
        u64 type = evsel->attr.sample_type;
+       u64 regs_user = evsel->attr.sample_regs_user;
        const u64 *array;
 
        /*
@@ -869,6 +896,32 @@ int perf_evsel__parse_sample(struct perf_evsel *evsel, union perf_event *event,
                sz /= sizeof(u64);
                array += sz;
        }
+
+       if (type & PERF_SAMPLE_REGS_USER) {
+               /* The first u64 tells us whether any regs are in the sample. */
+               u64 avail = *array++;
+
+               if (avail) {
+                       data->user_regs.regs = (u64 *)array;
+                       array += hweight_long(regs_user);
+               }
+       }
+
+       if (type & PERF_SAMPLE_STACK_USER) {
+               u64 size = *array++;
+
+               data->user_stack.offset = ((char *)(array - 1)
+                                         - (char *) event);
+
+               if (!size) {
+                       data->user_stack.size = 0;
+               } else {
+                       data->user_stack.data = (char *)array;
+                       array += size / sizeof(*array);
+                       data->user_stack.size = *array;
+               }
+       }
+
        return 0;
 }
 
index b559929983bbefd2e991fdf26e338fe588570a66..94f6ba16747f111e6fbce69eebdd0660f9202c7e 100644 (file)
@@ -53,9 +53,10 @@ struct perf_evsel {
        u64                     *id;
        struct perf_counts      *counts;
        int                     idx;
-       int                     ids;
+       u32                     ids;
        struct hists            hists;
        char                    *name;
+       struct event_format     *tp_format;
        union {
                void            *priv;
                off_t           id_offset;
@@ -67,6 +68,10 @@ struct perf_evsel {
        } handler;
        unsigned int            sample_size;
        bool                    supported;
+       /* parse modifier helper */
+       int                     exclude_GH;
+       struct perf_evsel       *leader;
+       char                    *group_name;
 };
 
 struct cpu_map;
@@ -106,14 +111,11 @@ void perf_evsel__free_id(struct perf_evsel *evsel);
 void perf_evsel__close_fd(struct perf_evsel *evsel, int ncpus, int nthreads);
 
 int perf_evsel__open_per_cpu(struct perf_evsel *evsel,
-                            struct cpu_map *cpus, bool group,
-                            struct xyarray *group_fds);
+                            struct cpu_map *cpus);
 int perf_evsel__open_per_thread(struct perf_evsel *evsel,
-                               struct thread_map *threads, bool group,
-                               struct xyarray *group_fds);
+                               struct thread_map *threads);
 int perf_evsel__open(struct perf_evsel *evsel, struct cpu_map *cpus,
-                    struct thread_map *threads, bool group,
-                    struct xyarray *group_fds);
+                    struct thread_map *threads);
 void perf_evsel__close(struct perf_evsel *evsel, int ncpus, int nthreads);
 
 #define perf_evsel__match(evsel, t, c)         \
@@ -182,4 +184,9 @@ void hists__init(struct hists *hists);
 
 int perf_evsel__parse_sample(struct perf_evsel *evsel, union perf_event *event,
                             struct perf_sample *sample, bool swapped);
+
+static inline struct perf_evsel *perf_evsel__next(struct perf_evsel *evsel)
+{
+       return list_entry(evsel->node.next, struct perf_evsel, node);
+}
 #endif /* __PERF_EVSEL_H */
index f06f6fd148f8e63b810a7095e403e6630f1ca7f8..389590c1ad21741b6eb8f6dac3db8c84551800db 100755 (executable)
@@ -21,4 +21,19 @@ do
            p
      }' "Documentation/perf-$cmd.txt"
 done
+
+echo "#ifndef NO_LIBELF_SUPPORT"
+sed -n -e 's/^perf-\([^        ]*\)[   ].* full.*/\1/p' command-list.txt |
+sort |
+while read cmd
+do
+     sed -n '
+     /^NAME/,/perf-'"$cmd"'/H
+     ${
+            x
+            s/.*perf-'"$cmd"' - \(.*\)/  {"'"$cmd"'", "\1"},/
+           p
+     }' "Documentation/perf-$cmd.txt"
+done
+echo "#endif /* NO_LIBELF_SUPPORT */"
 echo "};"
index 74ea3c2f81382c2881d9eda1e1cfb451b2a138f7..9696e64c9dbda5ee729688a8e054237877bdc18e 100644 (file)
 #include "symbol.h"
 #include "debug.h"
 #include "cpumap.h"
+#include "pmu.h"
 
 static bool no_buildid_cache = false;
 
-static int event_count;
-static struct perf_trace_event_type *events;
+static int trace_event_count;
+static struct perf_trace_event_type *trace_events;
 
 static u32 header_argc;
 static const char **header_argv;
@@ -36,24 +37,24 @@ int perf_header__push_event(u64 id, const char *name)
        if (strlen(name) > MAX_EVENT_NAME)
                pr_warning("Event %s will be truncated\n", name);
 
-       nevents = realloc(events, (event_count + 1) * sizeof(*events));
+       nevents = realloc(trace_events, (trace_event_count + 1) * sizeof(*trace_events));
        if (nevents == NULL)
                return -ENOMEM;
-       events = nevents;
+       trace_events = nevents;
 
-       memset(&events[event_count], 0, sizeof(struct perf_trace_event_type));
-       events[event_count].event_id = id;
-       strncpy(events[event_count].name, name, MAX_EVENT_NAME - 1);
-       event_count++;
+       memset(&trace_events[trace_event_count], 0, sizeof(struct perf_trace_event_type));
+       trace_events[trace_event_count].event_id = id;
+       strncpy(trace_events[trace_event_count].name, name, MAX_EVENT_NAME - 1);
+       trace_event_count++;
        return 0;
 }
 
 char *perf_header__find_event(u64 id)
 {
        int i;
-       for (i = 0 ; i < event_count; i++) {
-               if (events[i].event_id == id)
-                       return events[i].name;
+       for (i = 0 ; i < trace_event_count; i++) {
+               if (trace_events[i].event_id == id)
+                       return trace_events[i].name;
        }
        return NULL;
 }
@@ -608,11 +609,11 @@ static int write_nrcpus(int fd, struct perf_header *h __used,
 static int write_event_desc(int fd, struct perf_header *h __used,
                            struct perf_evlist *evlist)
 {
-       struct perf_evsel *attr;
+       struct perf_evsel *evsel;
        u32 nre = 0, nri, sz;
        int ret;
 
-       list_for_each_entry(attr, &evlist->entries, node)
+       list_for_each_entry(evsel, &evlist->entries, node)
                nre++;
 
        /*
@@ -625,14 +626,14 @@ static int write_event_desc(int fd, struct perf_header *h __used,
        /*
         * size of perf_event_attr struct
         */
-       sz = (u32)sizeof(attr->attr);
+       sz = (u32)sizeof(evsel->attr);
        ret = do_write(fd, &sz, sizeof(sz));
        if (ret < 0)
                return ret;
 
-       list_for_each_entry(attr, &evlist->entries, node) {
+       list_for_each_entry(evsel, &evlist->entries, node) {
 
-               ret = do_write(fd, &attr->attr, sz);
+               ret = do_write(fd, &evsel->attr, sz);
                if (ret < 0)
                        return ret;
                /*
@@ -642,7 +643,7 @@ static int write_event_desc(int fd, struct perf_header *h __used,
                 * copy into an nri to be independent of the
                 * type of ids,
                 */
-               nri = attr->ids;
+               nri = evsel->ids;
                ret = do_write(fd, &nri, sizeof(nri));
                if (ret < 0)
                        return ret;
@@ -650,13 +651,13 @@ static int write_event_desc(int fd, struct perf_header *h __used,
                /*
                 * write event string as passed on cmdline
                 */
-               ret = do_write_string(fd, perf_evsel__name(attr));
+               ret = do_write_string(fd, perf_evsel__name(evsel));
                if (ret < 0)
                        return ret;
                /*
                 * write unique ids for this event
                 */
-               ret = do_write(fd, attr->id, attr->ids * sizeof(u64));
+               ret = do_write(fd, evsel->id, evsel->ids * sizeof(u64));
                if (ret < 0)
                        return ret;
        }
@@ -1003,6 +1004,45 @@ done:
        return ret;
 }
 
+/*
+ * File format:
+ *
+ * struct pmu_mappings {
+ *     u32     pmu_num;
+ *     struct pmu_map {
+ *             u32     type;
+ *             char    name[];
+ *     }[pmu_num];
+ * };
+ */
+
+static int write_pmu_mappings(int fd, struct perf_header *h __used,
+                             struct perf_evlist *evlist __used)
+{
+       struct perf_pmu *pmu = NULL;
+       off_t offset = lseek(fd, 0, SEEK_CUR);
+       __u32 pmu_num = 0;
+
+       /* write real pmu_num later */
+       do_write(fd, &pmu_num, sizeof(pmu_num));
+
+       while ((pmu = perf_pmu__scan(pmu))) {
+               if (!pmu->name)
+                       continue;
+               pmu_num++;
+               do_write(fd, &pmu->type, sizeof(pmu->type));
+               do_write_string(fd, pmu->name);
+       }
+
+       if (pwrite(fd, &pmu_num, sizeof(pmu_num), offset) != sizeof(pmu_num)) {
+               /* discard all */
+               lseek(fd, offset, SEEK_SET);
+               return -1;
+       }
+
+       return 0;
+}
+
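
write_pmu_mappings() cannot know pmu_num up front, so it writes a placeholder, streams the records, then patches the real count back with pwrite() at the remembered offset, leaving the file position untouched. The reserve-then-backpatch pattern in isolation (hypothetical file name, error handling trimmed):

#include <fcntl.h>
#include <stdint.h>
#include <unistd.h>

int main(void)
{
        int fd = open("mappings.bin", O_RDWR | O_CREAT | O_TRUNC, 0644);
        uint32_t count = 0, type;
        off_t at = lseek(fd, 0, SEEK_CUR);      /* where the count will live */

        write(fd, &count, sizeof(count));       /* placeholder for now */

        for (type = 4; type < 7; type++) {      /* pretend these are PMUs */
                write(fd, &type, sizeof(type));
                count++;
        }

        pwrite(fd, &count, sizeof(count), at);  /* backpatch; file offset unmoved */
        close(fd);
        return 0;
}
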
 /*
  * default get_cpuid(): nothing gets recorded
  * actual implementation must be in arch/$(ARCH)/util/header.c
@@ -1148,12 +1188,29 @@ static void print_cpu_topology(struct perf_header *ph, int fd, FILE *fp)
        }
 }
 
-static void print_event_desc(struct perf_header *ph, int fd, FILE *fp)
+static void free_event_desc(struct perf_evsel *events)
 {
-       struct perf_event_attr attr;
-       uint64_t id;
+       struct perf_evsel *evsel;
+
+       if (!events)
+               return;
+
+       for (evsel = events; evsel->attr.size; evsel++) {
+               if (evsel->name)
+                       free(evsel->name);
+               if (evsel->id)
+                       free(evsel->id);
+       }
+
+       free(events);
+}
+
+static struct perf_evsel *
+read_event_desc(struct perf_header *ph, int fd)
+{
+       struct perf_evsel *evsel, *events = NULL;
+       u64 *id;
        void *buf = NULL;
-       char *str;
        u32 nre, sz, nr, i, j;
        ssize_t ret;
        size_t msz;
@@ -1173,18 +1230,22 @@ static void print_event_desc(struct perf_header *ph, int fd, FILE *fp)
        if (ph->needs_swap)
                sz = bswap_32(sz);
 
-       memset(&attr, 0, sizeof(attr));
-
        /* buffer to hold on file attr struct */
        buf = malloc(sz);
        if (!buf)
                goto error;
 
-       msz = sizeof(attr);
+       /* the last event terminates with evsel->attr.size == 0: */
+       events = calloc(nre + 1, sizeof(*events));
+       if (!events)
+               goto error;
+
+       msz = sizeof(evsel->attr);
        if (sz < msz)
                msz = sz;
 
-       for (i = 0 ; i < nre; i++) {
+       for (i = 0, evsel = events; i < nre; evsel++, i++) {
+               evsel->idx = i;
 
                /*
                 * must read entire on-file attr struct to
@@ -1197,7 +1258,7 @@ static void print_event_desc(struct perf_header *ph, int fd, FILE *fp)
                if (ph->needs_swap)
                        perf_event__attr_swap(buf);
 
-               memcpy(&attr, buf, msz);
+               memcpy(&evsel->attr, buf, msz);
 
                ret = read(fd, &nr, sizeof(nr));
                if (ret != (ssize_t)sizeof(nr))
@@ -1206,51 +1267,82 @@ static void print_event_desc(struct perf_header *ph, int fd, FILE *fp)
                if (ph->needs_swap)
                        nr = bswap_32(nr);
 
-               str = do_read_string(fd, ph);
-               fprintf(fp, "# event : name = %s, ", str);
-               free(str);
+               evsel->name = do_read_string(fd, ph);
+
+               if (!nr)
+                       continue;
+
+               id = calloc(nr, sizeof(*id));
+               if (!id)
+                       goto error;
+               evsel->ids = nr;
+               evsel->id = id;
+
+               for (j = 0 ; j < nr; j++) {
+                       ret = read(fd, id, sizeof(*id));
+                       if (ret != (ssize_t)sizeof(*id))
+                               goto error;
+                       if (ph->needs_swap)
+                               *id = bswap_64(*id);
+                       id++;
+               }
+       }
+out:
+       if (buf)
+               free(buf);
+       return events;
+error:
+       if (events)
+               free_event_desc(events);
+       events = NULL;
+       goto out;
+}
+
+static void print_event_desc(struct perf_header *ph, int fd, FILE *fp)
+{
+       struct perf_evsel *evsel, *events = read_event_desc(ph, fd);
+       u32 j;
+       u64 *id;
+
+       if (!events) {
+               fprintf(fp, "# event desc: not available or unable to read\n");
+               return;
+       }
+
+       for (evsel = events; evsel->attr.size; evsel++) {
+               fprintf(fp, "# event : name = %s, ", evsel->name);
 
                fprintf(fp, "type = %d, config = 0x%"PRIx64
                            ", config1 = 0x%"PRIx64", config2 = 0x%"PRIx64,
-                               attr.type,
-                               (u64)attr.config,
-                               (u64)attr.config1,
-                               (u64)attr.config2);
+                               evsel->attr.type,
+                               (u64)evsel->attr.config,
+                               (u64)evsel->attr.config1,
+                               (u64)evsel->attr.config2);
 
                fprintf(fp, ", excl_usr = %d, excl_kern = %d",
-                               attr.exclude_user,
-                               attr.exclude_kernel);
+                               evsel->attr.exclude_user,
+                               evsel->attr.exclude_kernel);
 
                fprintf(fp, ", excl_host = %d, excl_guest = %d",
-                               attr.exclude_host,
-                               attr.exclude_guest);
+                               evsel->attr.exclude_host,
+                               evsel->attr.exclude_guest);
 
-               fprintf(fp, ", precise_ip = %d", attr.precise_ip);
+               fprintf(fp, ", precise_ip = %d", evsel->attr.precise_ip);
 
-               if (nr)
+               if (evsel->ids) {
                        fprintf(fp, ", id = {");
-
-               for (j = 0 ; j < nr; j++) {
-                       ret = read(fd, &id, sizeof(id));
-                       if (ret != (ssize_t)sizeof(id))
-                               goto error;
-
-                       if (ph->needs_swap)
-                               id = bswap_64(id);
-
-                       if (j)
-                               fputc(',', fp);
-
-                       fprintf(fp, " %"PRIu64, id);
-               }
-               if (nr && j == nr)
+                       for (j = 0, id = evsel->id; j < evsel->ids; j++, id++) {
+                               if (j)
+                                       fputc(',', fp);
+                               fprintf(fp, " %"PRIu64, *id);
+                       }
                        fprintf(fp, " }");
+               }
+
                fputc('\n', fp);
        }
-       free(buf);
-       return;
-error:
-       fprintf(fp, "# event desc: not available or unable to read\n");
+
+       free_event_desc(events);
 }
 
 static void print_total_mem(struct perf_header *h __used, int fd, FILE *fp)
@@ -1337,6 +1429,43 @@ static void print_branch_stack(struct perf_header *ph __used, int fd __used,
        fprintf(fp, "# contains samples with branch stack\n");
 }
 
+static void print_pmu_mappings(struct perf_header *ph, int fd, FILE *fp)
+{
+       const char *delimiter = "# pmu mappings: ";
+       char *name;
+       int ret;
+       u32 pmu_num;
+       u32 type;
+
+       ret = read(fd, &pmu_num, sizeof(pmu_num));
+       if (ret != sizeof(pmu_num))
+               goto error;
+
+       if (!pmu_num) {
+               fprintf(fp, "# pmu mappings: not available\n");
+               return;
+       }
+
+       while (pmu_num) {
+               if (read(fd, &type, sizeof(type)) != sizeof(type))
+                       break;
+               name = do_read_string(fd, ph);
+               if (!name)
+                       break;
+               pmu_num--;
+               fprintf(fp, "%s%s = %" PRIu32, delimiter, name, type);
+               free(name);
+               delimiter = ", ";
+       }
+
+       fprintf(fp, "\n");
+
+       if (!pmu_num)
+               return;
+error:
+       fprintf(fp, "# pmu mappings: unable to read\n");
+}
+
 static int __event_process_build_id(struct build_id_event *bev,
                                    char *filename,
                                    struct perf_session *session)
@@ -1504,6 +1633,56 @@ static int process_build_id(struct perf_file_section *section,
        return 0;
 }
 
+static struct perf_evsel *
+perf_evlist__find_by_index(struct perf_evlist *evlist, int idx)
+{
+       struct perf_evsel *evsel;
+
+       list_for_each_entry(evsel, &evlist->entries, node) {
+               if (evsel->idx == idx)
+                       return evsel;
+       }
+
+       return NULL;
+}
+
+static void
+perf_evlist__set_event_name(struct perf_evlist *evlist, struct perf_evsel *event)
+{
+       struct perf_evsel *evsel;
+
+       if (!event->name)
+               return;
+
+       evsel = perf_evlist__find_by_index(evlist, event->idx);
+       if (!evsel)
+               return;
+
+       if (evsel->name)
+               return;
+
+       evsel->name = strdup(event->name);
+}
+
+static int
+process_event_desc(struct perf_file_section *section __used,
+                  struct perf_header *header, int feat __used, int fd,
+                  void *data __used)
+{
+       struct perf_session *session = container_of(header, struct perf_session, header);
+       struct perf_evsel *evsel, *events = read_event_desc(header, fd);
+
+       if (!events)
+               return 0;
+
+       for (evsel = events; evsel->attr.size; evsel++)
+               perf_evlist__set_event_name(session->evlist, evsel);
+
+       free_event_desc(events);
+
+       return 0;
+}
+
 struct feature_ops {
        int (*write)(int fd, struct perf_header *h, struct perf_evlist *evlist);
        void (*print)(struct perf_header *h, int fd, FILE *fp);
@@ -1537,11 +1716,12 @@ static const struct feature_ops feat_ops[HEADER_LAST_FEATURE] = {
        FEAT_OPA(HEADER_CPUDESC,        cpudesc),
        FEAT_OPA(HEADER_CPUID,          cpuid),
        FEAT_OPA(HEADER_TOTAL_MEM,      total_mem),
-       FEAT_OPA(HEADER_EVENT_DESC,     event_desc),
+       FEAT_OPP(HEADER_EVENT_DESC,     event_desc),
        FEAT_OPA(HEADER_CMDLINE,        cmdline),
        FEAT_OPF(HEADER_CPU_TOPOLOGY,   cpu_topology),
        FEAT_OPF(HEADER_NUMA_TOPOLOGY,  numa_topology),
        FEAT_OPA(HEADER_BRANCH_STACK,   branch_stack),
+       FEAT_OPA(HEADER_PMU_MAPPINGS,   pmu_mappings),
 };
 
 struct header_print_data {
@@ -1683,17 +1863,17 @@ int perf_session__write_header(struct perf_session *session,
        struct perf_file_header f_header;
        struct perf_file_attr   f_attr;
        struct perf_header *header = &session->header;
-       struct perf_evsel *attr, *pair = NULL;
+       struct perf_evsel *evsel, *pair = NULL;
        int err;
 
        lseek(fd, sizeof(f_header), SEEK_SET);
 
        if (session->evlist != evlist)
-               pair = list_entry(session->evlist->entries.next, struct perf_evsel, node);
+               pair = perf_evlist__first(session->evlist);
 
-       list_for_each_entry(attr, &evlist->entries, node) {
-               attr->id_offset = lseek(fd, 0, SEEK_CUR);
-               err = do_write(fd, attr->id, attr->ids * sizeof(u64));
+       list_for_each_entry(evsel, &evlist->entries, node) {
+               evsel->id_offset = lseek(fd, 0, SEEK_CUR);
+               err = do_write(fd, evsel->id, evsel->ids * sizeof(u64));
                if (err < 0) {
 out_err_write:
                        pr_debug("failed to write perf header\n");
@@ -1703,19 +1883,19 @@ out_err_write:
                        err = do_write(fd, pair->id, pair->ids * sizeof(u64));
                        if (err < 0)
                                goto out_err_write;
-                       attr->ids += pair->ids;
-                       pair = list_entry(pair->node.next, struct perf_evsel, node);
+                       evsel->ids += pair->ids;
+                       pair = perf_evsel__next(pair);
                }
        }
 
        header->attr_offset = lseek(fd, 0, SEEK_CUR);
 
-       list_for_each_entry(attr, &evlist->entries, node) {
+       list_for_each_entry(evsel, &evlist->entries, node) {
                f_attr = (struct perf_file_attr){
-                       .attr = attr->attr,
+                       .attr = evsel->attr,
                        .ids  = {
-                               .offset = attr->id_offset,
-                               .size   = attr->ids * sizeof(u64),
+                               .offset = evsel->id_offset,
+                               .size   = evsel->ids * sizeof(u64),
                        }
                };
                err = do_write(fd, &f_attr, sizeof(f_attr));
@@ -1726,9 +1906,9 @@ out_err_write:
        }
 
        header->event_offset = lseek(fd, 0, SEEK_CUR);
-       header->event_size = event_count * sizeof(struct perf_trace_event_type);
-       if (events) {
-               err = do_write(fd, events, header->event_size);
+       header->event_size = trace_event_count * sizeof(struct perf_trace_event_type);
+       if (trace_events) {
+               err = do_write(fd, trace_events, header->event_size);
                if (err < 0) {
                        pr_debug("failed to write perf header events\n");
                        return err;
@@ -1829,6 +2009,8 @@ out_free:
 static const int attr_file_abi_sizes[] = {
        [0] = PERF_ATTR_SIZE_VER0,
        [1] = PERF_ATTR_SIZE_VER1,
+       [2] = PERF_ATTR_SIZE_VER2,
+       [3] = PERF_ATTR_SIZE_VER3,
        0,
 };
 
@@ -2123,6 +2305,7 @@ static int perf_evsel__set_tracepoint_name(struct perf_evsel *evsel,
        if (event->name == NULL)
                return -1;
 
+       evsel->tp_format = event;
        return 0;
 }
 
@@ -2207,13 +2390,13 @@ int perf_session__read_header(struct perf_session *session, int fd)
 
        if (f_header.event_types.size) {
                lseek(fd, f_header.event_types.offset, SEEK_SET);
-               events = malloc(f_header.event_types.size);
-               if (events == NULL)
+               trace_events = malloc(f_header.event_types.size);
+               if (trace_events == NULL)
                        return -ENOMEM;
-               if (perf_header__getbuffer64(header, fd, events,
+               if (perf_header__getbuffer64(header, fd, trace_events,
                                             f_header.event_types.size))
                        goto out_errno;
-               event_count =  f_header.event_types.size / sizeof(struct perf_trace_event_type);
+               trace_event_count =  f_header.event_types.size / sizeof(struct perf_trace_event_type);
        }
 
        perf_header__process_sections(header, fd, &session->pevent,
@@ -2236,7 +2419,7 @@ out_delete_evlist:
 }
 
 int perf_event__synthesize_attr(struct perf_tool *tool,
-                               struct perf_event_attr *attr, u16 ids, u64 *id,
+                               struct perf_event_attr *attr, u32 ids, u64 *id,
                                perf_event__handler_t process)
 {
        union perf_event *ev;
@@ -2257,9 +2440,12 @@ int perf_event__synthesize_attr(struct perf_tool *tool,
        memcpy(ev->attr.id, id, ids * sizeof(u64));
 
        ev->attr.header.type = PERF_RECORD_HEADER_ATTR;
-       ev->attr.header.size = size;
+       ev->attr.header.size = (u16)size;
 
-       err = process(tool, ev, NULL, NULL);
+       if (ev->attr.header.size == size)
+               err = process(tool, ev, NULL, NULL);
+       else
+               err = -E2BIG;
 
        free(ev);
 
@@ -2270,12 +2456,12 @@ int perf_event__synthesize_attrs(struct perf_tool *tool,
                                   struct perf_session *session,
                                   perf_event__handler_t process)
 {
-       struct perf_evsel *attr;
+       struct perf_evsel *evsel;
        int err = 0;
 
-       list_for_each_entry(attr, &session->evlist->entries, node) {
-               err = perf_event__synthesize_attr(tool, &attr->attr, attr->ids,
-                                                 attr->id, process);
+       list_for_each_entry(evsel, &session->evlist->entries, node) {
+               err = perf_event__synthesize_attr(tool, &evsel->attr, evsel->ids,
+                                                 evsel->id, process);
                if (err) {
                        pr_debug("failed to create perf header attribute\n");
                        return err;
@@ -2288,7 +2474,7 @@ int perf_event__synthesize_attrs(struct perf_tool *tool,
 int perf_event__process_attr(union perf_event *event,
                             struct perf_evlist **pevlist)
 {
-       unsigned int i, ids, n_ids;
+       u32 i, ids, n_ids;
        struct perf_evsel *evsel;
        struct perf_evlist *evlist = *pevlist;
 
@@ -2355,8 +2541,8 @@ int perf_event__synthesize_event_types(struct perf_tool *tool,
        struct perf_trace_event_type *type;
        int i, err = 0;
 
-       for (i = 0; i < event_count; i++) {
-               type = &events[i];
+       for (i = 0; i < trace_event_count; i++) {
+               type = &trace_events[i];
 
                err = perf_event__synthesize_event_type(tool, type->event_id,
                                                        type->name, process,
@@ -2452,6 +2638,8 @@ int perf_event__process_tracing_data(union perf_event *event,
        if (size_read + padding != size)
                die("tracing data size mismatch");
 
+       perf_evlist__set_tracepoint_names(session->evlist, session->pevent);
+
        return size_read + padding;
 }
 
index 2d42b3e1826ff998534b3f949b48fc7c902bc0a1..9d5eedceda72bb14186e34c6926ac624027d3409 100644 (file)
@@ -28,6 +28,7 @@ enum {
        HEADER_CPU_TOPOLOGY,
        HEADER_NUMA_TOPOLOGY,
        HEADER_BRANCH_STACK,
+       HEADER_PMU_MAPPINGS,
        HEADER_LAST_FEATURE,
        HEADER_FEAT_BITS        = 256,
 };
@@ -99,7 +100,7 @@ int build_id_cache__add_s(const char *sbuild_id, const char *debugdir,
 int build_id_cache__remove_s(const char *sbuild_id, const char *debugdir);
 
 int perf_event__synthesize_attr(struct perf_tool *tool,
-                               struct perf_event_attr *attr, u16 ids, u64 *id,
+                               struct perf_event_attr *attr, u32 ids, u64 *id,
                                perf_event__handler_t process);
 int perf_event__synthesize_attrs(struct perf_tool *tool,
                                 struct perf_session *session,
index f247ef2789a4d77ffe4ee1e9c9d981d065775c3f..b1817f15bb87340afdf32a2835d3e288ec88acc8 100644 (file)
@@ -45,7 +45,7 @@ bool hists__new_col_len(struct hists *hists, enum hist_column col, u16 len)
        return false;
 }
 
-static void hists__reset_col_len(struct hists *hists)
+void hists__reset_col_len(struct hists *hists)
 {
        enum hist_column col;
 
@@ -63,7 +63,7 @@ static void hists__set_unres_dso_col_len(struct hists *hists, int dso)
                hists__set_col_len(hists, dso, unresolved_col_width);
 }
 
-static void hists__calc_col_len(struct hists *hists, struct hist_entry *h)
+void hists__calc_col_len(struct hists *hists, struct hist_entry *h)
 {
        const unsigned int unresolved_col_width = BITS_PER_LONG / 4;
        u16 len;
@@ -114,6 +114,22 @@ static void hists__calc_col_len(struct hists *hists, struct hist_entry *h)
        }
 }
 
+void hists__output_recalc_col_len(struct hists *hists, int max_rows)
+{
+       struct rb_node *next = rb_first(&hists->entries);
+       struct hist_entry *n;
+       int row = 0;
+
+       hists__reset_col_len(hists);
+
+       while (next && row++ < max_rows) {
+               n = rb_entry(next, struct hist_entry, rb_node);
+               if (!n->filtered)
+                       hists__calc_col_len(hists, n);
+               next = rb_next(&n->rb_node);
+       }
+}
+
 static void hist_entry__add_cpumode_period(struct hist_entry *he,
                                           unsigned int cpumode, u64 period)
 {
@@ -547,641 +563,6 @@ void hists__output_resort_threaded(struct hists *hists)
        return __hists__output_resort(hists, true);
 }
 
-static size_t callchain__fprintf_left_margin(FILE *fp, int left_margin)
-{
-       int i;
-       int ret = fprintf(fp, "            ");
-
-       for (i = 0; i < left_margin; i++)
-               ret += fprintf(fp, " ");
-
-       return ret;
-}
-
-static size_t ipchain__fprintf_graph_line(FILE *fp, int depth, int depth_mask,
-                                         int left_margin)
-{
-       int i;
-       size_t ret = callchain__fprintf_left_margin(fp, left_margin);
-
-       for (i = 0; i < depth; i++)
-               if (depth_mask & (1 << i))
-                       ret += fprintf(fp, "|          ");
-               else
-                       ret += fprintf(fp, "           ");
-
-       ret += fprintf(fp, "\n");
-
-       return ret;
-}
-
-static size_t ipchain__fprintf_graph(FILE *fp, struct callchain_list *chain,
-                                    int depth, int depth_mask, int period,
-                                    u64 total_samples, u64 hits,
-                                    int left_margin)
-{
-       int i;
-       size_t ret = 0;
-
-       ret += callchain__fprintf_left_margin(fp, left_margin);
-       for (i = 0; i < depth; i++) {
-               if (depth_mask & (1 << i))
-                       ret += fprintf(fp, "|");
-               else
-                       ret += fprintf(fp, " ");
-               if (!period && i == depth - 1) {
-                       double percent;
-
-                       percent = hits * 100.0 / total_samples;
-                       ret += percent_color_fprintf(fp, "--%2.2f%%-- ", percent);
-               } else
-                       ret += fprintf(fp, "%s", "          ");
-       }
-       if (chain->ms.sym)
-               ret += fprintf(fp, "%s\n", chain->ms.sym->name);
-       else
-               ret += fprintf(fp, "0x%0" PRIx64 "\n", chain->ip);
-
-       return ret;
-}
-
-static struct symbol *rem_sq_bracket;
-static struct callchain_list rem_hits;
-
-static void init_rem_hits(void)
-{
-       rem_sq_bracket = malloc(sizeof(*rem_sq_bracket) + 6);
-       if (!rem_sq_bracket) {
-               fprintf(stderr, "Not enough memory to display remaining hits\n");
-               return;
-       }
-
-       strcpy(rem_sq_bracket->name, "[...]");
-       rem_hits.ms.sym = rem_sq_bracket;
-}
-
-static size_t __callchain__fprintf_graph(FILE *fp, struct rb_root *root,
-                                        u64 total_samples, int depth,
-                                        int depth_mask, int left_margin)
-{
-       struct rb_node *node, *next;
-       struct callchain_node *child;
-       struct callchain_list *chain;
-       int new_depth_mask = depth_mask;
-       u64 remaining;
-       size_t ret = 0;
-       int i;
-       uint entries_printed = 0;
-
-       remaining = total_samples;
-
-       node = rb_first(root);
-       while (node) {
-               u64 new_total;
-               u64 cumul;
-
-               child = rb_entry(node, struct callchain_node, rb_node);
-               cumul = callchain_cumul_hits(child);
-               remaining -= cumul;
-
-               /*
-                * The depth mask manages the output of the pipes that show
-                * the depth. We don't want to keep the pipes of the current
-                * level for the last child of this depth, except if we have
-                * remaining filtered hits: they will supersede the last child.
-                */
-               next = rb_next(node);
-               if (!next && (callchain_param.mode != CHAIN_GRAPH_REL || !remaining))
-                       new_depth_mask &= ~(1 << (depth - 1));
-
-               /*
-                * But we keep the old depth mask for the line separator,
-                * to keep the level link until we reach the last child.
-                */
-               ret += ipchain__fprintf_graph_line(fp, depth, depth_mask,
-                                                  left_margin);
-               i = 0;
-               list_for_each_entry(chain, &child->val, list) {
-                       ret += ipchain__fprintf_graph(fp, chain, depth,
-                                                     new_depth_mask, i++,
-                                                     total_samples,
-                                                     cumul,
-                                                     left_margin);
-               }
-
-               if (callchain_param.mode == CHAIN_GRAPH_REL)
-                       new_total = child->children_hit;
-               else
-                       new_total = total_samples;
-
-               ret += __callchain__fprintf_graph(fp, &child->rb_root, new_total,
-                                                 depth + 1,
-                                                 new_depth_mask | (1 << depth),
-                                                 left_margin);
-               node = next;
-               if (++entries_printed == callchain_param.print_limit)
-                       break;
-       }
-
-       if (callchain_param.mode == CHAIN_GRAPH_REL &&
-               remaining && remaining != total_samples) {
-
-               if (!rem_sq_bracket)
-                       return ret;
-
-               new_depth_mask &= ~(1 << (depth - 1));
-               ret += ipchain__fprintf_graph(fp, &rem_hits, depth,
-                                             new_depth_mask, 0, total_samples,
-                                             remaining, left_margin);
-       }
-
-       return ret;
-}
-
-static size_t callchain__fprintf_graph(FILE *fp, struct rb_root *root,
-                                      u64 total_samples, int left_margin)
-{
-       struct callchain_node *cnode;
-       struct callchain_list *chain;
-       u32 entries_printed = 0;
-       bool printed = false;
-       struct rb_node *node;
-       int i = 0;
-       int ret = 0;
-
-       /*
-        * If we have a single callchain root, don't bother printing
-        * its percentage (100% in fractal mode, and the same percentage
-        * as the hist in graph mode). This also avoids one level of column.
-        */
-       node = rb_first(root);
-       if (node && !rb_next(node)) {
-               cnode = rb_entry(node, struct callchain_node, rb_node);
-               list_for_each_entry(chain, &cnode->val, list) {
-                       /*
-                        * If we sort by symbol, the first entry is the same as
-                        * the symbol itself, so there is no need to print it;
-                        * otherwise it would appear to be displayed twice.
-                        */
-                       if (!i++ && sort__first_dimension == SORT_SYM)
-                               continue;
-                       if (!printed) {
-                               ret += callchain__fprintf_left_margin(fp, left_margin);
-                               ret += fprintf(fp, "|\n");
-                               ret += callchain__fprintf_left_margin(fp, left_margin);
-                               ret += fprintf(fp, "---");
-                               left_margin += 3;
-                               printed = true;
-                       } else
-                               ret += callchain__fprintf_left_margin(fp, left_margin);
-
-                       if (chain->ms.sym)
-                               ret += fprintf(fp, " %s\n", chain->ms.sym->name);
-                       else
-                               ret += fprintf(fp, " %p\n", (void *)(long)chain->ip);
-
-                       if (++entries_printed == callchain_param.print_limit)
-                               break;
-               }
-               root = &cnode->rb_root;
-       }
-
-       ret += __callchain__fprintf_graph(fp, root, total_samples,
-                                         1, 1, left_margin);
-       ret += fprintf(fp, "\n");
-
-       return ret;
-}
-
-static size_t __callchain__fprintf_flat(FILE *fp,
-                                       struct callchain_node *self,
-                                       u64 total_samples)
-{
-       struct callchain_list *chain;
-       size_t ret = 0;
-
-       if (!self)
-               return 0;
-
-       ret += __callchain__fprintf_flat(fp, self->parent, total_samples);
-
-
-       list_for_each_entry(chain, &self->val, list) {
-               if (chain->ip >= PERF_CONTEXT_MAX)
-                       continue;
-               if (chain->ms.sym)
-                       ret += fprintf(fp, "                %s\n", chain->ms.sym->name);
-               else
-                       ret += fprintf(fp, "                %p\n",
-                                       (void *)(long)chain->ip);
-       }
-
-       return ret;
-}
-
-static size_t callchain__fprintf_flat(FILE *fp, struct rb_root *self,
-                                     u64 total_samples)
-{
-       size_t ret = 0;
-       u32 entries_printed = 0;
-       struct rb_node *rb_node;
-       struct callchain_node *chain;
-
-       rb_node = rb_first(self);
-       while (rb_node) {
-               double percent;
-
-               chain = rb_entry(rb_node, struct callchain_node, rb_node);
-               percent = chain->hit * 100.0 / total_samples;
-
-               ret = percent_color_fprintf(fp, "           %6.2f%%\n", percent);
-               ret += __callchain__fprintf_flat(fp, chain, total_samples);
-               ret += fprintf(fp, "\n");
-               if (++entries_printed == callchain_param.print_limit)
-                       break;
-
-               rb_node = rb_next(rb_node);
-       }
-
-       return ret;
-}
-
-static size_t hist_entry_callchain__fprintf(struct hist_entry *he,
-                                           u64 total_samples, int left_margin,
-                                           FILE *fp)
-{
-       switch (callchain_param.mode) {
-       case CHAIN_GRAPH_REL:
-               return callchain__fprintf_graph(fp, &he->sorted_chain, he->period,
-                                               left_margin);
-               break;
-       case CHAIN_GRAPH_ABS:
-               return callchain__fprintf_graph(fp, &he->sorted_chain, total_samples,
-                                               left_margin);
-               break;
-       case CHAIN_FLAT:
-               return callchain__fprintf_flat(fp, &he->sorted_chain, total_samples);
-               break;
-       case CHAIN_NONE:
-               break;
-       default:
-               pr_err("Bad callchain mode\n");
-       }
-
-       return 0;
-}
-
-void hists__output_recalc_col_len(struct hists *hists, int max_rows)
-{
-       struct rb_node *next = rb_first(&hists->entries);
-       struct hist_entry *n;
-       int row = 0;
-
-       hists__reset_col_len(hists);
-
-       while (next && row++ < max_rows) {
-               n = rb_entry(next, struct hist_entry, rb_node);
-               if (!n->filtered)
-                       hists__calc_col_len(hists, n);
-               next = rb_next(&n->rb_node);
-       }
-}
-
-static int hist_entry__pcnt_snprintf(struct hist_entry *he, char *s,
-                                    size_t size, struct hists *pair_hists,
-                                    bool show_displacement, long displacement,
-                                    bool color, u64 total_period)
-{
-       u64 period, total, period_sys, period_us, period_guest_sys, period_guest_us;
-       u64 nr_events;
-       const char *sep = symbol_conf.field_sep;
-       int ret;
-
-       if (symbol_conf.exclude_other && !he->parent)
-               return 0;
-
-       if (pair_hists) {
-               period = he->pair ? he->pair->period : 0;
-               nr_events = he->pair ? he->pair->nr_events : 0;
-               total = pair_hists->stats.total_period;
-               period_sys = he->pair ? he->pair->period_sys : 0;
-               period_us = he->pair ? he->pair->period_us : 0;
-               period_guest_sys = he->pair ? he->pair->period_guest_sys : 0;
-               period_guest_us = he->pair ? he->pair->period_guest_us : 0;
-       } else {
-               period = he->period;
-               nr_events = he->nr_events;
-               total = total_period;
-               period_sys = he->period_sys;
-               period_us = he->period_us;
-               period_guest_sys = he->period_guest_sys;
-               period_guest_us = he->period_guest_us;
-       }
-
-       if (total) {
-               if (color)
-                       ret = percent_color_snprintf(s, size,
-                                                    sep ? "%.2f" : "   %6.2f%%",
-                                                    (period * 100.0) / total);
-               else
-                       ret = scnprintf(s, size, sep ? "%.2f" : "   %6.2f%%",
-                                      (period * 100.0) / total);
-               if (symbol_conf.show_cpu_utilization) {
-                       ret += percent_color_snprintf(s + ret, size - ret,
-                                       sep ? "%.2f" : "   %6.2f%%",
-                                       (period_sys * 100.0) / total);
-                       ret += percent_color_snprintf(s + ret, size - ret,
-                                       sep ? "%.2f" : "   %6.2f%%",
-                                       (period_us * 100.0) / total);
-                       if (perf_guest) {
-                               ret += percent_color_snprintf(s + ret,
-                                               size - ret,
-                                               sep ? "%.2f" : "   %6.2f%%",
-                                               (period_guest_sys * 100.0) /
-                                                               total);
-                               ret += percent_color_snprintf(s + ret,
-                                               size - ret,
-                                               sep ? "%.2f" : "   %6.2f%%",
-                                               (period_guest_us * 100.0) /
-                                                               total);
-                       }
-               }
-       } else
-               ret = scnprintf(s, size, sep ? "%" PRIu64 : "%12" PRIu64 " ", period);
-
-       if (symbol_conf.show_nr_samples) {
-               if (sep)
-                       ret += scnprintf(s + ret, size - ret, "%c%" PRIu64, *sep, nr_events);
-               else
-                       ret += scnprintf(s + ret, size - ret, "%11" PRIu64, nr_events);
-       }
-
-       if (symbol_conf.show_total_period) {
-               if (sep)
-                       ret += scnprintf(s + ret, size - ret, "%c%" PRIu64, *sep, period);
-               else
-                       ret += scnprintf(s + ret, size - ret, " %12" PRIu64, period);
-       }
-
-       if (pair_hists) {
-               char bf[32];
-               double old_percent = 0, new_percent = 0, diff;
-
-               if (total > 0)
-                       old_percent = (period * 100.0) / total;
-               if (total_period > 0)
-                       new_percent = (he->period * 100.0) / total_period;
-
-               diff = new_percent - old_percent;
-
-               if (fabs(diff) >= 0.01)
-                       scnprintf(bf, sizeof(bf), "%+4.2F%%", diff);
-               else
-                       scnprintf(bf, sizeof(bf), " ");
-
-               if (sep)
-                       ret += scnprintf(s + ret, size - ret, "%c%s", *sep, bf);
-               else
-                       ret += scnprintf(s + ret, size - ret, "%11.11s", bf);
-
-               if (show_displacement) {
-                       if (displacement)
-                               scnprintf(bf, sizeof(bf), "%+4ld", displacement);
-                       else
-                               scnprintf(bf, sizeof(bf), " ");
-
-                       if (sep)
-                               ret += scnprintf(s + ret, size - ret, "%c%s", *sep, bf);
-                       else
-                               ret += scnprintf(s + ret, size - ret, "%6.6s", bf);
-               }
-       }
-
-       return ret;
-}
-
-int hist_entry__snprintf(struct hist_entry *he, char *s, size_t size,
-                        struct hists *hists)
-{
-       const char *sep = symbol_conf.field_sep;
-       struct sort_entry *se;
-       int ret = 0;
-
-       list_for_each_entry(se, &hist_entry__sort_list, list) {
-               if (se->elide)
-                       continue;
-
-               ret += scnprintf(s + ret, size - ret, "%s", sep ?: "  ");
-               ret += se->se_snprintf(he, s + ret, size - ret,
-                                      hists__col_len(hists, se->se_width_idx));
-       }
-
-       return ret;
-}
-
-static int hist_entry__fprintf(struct hist_entry *he, size_t size,
-                              struct hists *hists, struct hists *pair_hists,
-                              bool show_displacement, long displacement,
-                              u64 total_period, FILE *fp)
-{
-       char bf[512];
-       int ret;
-
-       if (size == 0 || size > sizeof(bf))
-               size = sizeof(bf);
-
-       ret = hist_entry__pcnt_snprintf(he, bf, size, pair_hists,
-                                       show_displacement, displacement,
-                                       true, total_period);
-       hist_entry__snprintf(he, bf + ret, size - ret, hists);
-       return fprintf(fp, "%s\n", bf);
-}
-
-static size_t hist_entry__fprintf_callchain(struct hist_entry *he,
-                                           struct hists *hists,
-                                           u64 total_period, FILE *fp)
-{
-       int left_margin = 0;
-
-       if (sort__first_dimension == SORT_COMM) {
-               struct sort_entry *se = list_first_entry(&hist_entry__sort_list,
-                                                        typeof(*se), list);
-               left_margin = hists__col_len(hists, se->se_width_idx);
-               left_margin -= thread__comm_len(he->thread);
-       }
-
-       return hist_entry_callchain__fprintf(he, total_period, left_margin, fp);
-}
-
-size_t hists__fprintf(struct hists *hists, struct hists *pair,
-                     bool show_displacement, bool show_header, int max_rows,
-                     int max_cols, FILE *fp)
-{
-       struct sort_entry *se;
-       struct rb_node *nd;
-       size_t ret = 0;
-       u64 total_period;
-       unsigned long position = 1;
-       long displacement = 0;
-       unsigned int width;
-       const char *sep = symbol_conf.field_sep;
-       const char *col_width = symbol_conf.col_width_list_str;
-       int nr_rows = 0;
-
-       init_rem_hits();
-
-       if (!show_header)
-               goto print_entries;
-
-       fprintf(fp, "# %s", pair ? "Baseline" : "Overhead");
-
-       if (symbol_conf.show_cpu_utilization) {
-               if (sep) {
-                       ret += fprintf(fp, "%csys", *sep);
-                       ret += fprintf(fp, "%cus", *sep);
-                       if (perf_guest) {
-                               ret += fprintf(fp, "%cguest sys", *sep);
-                               ret += fprintf(fp, "%cguest us", *sep);
-                       }
-               } else {
-                       ret += fprintf(fp, "     sys  ");
-                       ret += fprintf(fp, "      us  ");
-                       if (perf_guest) {
-                               ret += fprintf(fp, "  guest sys  ");
-                               ret += fprintf(fp, "  guest us  ");
-                       }
-               }
-       }
-
-       if (symbol_conf.show_nr_samples) {
-               if (sep)
-                       fprintf(fp, "%cSamples", *sep);
-               else
-                       fputs("  Samples  ", fp);
-       }
-
-       if (symbol_conf.show_total_period) {
-               if (sep)
-                       ret += fprintf(fp, "%cPeriod", *sep);
-               else
-                       ret += fprintf(fp, "   Period    ");
-       }
-
-       if (pair) {
-               if (sep)
-                       ret += fprintf(fp, "%cDelta", *sep);
-               else
-                       ret += fprintf(fp, "  Delta    ");
-
-               if (show_displacement) {
-                       if (sep)
-                               ret += fprintf(fp, "%cDisplacement", *sep);
-                       else
-                               ret += fprintf(fp, " Displ");
-               }
-       }
-
-       list_for_each_entry(se, &hist_entry__sort_list, list) {
-               if (se->elide)
-                       continue;
-               if (sep) {
-                       fprintf(fp, "%c%s", *sep, se->se_header);
-                       continue;
-               }
-               width = strlen(se->se_header);
-               if (symbol_conf.col_width_list_str) {
-                       if (col_width) {
-                               hists__set_col_len(hists, se->se_width_idx,
-                                                  atoi(col_width));
-                               col_width = strchr(col_width, ',');
-                               if (col_width)
-                                       ++col_width;
-                       }
-               }
-               if (!hists__new_col_len(hists, se->se_width_idx, width))
-                       width = hists__col_len(hists, se->se_width_idx);
-               fprintf(fp, "  %*s", width, se->se_header);
-       }
-
-       fprintf(fp, "\n");
-       if (max_rows && ++nr_rows >= max_rows)
-               goto out;
-
-       if (sep)
-               goto print_entries;
-
-       fprintf(fp, "# ........");
-       if (symbol_conf.show_cpu_utilization)
-               fprintf(fp, "   .......   .......");
-       if (symbol_conf.show_nr_samples)
-               fprintf(fp, " ..........");
-       if (symbol_conf.show_total_period)
-               fprintf(fp, " ............");
-       if (pair) {
-               fprintf(fp, " ..........");
-               if (show_displacement)
-                       fprintf(fp, " .....");
-       }
-       list_for_each_entry(se, &hist_entry__sort_list, list) {
-               unsigned int i;
-
-               if (se->elide)
-                       continue;
-
-               fprintf(fp, "  ");
-               width = hists__col_len(hists, se->se_width_idx);
-               if (width == 0)
-                       width = strlen(se->se_header);
-               for (i = 0; i < width; i++)
-                       fprintf(fp, ".");
-       }
-
-       fprintf(fp, "\n");
-       if (max_rows && ++nr_rows >= max_rows)
-               goto out;
-
-       fprintf(fp, "#\n");
-       if (max_rows && ++nr_rows >= max_rows)
-               goto out;
-
-print_entries:
-       total_period = hists->stats.total_period;
-
-       for (nd = rb_first(&hists->entries); nd; nd = rb_next(nd)) {
-               struct hist_entry *h = rb_entry(nd, struct hist_entry, rb_node);
-
-               if (h->filtered)
-                       continue;
-
-               if (show_displacement) {
-                       if (h->pair != NULL)
-                               displacement = ((long)h->pair->position -
-                                               (long)position);
-                       else
-                               displacement = 0;
-                       ++position;
-               }
-               ret += hist_entry__fprintf(h, max_cols, hists, pair, show_displacement,
-                                          displacement, total_period, fp);
-
-               if (symbol_conf.use_callchain)
-                       ret += hist_entry__fprintf_callchain(h, hists, total_period, fp);
-               if (max_rows && ++nr_rows >= max_rows)
-                       goto out;
-
-               if (h->ms.map == NULL && verbose > 1) {
-                       __map_groups__fprintf_maps(&h->thread->mg,
-                                                  MAP__FUNCTION, verbose, fp);
-                       fprintf(fp, "%.10s end\n", graph_dotted_line);
-               }
-       }
-out:
-       free(rem_sq_bracket);
-
-       return ret;
-}
-
 /*
  * See hists__fprintf to match the column widths
  */
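
This hunk (together with the hists__fprintf_nr_events removal further down) strips all of the stdio formatting code out of util/hist.c: the flat and graph callchain printers, hist_entry__fprintf() and hists__fprintf() itself. Since hists__fprintf() is still referenced by the comment kept just above, the code is presumably being relocated into UI-specific files rather than deleted, leaving hist.c as pure data-model code. One detail worth keeping in mind from the removed graph printer is the depth-mask trick, one bit per callchain level deciding whether a '|' column is still drawn; a minimal worked example:

	int depth_mask = 0x7;            /* levels 1..3 all have siblings below */
	depth_mask &= ~(1 << (3 - 1));   /* last child at depth 3: drop its pipe */
	/* depth_mask is now 0x3: deeper lines draw '|' only under levels 1, 2 */
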
@@ -1342,25 +723,3 @@ void hists__inc_nr_events(struct hists *hists, u32 type)
        ++hists->stats.nr_events[0];
        ++hists->stats.nr_events[type];
 }
-
-size_t hists__fprintf_nr_events(struct hists *hists, FILE *fp)
-{
-       int i;
-       size_t ret = 0;
-
-       for (i = 0; i < PERF_RECORD_HEADER_MAX; ++i) {
-               const char *name;
-
-               if (hists->stats.nr_events[i] == 0)
-                       continue;
-
-               name = perf_event__name(i);
-               if (!strcmp(name, "UNKNOWN"))
-                       continue;
-
-               ret += fprintf(fp, "%16s events: %10d\n", name,
-                              hists->stats.nr_events[i]);
-       }
-
-       return ret;
-}
index 0b096c27a419bb46969e75fe8590f65dbde149be..2e650ffb7d23b211159f52a22c3c9616bb609014 100644 (file)
@@ -75,8 +75,8 @@ struct hist_entry *__hists__add_entry(struct hists *self,
                                      struct symbol *parent, u64 period);
 int64_t hist_entry__cmp(struct hist_entry *left, struct hist_entry *right);
 int64_t hist_entry__collapse(struct hist_entry *left, struct hist_entry *right);
-int hist_entry__snprintf(struct hist_entry *self, char *bf, size_t size,
-                        struct hists *hists);
+int hist_entry__sort_snprintf(struct hist_entry *self, char *bf, size_t size,
+                             struct hists *hists);
 void hist_entry__free(struct hist_entry *);
 
 struct hist_entry *__hists__add_branch_entry(struct hists *self,
@@ -112,6 +112,8 @@ void hists__filter_by_symbol(struct hists *hists);
 u16 hists__col_len(struct hists *self, enum hist_column col);
 void hists__set_col_len(struct hists *self, enum hist_column col, u16 len);
 bool hists__new_col_len(struct hists *self, enum hist_column col, u16 len);
+void hists__reset_col_len(struct hists *hists);
+void hists__calc_col_len(struct hists *hists, struct hist_entry *he);
 
 struct perf_evlist;
 
index 547628e97f3d2ac4aacf85485f8e5d790575c042..2dc867128e4665759a82f34fe6a48b234819fbc9 100644 (file)
@@ -10,5 +10,6 @@
 #endif
 
 #define __used         __attribute__((__unused__))
+#define __packed       __attribute__((__packed__))
 
 #endif
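
__packed joins the existing __used shorthand; both merely abbreviate GCC attribute spellings. A hypothetical use, packing an on-disk record so the compiler inserts no padding:

	struct on_disk_rec {
		u64 offset;
		u16 flags;
		u8  tag;
	} __packed;		/* sizeof() == 11: no alignment padding */
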
index cc33486ad9e25b80e8486a4c60fce41d03a60b15..7d37159c1e9951f022ebe4b0f1aa4973ef50c2af 100644 (file)
@@ -86,6 +86,25 @@ out_delete:
        return NULL;
 }
 
+/*
+ * Constructor variant for modules (where we know from /proc/modules where
+ * they are loaded) and for vmlinux, where we only know where it starts and
+ * ends once all of the symbols have been loaded.
+ */
+struct map *map__new2(u64 start, struct dso *dso, enum map_type type)
+{
+       struct map *map = calloc(1, (sizeof(*map) +
+                                    (dso->kernel ? sizeof(struct kmap) : 0)));
+       if (map != NULL) {
+               /*
+                * ->end will be filled after we load all the symbols
+                */
+               map__init(map, type, start, 0, 0, dso);
+       }
+
+       return map;
+}
+
 void map__delete(struct map *self)
 {
        free(self);
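
map__new2() over-allocates so that, for kernel DSOs, the struct kmap lives in the same allocation right behind the struct map, and it deliberately leaves ->end at 0 until symbol loading can establish it. A usage sketch, assuming the map__kmap() accessor and with machine_maps standing in for whatever map_groups the caller tracks:

	struct map *map = map__new2(0, kernel_dso, MAP__FUNCTION);

	if (map != NULL)
		map__kmap(map)->kmaps = machine_maps;	/* kmap sits after *map */
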
@@ -137,6 +156,7 @@ int map__load(struct map *self, symbol_filter_t filter)
                pr_warning(", continuing without symbols\n");
                return -1;
        } else if (nr == 0) {
+#ifndef NO_LIBELF_SUPPORT
                const size_t len = strlen(name);
                const size_t real_len = len - sizeof(DSO__DELETED);
 
@@ -149,7 +169,7 @@ int map__load(struct map *self, symbol_filter_t filter)
                        pr_warning("no symbols found in %s, maybe install "
                                   "a debug package?\n", name);
                }
-
+#endif
                return -1;
        }
        /*
@@ -242,14 +262,6 @@ u64 map__rip_2objdump(struct map *map, u64 rip)
        return addr;
 }
 
-u64 map__objdump_2ip(struct map *map, u64 addr)
-{
-       u64 ip = map->dso->adjust_symbols ?
-                       addr :
-                       map->unmap_ip(map, addr);       /* RIP -> IP */
-       return ip;
-}
-
 void map_groups__init(struct map_groups *mg)
 {
        int i;
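
map__objdump_2ip() is dropped as dead code; only the forward direction survives, translating a sample ip into the address space objdump expects (sample_ip below is a placeholder):

	/* the remaining direction: ip -> objdump --start-address value */
	u64 addr = map__rip_2objdump(map, sample_ip);
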
index 03a1e9b08b21a81f4cbdcc7834eb6f6bb7e6c8d1..25ab4cdbc446b4346be8e59c347f40853a5c4ffc 100644 (file)
@@ -104,7 +104,6 @@ static inline u64 identity__map_ip(struct map *map __used, u64 ip)
 
 /* rip/ip <-> addr suitable for passing to `objdump --start-address=` */
 u64 map__rip_2objdump(struct map *map, u64 rip);
-u64 map__objdump_2ip(struct map *map, u64 addr);
 
 struct symbol;
 
@@ -115,6 +114,7 @@ void map__init(struct map *self, enum map_type type,
 struct map *map__new(struct list_head *dsos__list, u64 start, u64 len,
                     u64 pgoff, u32 pid, char *filename,
                     enum map_type type);
+struct map *map__new2(u64 start, struct dso *dso, enum map_type type);
 void map__delete(struct map *self);
 struct map *map__clone(struct map *self);
 int map__overlap(struct map *l, struct map *r);
@@ -157,9 +157,12 @@ int machine__init(struct machine *self, const char *root_dir, pid_t pid);
 void machine__exit(struct machine *self);
 void machine__delete(struct machine *self);
 
+struct perf_evsel;
+struct perf_sample;
 int machine__resolve_callchain(struct machine *machine,
+                              struct perf_evsel *evsel,
                               struct thread *thread,
-                              struct ip_callchain *chain,
+                              struct perf_sample *sample,
                               struct symbol **parent);
 int maps__set_kallsyms_ref_reloc_sym(struct map **maps, const char *symbol_name,
                                     u64 addr);
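
machine__resolve_callchain() now receives the evsel and the whole perf_sample instead of just the ip_callchain, so the resolver can use anything the sample carries. Call sites presumably change along these lines (thread and parent as in the existing callers):

	err = machine__resolve_callchain(machine, evsel, al.thread,
					 sample, &parent);
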
index 607dd290b31967bd85db631b9ae5a2ef30221896..042e930238f432f40c45225997fd7d0829f77817 100644 (file)
@@ -18,8 +18,7 @@ do { \
 
 static int test__checkevent_tracepoint(struct perf_evlist *evlist)
 {
-       struct perf_evsel *evsel = list_entry(evlist->entries.next,
-                                             struct perf_evsel, node);
+       struct perf_evsel *evsel = perf_evlist__first(evlist);
 
        TEST_ASSERT_VAL("wrong number of entries", 1 == evlist->nr_entries);
        TEST_ASSERT_VAL("wrong type", PERF_TYPE_TRACEPOINT == evsel->attr.type);
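
All of the list_entry() boilerplate below collapses into two small accessors. Their definitions are not part of this diff, but the likely shape is exactly the expression they replace:

	static inline struct perf_evsel *perf_evlist__first(struct perf_evlist *evlist)
	{
		return list_entry(evlist->entries.next, struct perf_evsel, node);
	}

	static inline struct perf_evsel *perf_evsel__next(struct perf_evsel *evsel)
	{
		return list_entry(evsel->node.next, struct perf_evsel, node);
	}
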
@@ -48,8 +47,7 @@ static int test__checkevent_tracepoint_multi(struct perf_evlist *evlist)
 
 static int test__checkevent_raw(struct perf_evlist *evlist)
 {
-       struct perf_evsel *evsel = list_entry(evlist->entries.next,
-                                             struct perf_evsel, node);
+       struct perf_evsel *evsel = perf_evlist__first(evlist);
 
        TEST_ASSERT_VAL("wrong number of entries", 1 == evlist->nr_entries);
        TEST_ASSERT_VAL("wrong type", PERF_TYPE_RAW == evsel->attr.type);
@@ -59,8 +57,7 @@ static int test__checkevent_raw(struct perf_evlist *evlist)
 
 static int test__checkevent_numeric(struct perf_evlist *evlist)
 {
-       struct perf_evsel *evsel = list_entry(evlist->entries.next,
-                                             struct perf_evsel, node);
+       struct perf_evsel *evsel = perf_evlist__first(evlist);
 
        TEST_ASSERT_VAL("wrong number of entries", 1 == evlist->nr_entries);
        TEST_ASSERT_VAL("wrong type", 1 == evsel->attr.type);
@@ -70,8 +67,7 @@ static int test__checkevent_numeric(struct perf_evlist *evlist)
 
 static int test__checkevent_symbolic_name(struct perf_evlist *evlist)
 {
-       struct perf_evsel *evsel = list_entry(evlist->entries.next,
-                                             struct perf_evsel, node);
+       struct perf_evsel *evsel = perf_evlist__first(evlist);
 
        TEST_ASSERT_VAL("wrong number of entries", 1 == evlist->nr_entries);
        TEST_ASSERT_VAL("wrong type", PERF_TYPE_HARDWARE == evsel->attr.type);
@@ -82,8 +78,7 @@ static int test__checkevent_symbolic_name(struct perf_evlist *evlist)
 
 static int test__checkevent_symbolic_name_config(struct perf_evlist *evlist)
 {
-       struct perf_evsel *evsel = list_entry(evlist->entries.next,
-                                             struct perf_evsel, node);
+       struct perf_evsel *evsel = perf_evlist__first(evlist);
 
        TEST_ASSERT_VAL("wrong number of entries", 1 == evlist->nr_entries);
        TEST_ASSERT_VAL("wrong type", PERF_TYPE_HARDWARE == evsel->attr.type);
@@ -100,8 +95,7 @@ static int test__checkevent_symbolic_name_config(struct perf_evlist *evlist)
 
 static int test__checkevent_symbolic_alias(struct perf_evlist *evlist)
 {
-       struct perf_evsel *evsel = list_entry(evlist->entries.next,
-                                             struct perf_evsel, node);
+       struct perf_evsel *evsel = perf_evlist__first(evlist);
 
        TEST_ASSERT_VAL("wrong number of entries", 1 == evlist->nr_entries);
        TEST_ASSERT_VAL("wrong type", PERF_TYPE_SOFTWARE == evsel->attr.type);
@@ -112,8 +106,7 @@ static int test__checkevent_symbolic_alias(struct perf_evlist *evlist)
 
 static int test__checkevent_genhw(struct perf_evlist *evlist)
 {
-       struct perf_evsel *evsel = list_entry(evlist->entries.next,
-                                             struct perf_evsel, node);
+       struct perf_evsel *evsel = perf_evlist__first(evlist);
 
        TEST_ASSERT_VAL("wrong number of entries", 1 == evlist->nr_entries);
        TEST_ASSERT_VAL("wrong type", PERF_TYPE_HW_CACHE == evsel->attr.type);
@@ -123,8 +116,7 @@ static int test__checkevent_genhw(struct perf_evlist *evlist)
 
 static int test__checkevent_breakpoint(struct perf_evlist *evlist)
 {
-       struct perf_evsel *evsel = list_entry(evlist->entries.next,
-                                             struct perf_evsel, node);
+       struct perf_evsel *evsel = perf_evlist__first(evlist);
 
        TEST_ASSERT_VAL("wrong number of entries", 1 == evlist->nr_entries);
        TEST_ASSERT_VAL("wrong type", PERF_TYPE_BREAKPOINT == evsel->attr.type);
@@ -138,8 +130,7 @@ static int test__checkevent_breakpoint(struct perf_evlist *evlist)
 
 static int test__checkevent_breakpoint_x(struct perf_evlist *evlist)
 {
-       struct perf_evsel *evsel = list_entry(evlist->entries.next,
-                                             struct perf_evsel, node);
+       struct perf_evsel *evsel = perf_evlist__first(evlist);
 
        TEST_ASSERT_VAL("wrong number of entries", 1 == evlist->nr_entries);
        TEST_ASSERT_VAL("wrong type", PERF_TYPE_BREAKPOINT == evsel->attr.type);
@@ -152,8 +143,7 @@ static int test__checkevent_breakpoint_x(struct perf_evlist *evlist)
 
 static int test__checkevent_breakpoint_r(struct perf_evlist *evlist)
 {
-       struct perf_evsel *evsel = list_entry(evlist->entries.next,
-                                             struct perf_evsel, node);
+       struct perf_evsel *evsel = perf_evlist__first(evlist);
 
        TEST_ASSERT_VAL("wrong number of entries", 1 == evlist->nr_entries);
        TEST_ASSERT_VAL("wrong type",
@@ -168,8 +158,7 @@ static int test__checkevent_breakpoint_r(struct perf_evlist *evlist)
 
 static int test__checkevent_breakpoint_w(struct perf_evlist *evlist)
 {
-       struct perf_evsel *evsel = list_entry(evlist->entries.next,
-                                             struct perf_evsel, node);
+       struct perf_evsel *evsel = perf_evlist__first(evlist);
 
        TEST_ASSERT_VAL("wrong number of entries", 1 == evlist->nr_entries);
        TEST_ASSERT_VAL("wrong type",
@@ -184,8 +173,7 @@ static int test__checkevent_breakpoint_w(struct perf_evlist *evlist)
 
 static int test__checkevent_breakpoint_rw(struct perf_evlist *evlist)
 {
-       struct perf_evsel *evsel = list_entry(evlist->entries.next,
-                                             struct perf_evsel, node);
+       struct perf_evsel *evsel = perf_evlist__first(evlist);
 
        TEST_ASSERT_VAL("wrong number of entries", 1 == evlist->nr_entries);
        TEST_ASSERT_VAL("wrong type",
@@ -200,8 +188,7 @@ static int test__checkevent_breakpoint_rw(struct perf_evlist *evlist)
 
 static int test__checkevent_tracepoint_modifier(struct perf_evlist *evlist)
 {
-       struct perf_evsel *evsel = list_entry(evlist->entries.next,
-                                             struct perf_evsel, node);
+       struct perf_evsel *evsel = perf_evlist__first(evlist);
 
        TEST_ASSERT_VAL("wrong exclude_user", evsel->attr.exclude_user);
        TEST_ASSERT_VAL("wrong exclude_kernel", !evsel->attr.exclude_kernel);
@@ -232,8 +219,7 @@ test__checkevent_tracepoint_multi_modifier(struct perf_evlist *evlist)
 
 static int test__checkevent_raw_modifier(struct perf_evlist *evlist)
 {
-       struct perf_evsel *evsel = list_entry(evlist->entries.next,
-                                             struct perf_evsel, node);
+       struct perf_evsel *evsel = perf_evlist__first(evlist);
 
        TEST_ASSERT_VAL("wrong exclude_user", evsel->attr.exclude_user);
        TEST_ASSERT_VAL("wrong exclude_kernel", !evsel->attr.exclude_kernel);
@@ -245,8 +231,7 @@ static int test__checkevent_raw_modifier(struct perf_evlist *evlist)
 
 static int test__checkevent_numeric_modifier(struct perf_evlist *evlist)
 {
-       struct perf_evsel *evsel = list_entry(evlist->entries.next,
-                                             struct perf_evsel, node);
+       struct perf_evsel *evsel = perf_evlist__first(evlist);
 
        TEST_ASSERT_VAL("wrong exclude_user", evsel->attr.exclude_user);
        TEST_ASSERT_VAL("wrong exclude_kernel", evsel->attr.exclude_kernel);
@@ -258,8 +243,7 @@ static int test__checkevent_numeric_modifier(struct perf_evlist *evlist)
 
 static int test__checkevent_symbolic_name_modifier(struct perf_evlist *evlist)
 {
-       struct perf_evsel *evsel = list_entry(evlist->entries.next,
-                                             struct perf_evsel, node);
+       struct perf_evsel *evsel = perf_evlist__first(evlist);
 
        TEST_ASSERT_VAL("wrong exclude_user", evsel->attr.exclude_user);
        TEST_ASSERT_VAL("wrong exclude_kernel", evsel->attr.exclude_kernel);
@@ -271,8 +255,7 @@ static int test__checkevent_symbolic_name_modifier(struct perf_evlist *evlist)
 
 static int test__checkevent_exclude_host_modifier(struct perf_evlist *evlist)
 {
-       struct perf_evsel *evsel = list_entry(evlist->entries.next,
-                                             struct perf_evsel, node);
+       struct perf_evsel *evsel = perf_evlist__first(evlist);
 
        TEST_ASSERT_VAL("wrong exclude guest", !evsel->attr.exclude_guest);
        TEST_ASSERT_VAL("wrong exclude host", evsel->attr.exclude_host);
@@ -282,8 +265,7 @@ static int test__checkevent_exclude_host_modifier(struct perf_evlist *evlist)
 
 static int test__checkevent_exclude_guest_modifier(struct perf_evlist *evlist)
 {
-       struct perf_evsel *evsel = list_entry(evlist->entries.next,
-                                             struct perf_evsel, node);
+       struct perf_evsel *evsel = perf_evlist__first(evlist);
 
        TEST_ASSERT_VAL("wrong exclude guest", evsel->attr.exclude_guest);
        TEST_ASSERT_VAL("wrong exclude host", !evsel->attr.exclude_host);
@@ -293,8 +275,7 @@ static int test__checkevent_exclude_guest_modifier(struct perf_evlist *evlist)
 
 static int test__checkevent_symbolic_alias_modifier(struct perf_evlist *evlist)
 {
-       struct perf_evsel *evsel = list_entry(evlist->entries.next,
-                                             struct perf_evsel, node);
+       struct perf_evsel *evsel = perf_evlist__first(evlist);
 
        TEST_ASSERT_VAL("wrong exclude_user", !evsel->attr.exclude_user);
        TEST_ASSERT_VAL("wrong exclude_kernel", evsel->attr.exclude_kernel);
@@ -306,8 +287,7 @@ static int test__checkevent_symbolic_alias_modifier(struct perf_evlist *evlist)
 
 static int test__checkevent_genhw_modifier(struct perf_evlist *evlist)
 {
-       struct perf_evsel *evsel = list_entry(evlist->entries.next,
-                                             struct perf_evsel, node);
+       struct perf_evsel *evsel = perf_evlist__first(evlist);
 
        TEST_ASSERT_VAL("wrong exclude_user", evsel->attr.exclude_user);
        TEST_ASSERT_VAL("wrong exclude_kernel", !evsel->attr.exclude_kernel);
@@ -319,75 +299,71 @@ static int test__checkevent_genhw_modifier(struct perf_evlist *evlist)
 
 static int test__checkevent_breakpoint_modifier(struct perf_evlist *evlist)
 {
-       struct perf_evsel *evsel = list_entry(evlist->entries.next,
-                                             struct perf_evsel, node);
+       struct perf_evsel *evsel = perf_evlist__first(evlist);
+
 
        TEST_ASSERT_VAL("wrong exclude_user", !evsel->attr.exclude_user);
        TEST_ASSERT_VAL("wrong exclude_kernel", evsel->attr.exclude_kernel);
        TEST_ASSERT_VAL("wrong exclude_hv", evsel->attr.exclude_hv);
        TEST_ASSERT_VAL("wrong precise_ip", !evsel->attr.precise_ip);
        TEST_ASSERT_VAL("wrong name",
-                       !strcmp(perf_evsel__name(evsel), "mem:0x0:rw:u"));
+                       !strcmp(perf_evsel__name(evsel), "mem:0:u"));
 
        return test__checkevent_breakpoint(evlist);
 }
 
 static int test__checkevent_breakpoint_x_modifier(struct perf_evlist *evlist)
 {
-       struct perf_evsel *evsel = list_entry(evlist->entries.next,
-                                             struct perf_evsel, node);
+       struct perf_evsel *evsel = perf_evlist__first(evlist);
 
        TEST_ASSERT_VAL("wrong exclude_user", evsel->attr.exclude_user);
        TEST_ASSERT_VAL("wrong exclude_kernel", !evsel->attr.exclude_kernel);
        TEST_ASSERT_VAL("wrong exclude_hv", evsel->attr.exclude_hv);
        TEST_ASSERT_VAL("wrong precise_ip", !evsel->attr.precise_ip);
        TEST_ASSERT_VAL("wrong name",
-                       !strcmp(perf_evsel__name(evsel), "mem:0x0:x:k"));
+                       !strcmp(perf_evsel__name(evsel), "mem:0:x:k"));
 
        return test__checkevent_breakpoint_x(evlist);
 }
 
 static int test__checkevent_breakpoint_r_modifier(struct perf_evlist *evlist)
 {
-       struct perf_evsel *evsel = list_entry(evlist->entries.next,
-                                             struct perf_evsel, node);
+       struct perf_evsel *evsel = perf_evlist__first(evlist);
 
        TEST_ASSERT_VAL("wrong exclude_user", evsel->attr.exclude_user);
        TEST_ASSERT_VAL("wrong exclude_kernel", evsel->attr.exclude_kernel);
        TEST_ASSERT_VAL("wrong exclude_hv", !evsel->attr.exclude_hv);
        TEST_ASSERT_VAL("wrong precise_ip", evsel->attr.precise_ip);
        TEST_ASSERT_VAL("wrong name",
-                       !strcmp(perf_evsel__name(evsel), "mem:0x0:r:hp"));
+                       !strcmp(perf_evsel__name(evsel), "mem:0:r:hp"));
 
        return test__checkevent_breakpoint_r(evlist);
 }
 
 static int test__checkevent_breakpoint_w_modifier(struct perf_evlist *evlist)
 {
-       struct perf_evsel *evsel = list_entry(evlist->entries.next,
-                                             struct perf_evsel, node);
+       struct perf_evsel *evsel = perf_evlist__first(evlist);
 
        TEST_ASSERT_VAL("wrong exclude_user", !evsel->attr.exclude_user);
        TEST_ASSERT_VAL("wrong exclude_kernel", evsel->attr.exclude_kernel);
        TEST_ASSERT_VAL("wrong exclude_hv", evsel->attr.exclude_hv);
        TEST_ASSERT_VAL("wrong precise_ip", evsel->attr.precise_ip);
        TEST_ASSERT_VAL("wrong name",
-                       !strcmp(perf_evsel__name(evsel), "mem:0x0:w:up"));
+                       !strcmp(perf_evsel__name(evsel), "mem:0:w:up"));
 
        return test__checkevent_breakpoint_w(evlist);
 }
 
 static int test__checkevent_breakpoint_rw_modifier(struct perf_evlist *evlist)
 {
-       struct perf_evsel *evsel = list_entry(evlist->entries.next,
-                                             struct perf_evsel, node);
+       struct perf_evsel *evsel = perf_evlist__first(evlist);
 
        TEST_ASSERT_VAL("wrong exclude_user", evsel->attr.exclude_user);
        TEST_ASSERT_VAL("wrong exclude_kernel", !evsel->attr.exclude_kernel);
        TEST_ASSERT_VAL("wrong exclude_hv", evsel->attr.exclude_hv);
        TEST_ASSERT_VAL("wrong precise_ip", evsel->attr.precise_ip);
        TEST_ASSERT_VAL("wrong name",
-                       !strcmp(perf_evsel__name(evsel), "mem:0x0:rw:kp"));
+                       !strcmp(perf_evsel__name(evsel), "mem:0:rw:kp"));
 
        return test__checkevent_breakpoint_rw(evlist);
 }
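
The expected breakpoint names change in two ways: a zero address loses its 0x prefix, and the default rw access is no longer echoed (explicit :r/:w/:x/:rw specs are kept, as the mem:0:rw:kp case above shows). The address part is consistent with printing through the '#' conversion flag, which only prepends 0x for nonzero values:

	printf("mem:%#llx:u\n", 0ULL);		/* "mem:0:u", not "mem:0x0:u" */
	printf("mem:%#llx:u\n", 0x10ULL);	/* "mem:0x10:u" */
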
@@ -395,8 +371,7 @@ static int test__checkevent_breakpoint_rw_modifier(struct perf_evlist *evlist)
 static int test__checkevent_pmu(struct perf_evlist *evlist)
 {
 
-       struct perf_evsel *evsel = list_entry(evlist->entries.next,
-                                             struct perf_evsel, node);
+       struct perf_evsel *evsel = perf_evlist__first(evlist);
 
        TEST_ASSERT_VAL("wrong number of entries", 1 == evlist->nr_entries);
        TEST_ASSERT_VAL("wrong type", PERF_TYPE_RAW == evsel->attr.type);
@@ -410,12 +385,11 @@ static int test__checkevent_pmu(struct perf_evlist *evlist)
 
 static int test__checkevent_list(struct perf_evlist *evlist)
 {
-       struct perf_evsel *evsel;
+       struct perf_evsel *evsel = perf_evlist__first(evlist);
 
        TEST_ASSERT_VAL("wrong number of entries", 3 == evlist->nr_entries);
 
        /* r1 */
-       evsel = list_entry(evlist->entries.next, struct perf_evsel, node);
        TEST_ASSERT_VAL("wrong type", PERF_TYPE_RAW == evsel->attr.type);
        TEST_ASSERT_VAL("wrong config", 1 == evsel->attr.config);
        TEST_ASSERT_VAL("wrong config1", 0 == evsel->attr.config1);
@@ -426,7 +400,7 @@ static int test__checkevent_list(struct perf_evlist *evlist)
        TEST_ASSERT_VAL("wrong precise_ip", !evsel->attr.precise_ip);
 
        /* syscalls:sys_enter_open:k */
-       evsel = list_entry(evsel->node.next, struct perf_evsel, node);
+       evsel = perf_evsel__next(evsel);
        TEST_ASSERT_VAL("wrong type", PERF_TYPE_TRACEPOINT == evsel->attr.type);
        TEST_ASSERT_VAL("wrong sample_type",
                PERF_TP_SAMPLE_TYPE == evsel->attr.sample_type);
@@ -437,7 +411,7 @@ static int test__checkevent_list(struct perf_evlist *evlist)
        TEST_ASSERT_VAL("wrong precise_ip", !evsel->attr.precise_ip);
 
        /* 1:1:hp */
-       evsel = list_entry(evsel->node.next, struct perf_evsel, node);
+       evsel = perf_evsel__next(evsel);
        TEST_ASSERT_VAL("wrong type", 1 == evsel->attr.type);
        TEST_ASSERT_VAL("wrong config", 1 == evsel->attr.config);
        TEST_ASSERT_VAL("wrong exclude_user", evsel->attr.exclude_user);
@@ -450,22 +424,21 @@ static int test__checkevent_list(struct perf_evlist *evlist)
 
 static int test__checkevent_pmu_name(struct perf_evlist *evlist)
 {
-       struct perf_evsel *evsel;
+       struct perf_evsel *evsel = perf_evlist__first(evlist);
 
        /* cpu/config=1,name=krava/u */
-       evsel = list_entry(evlist->entries.next, struct perf_evsel, node);
        TEST_ASSERT_VAL("wrong number of entries", 2 == evlist->nr_entries);
        TEST_ASSERT_VAL("wrong type", PERF_TYPE_RAW == evsel->attr.type);
        TEST_ASSERT_VAL("wrong config",  1 == evsel->attr.config);
        TEST_ASSERT_VAL("wrong name", !strcmp(perf_evsel__name(evsel), "krava"));
 
        /* cpu/config=2/u" */
-       evsel = list_entry(evsel->node.next, struct perf_evsel, node);
+       evsel = perf_evsel__next(evsel);
        TEST_ASSERT_VAL("wrong number of entries", 2 == evlist->nr_entries);
        TEST_ASSERT_VAL("wrong type", PERF_TYPE_RAW == evsel->attr.type);
        TEST_ASSERT_VAL("wrong config",  2 == evsel->attr.config);
        TEST_ASSERT_VAL("wrong name",
-                       !strcmp(perf_evsel__name(evsel), "raw 0x2:u"));
+                       !strcmp(perf_evsel__name(evsel), "cpu/config=2/u"));
 
        return 0;
 }
@@ -513,6 +486,280 @@ static int test__checkterms_simple(struct list_head *terms)
        return 0;
 }
 
+static int test__group1(struct perf_evlist *evlist)
+{
+       struct perf_evsel *evsel, *leader;
+
+       TEST_ASSERT_VAL("wrong number of entries", 2 == evlist->nr_entries);
+
+       /* instructions:k */
+       evsel = leader = perf_evlist__first(evlist);
+       TEST_ASSERT_VAL("wrong type", PERF_TYPE_HARDWARE == evsel->attr.type);
+       TEST_ASSERT_VAL("wrong config",
+                       PERF_COUNT_HW_INSTRUCTIONS == evsel->attr.config);
+       TEST_ASSERT_VAL("wrong exclude_user", evsel->attr.exclude_user);
+       TEST_ASSERT_VAL("wrong exclude_kernel", !evsel->attr.exclude_kernel);
+       TEST_ASSERT_VAL("wrong exclude_hv", evsel->attr.exclude_hv);
+       TEST_ASSERT_VAL("wrong exclude guest", !evsel->attr.exclude_guest);
+       TEST_ASSERT_VAL("wrong exclude host", !evsel->attr.exclude_host);
+       TEST_ASSERT_VAL("wrong precise_ip", !evsel->attr.precise_ip);
+       TEST_ASSERT_VAL("wrong leader", evsel->leader == NULL);
+
+       /* cycles:upp */
+       evsel = perf_evsel__next(evsel);
+       TEST_ASSERT_VAL("wrong type", PERF_TYPE_HARDWARE == evsel->attr.type);
+       TEST_ASSERT_VAL("wrong config",
+                       PERF_COUNT_HW_CPU_CYCLES == evsel->attr.config);
+       TEST_ASSERT_VAL("wrong exclude_user", !evsel->attr.exclude_user);
+       TEST_ASSERT_VAL("wrong exclude_kernel", evsel->attr.exclude_kernel);
+       TEST_ASSERT_VAL("wrong exclude_hv", evsel->attr.exclude_hv);
+       TEST_ASSERT_VAL("wrong exclude guest", !evsel->attr.exclude_guest);
+       TEST_ASSERT_VAL("wrong exclude host", !evsel->attr.exclude_host);
+       TEST_ASSERT_VAL("wrong precise_ip", evsel->attr.precise_ip == 2);
+       TEST_ASSERT_VAL("wrong leader", evsel->leader == leader);
+
+       return 0;
+}
+
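
The new group tests pin down the parser's grouping semantics: each member's ->leader points at the first event of its group, while the leader's own ->leader is still NULL at this stage. Each test's comments spell out the spec being parsed; for test__group1 the driving table entry is presumably:

	/* hypothetical entry in the test table for test__group1 */
	{
		.name  = "{instructions:k,cycles:upp}",
		.check = test__group1,
	},
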
+static int test__group2(struct perf_evlist *evlist)
+{
+       struct perf_evsel *evsel, *leader;
+
+       TEST_ASSERT_VAL("wrong number of entries", 3 == evlist->nr_entries);
+
+       /* faults + :ku modifier */
+       evsel = leader = perf_evlist__first(evlist);
+       TEST_ASSERT_VAL("wrong type", PERF_TYPE_SOFTWARE == evsel->attr.type);
+       TEST_ASSERT_VAL("wrong config",
+                       PERF_COUNT_SW_PAGE_FAULTS == evsel->attr.config);
+       TEST_ASSERT_VAL("wrong exclude_user", !evsel->attr.exclude_user);
+       TEST_ASSERT_VAL("wrong exclude_kernel", !evsel->attr.exclude_kernel);
+       TEST_ASSERT_VAL("wrong exclude_hv", evsel->attr.exclude_hv);
+       TEST_ASSERT_VAL("wrong exclude guest", !evsel->attr.exclude_guest);
+       TEST_ASSERT_VAL("wrong exclude host", !evsel->attr.exclude_host);
+       TEST_ASSERT_VAL("wrong precise_ip", !evsel->attr.precise_ip);
+       TEST_ASSERT_VAL("wrong leader", evsel->leader == NULL);
+
+       /* cache-references + :u modifier */
+       evsel = perf_evsel__next(evsel);
+       TEST_ASSERT_VAL("wrong type", PERF_TYPE_HARDWARE == evsel->attr.type);
+       TEST_ASSERT_VAL("wrong config",
+                       PERF_COUNT_HW_CACHE_REFERENCES == evsel->attr.config);
+       TEST_ASSERT_VAL("wrong exclude_user", !evsel->attr.exclude_user);
+       TEST_ASSERT_VAL("wrong exclude_kernel", evsel->attr.exclude_kernel);
+       TEST_ASSERT_VAL("wrong exclude_hv", evsel->attr.exclude_hv);
+       TEST_ASSERT_VAL("wrong exclude guest", !evsel->attr.exclude_guest);
+       TEST_ASSERT_VAL("wrong exclude host", !evsel->attr.exclude_host);
+       TEST_ASSERT_VAL("wrong precise_ip", !evsel->attr.precise_ip);
+       TEST_ASSERT_VAL("wrong leader", evsel->leader == leader);
+
+       /* cycles:k */
+       evsel = perf_evsel__next(evsel);
+       TEST_ASSERT_VAL("wrong type", PERF_TYPE_HARDWARE == evsel->attr.type);
+       TEST_ASSERT_VAL("wrong config",
+                       PERF_COUNT_HW_CPU_CYCLES == evsel->attr.config);
+       TEST_ASSERT_VAL("wrong exclude_user", evsel->attr.exclude_user);
+       TEST_ASSERT_VAL("wrong exclude_kernel", !evsel->attr.exclude_kernel);
+       TEST_ASSERT_VAL("wrong exclude_hv", evsel->attr.exclude_hv);
+       TEST_ASSERT_VAL("wrong exclude guest", !evsel->attr.exclude_guest);
+       TEST_ASSERT_VAL("wrong exclude host", !evsel->attr.exclude_host);
+       TEST_ASSERT_VAL("wrong precise_ip", !evsel->attr.precise_ip);
+       TEST_ASSERT_VAL("wrong leader", evsel->leader == NULL);
+
+       return 0;
+}
+
+static int test__group3(struct perf_evlist *evlist __used)
+{
+       struct perf_evsel *evsel, *leader;
+
+       TEST_ASSERT_VAL("wrong number of entries", 5 == evlist->nr_entries);
+
+       /* group1 syscalls:sys_enter_open:H */
+       evsel = leader = perf_evlist__first(evlist);
+       TEST_ASSERT_VAL("wrong type", PERF_TYPE_TRACEPOINT == evsel->attr.type);
+       TEST_ASSERT_VAL("wrong sample_type",
+               PERF_TP_SAMPLE_TYPE == evsel->attr.sample_type);
+       TEST_ASSERT_VAL("wrong sample_period", 1 == evsel->attr.sample_period);
+       TEST_ASSERT_VAL("wrong exclude_user", !evsel->attr.exclude_user);
+       TEST_ASSERT_VAL("wrong exclude_kernel", !evsel->attr.exclude_kernel);
+       TEST_ASSERT_VAL("wrong exclude_hv", !evsel->attr.exclude_hv);
+       TEST_ASSERT_VAL("wrong exclude guest", evsel->attr.exclude_guest);
+       TEST_ASSERT_VAL("wrong exclude host", !evsel->attr.exclude_host);
+       TEST_ASSERT_VAL("wrong precise_ip", !evsel->attr.precise_ip);
+       TEST_ASSERT_VAL("wrong leader", evsel->leader == NULL);
+       TEST_ASSERT_VAL("wrong group name",
+               !strcmp(leader->group_name, "group1"));
+
+       /* group1 cycles:kppp */
+       evsel = perf_evsel__next(evsel);
+       TEST_ASSERT_VAL("wrong type", PERF_TYPE_HARDWARE == evsel->attr.type);
+       TEST_ASSERT_VAL("wrong config",
+                       PERF_COUNT_HW_CPU_CYCLES == evsel->attr.config);
+       TEST_ASSERT_VAL("wrong exclude_user", evsel->attr.exclude_user);
+       TEST_ASSERT_VAL("wrong exclude_kernel", !evsel->attr.exclude_kernel);
+       TEST_ASSERT_VAL("wrong exclude_hv", evsel->attr.exclude_hv);
+       TEST_ASSERT_VAL("wrong exclude guest", !evsel->attr.exclude_guest);
+       TEST_ASSERT_VAL("wrong exclude host", !evsel->attr.exclude_host);
+       TEST_ASSERT_VAL("wrong precise_ip", evsel->attr.precise_ip == 3);
+       TEST_ASSERT_VAL("wrong leader", evsel->leader == leader);
+       TEST_ASSERT_VAL("wrong group name", !evsel->group_name);
+
+       /* group2 cycles + G modifier */
+       evsel = leader = perf_evsel__next(evsel);
+       TEST_ASSERT_VAL("wrong type", PERF_TYPE_HARDWARE == evsel->attr.type);
+       TEST_ASSERT_VAL("wrong config",
+                       PERF_COUNT_HW_CPU_CYCLES == evsel->attr.config);
+       TEST_ASSERT_VAL("wrong exclude_user", !evsel->attr.exclude_user);
+       TEST_ASSERT_VAL("wrong exclude_kernel", !evsel->attr.exclude_kernel);
+       TEST_ASSERT_VAL("wrong exclude_hv", !evsel->attr.exclude_hv);
+       TEST_ASSERT_VAL("wrong exclude guest", !evsel->attr.exclude_guest);
+       TEST_ASSERT_VAL("wrong exclude host", evsel->attr.exclude_host);
+       TEST_ASSERT_VAL("wrong precise_ip", !evsel->attr.precise_ip);
+       TEST_ASSERT_VAL("wrong leader", evsel->leader == NULL);
+       TEST_ASSERT_VAL("wrong group name",
+               !strcmp(leader->group_name, "group2"));
+
+       /* group2 1:3 + G modifier */
+       evsel = perf_evsel__next(evsel);
+       TEST_ASSERT_VAL("wrong type", 1 == evsel->attr.type);
+       TEST_ASSERT_VAL("wrong config", 3 == evsel->attr.config);
+       TEST_ASSERT_VAL("wrong exclude_user", !evsel->attr.exclude_user);
+       TEST_ASSERT_VAL("wrong exclude_kernel", !evsel->attr.exclude_kernel);
+       TEST_ASSERT_VAL("wrong exclude_hv", !evsel->attr.exclude_hv);
+       TEST_ASSERT_VAL("wrong exclude guest", !evsel->attr.exclude_guest);
+       TEST_ASSERT_VAL("wrong exclude host", evsel->attr.exclude_host);
+       TEST_ASSERT_VAL("wrong precise_ip", !evsel->attr.precise_ip);
+       TEST_ASSERT_VAL("wrong leader", evsel->leader == leader);
+
+       /* instructions:u */
+       evsel = perf_evsel__next(evsel);
+       TEST_ASSERT_VAL("wrong type", PERF_TYPE_HARDWARE == evsel->attr.type);
+       TEST_ASSERT_VAL("wrong config",
+                       PERF_COUNT_HW_INSTRUCTIONS == evsel->attr.config);
+       TEST_ASSERT_VAL("wrong exclude_user", !evsel->attr.exclude_user);
+       TEST_ASSERT_VAL("wrong exclude_kernel", evsel->attr.exclude_kernel);
+       TEST_ASSERT_VAL("wrong exclude_hv", evsel->attr.exclude_hv);
+       TEST_ASSERT_VAL("wrong exclude guest", !evsel->attr.exclude_guest);
+       TEST_ASSERT_VAL("wrong exclude host", !evsel->attr.exclude_host);
+       TEST_ASSERT_VAL("wrong precise_ip", !evsel->attr.precise_ip);
+       TEST_ASSERT_VAL("wrong leader", evsel->leader == NULL);
+
+       return 0;
+}
+
+static int test__group4(struct perf_evlist *evlist __used)
+{
+       struct perf_evsel *evsel, *leader;
+
+       TEST_ASSERT_VAL("wrong number of entries", 2 == evlist->nr_entries);
+
+       /* cycles:u + p */
+       evsel = leader = perf_evlist__first(evlist);
+       TEST_ASSERT_VAL("wrong type", PERF_TYPE_HARDWARE == evsel->attr.type);
+       TEST_ASSERT_VAL("wrong config",
+                       PERF_COUNT_HW_CPU_CYCLES == evsel->attr.config);
+       TEST_ASSERT_VAL("wrong exclude_user", !evsel->attr.exclude_user);
+       TEST_ASSERT_VAL("wrong exclude_kernel", evsel->attr.exclude_kernel);
+       TEST_ASSERT_VAL("wrong exclude_hv", evsel->attr.exclude_hv);
+       TEST_ASSERT_VAL("wrong exclude guest", !evsel->attr.exclude_guest);
+       TEST_ASSERT_VAL("wrong exclude host", !evsel->attr.exclude_host);
+       TEST_ASSERT_VAL("wrong precise_ip", evsel->attr.precise_ip == 1);
+       TEST_ASSERT_VAL("wrong group name", !evsel->group_name);
+       TEST_ASSERT_VAL("wrong leader", evsel->leader == NULL);
+
+       /* instructions:kp + p */
+       evsel = perf_evsel__next(evsel);
+       TEST_ASSERT_VAL("wrong type", PERF_TYPE_HARDWARE == evsel->attr.type);
+       TEST_ASSERT_VAL("wrong config",
+                       PERF_COUNT_HW_INSTRUCTIONS == evsel->attr.config);
+       TEST_ASSERT_VAL("wrong exclude_user", evsel->attr.exclude_user);
+       TEST_ASSERT_VAL("wrong exclude_kernel", !evsel->attr.exclude_kernel);
+       TEST_ASSERT_VAL("wrong exclude_hv", evsel->attr.exclude_hv);
+       TEST_ASSERT_VAL("wrong exclude guest", !evsel->attr.exclude_guest);
+       TEST_ASSERT_VAL("wrong exclude host", !evsel->attr.exclude_host);
+       TEST_ASSERT_VAL("wrong precise_ip", evsel->attr.precise_ip == 2);
+       TEST_ASSERT_VAL("wrong leader", evsel->leader == leader);
+
+       return 0;
+}
+
+static int test__group5(struct perf_evlist *evlist __used)
+{
+       struct perf_evsel *evsel, *leader;
+
+       TEST_ASSERT_VAL("wrong number of entries", 5 == evlist->nr_entries);
+
+       /* cycles + G */
+       evsel = leader = perf_evlist__first(evlist);
+       TEST_ASSERT_VAL("wrong type", PERF_TYPE_HARDWARE == evsel->attr.type);
+       TEST_ASSERT_VAL("wrong config",
+                       PERF_COUNT_HW_CPU_CYCLES == evsel->attr.config);
+       TEST_ASSERT_VAL("wrong exclude_user", !evsel->attr.exclude_user);
+       TEST_ASSERT_VAL("wrong exclude_kernel", !evsel->attr.exclude_kernel);
+       TEST_ASSERT_VAL("wrong exclude_hv", !evsel->attr.exclude_hv);
+       TEST_ASSERT_VAL("wrong exclude guest", !evsel->attr.exclude_guest);
+       TEST_ASSERT_VAL("wrong exclude host", evsel->attr.exclude_host);
+       TEST_ASSERT_VAL("wrong precise_ip", !evsel->attr.precise_ip);
+       TEST_ASSERT_VAL("wrong group name", !evsel->group_name);
+       TEST_ASSERT_VAL("wrong leader", evsel->leader == NULL);
+
+       /* instructions + G */
+       evsel = perf_evsel__next(evsel);
+       TEST_ASSERT_VAL("wrong type", PERF_TYPE_HARDWARE == evsel->attr.type);
+       TEST_ASSERT_VAL("wrong config",
+                       PERF_COUNT_HW_INSTRUCTIONS == evsel->attr.config);
+       TEST_ASSERT_VAL("wrong exclude_user", !evsel->attr.exclude_user);
+       TEST_ASSERT_VAL("wrong exclude_kernel", !evsel->attr.exclude_kernel);
+       TEST_ASSERT_VAL("wrong exclude_hv", !evsel->attr.exclude_hv);
+       TEST_ASSERT_VAL("wrong exclude guest", !evsel->attr.exclude_guest);
+       TEST_ASSERT_VAL("wrong exclude host", evsel->attr.exclude_host);
+       TEST_ASSERT_VAL("wrong precise_ip", !evsel->attr.precise_ip);
+       TEST_ASSERT_VAL("wrong leader", evsel->leader == leader);
+
+       /* cycles:G */
+       evsel = leader = perf_evsel__next(evsel);
+       TEST_ASSERT_VAL("wrong type", PERF_TYPE_HARDWARE == evsel->attr.type);
+       TEST_ASSERT_VAL("wrong config",
+                       PERF_COUNT_HW_CPU_CYCLES == evsel->attr.config);
+       TEST_ASSERT_VAL("wrong exclude_user", !evsel->attr.exclude_user);
+       TEST_ASSERT_VAL("wrong exclude_kernel", !evsel->attr.exclude_kernel);
+       TEST_ASSERT_VAL("wrong exclude_hv", !evsel->attr.exclude_hv);
+       TEST_ASSERT_VAL("wrong exclude guest", !evsel->attr.exclude_guest);
+       TEST_ASSERT_VAL("wrong exclude host", evsel->attr.exclude_host);
+       TEST_ASSERT_VAL("wrong precise_ip", !evsel->attr.precise_ip);
+       TEST_ASSERT_VAL("wrong group name", !evsel->group_name);
+       TEST_ASSERT_VAL("wrong leader", evsel->leader == NULL);
+
+       /* instructions:G */
+       evsel = perf_evsel__next(evsel);
+       TEST_ASSERT_VAL("wrong type", PERF_TYPE_HARDWARE == evsel->attr.type);
+       TEST_ASSERT_VAL("wrong config",
+                       PERF_COUNT_HW_INSTRUCTIONS == evsel->attr.config);
+       TEST_ASSERT_VAL("wrong exclude_user", !evsel->attr.exclude_user);
+       TEST_ASSERT_VAL("wrong exclude_kernel", !evsel->attr.exclude_kernel);
+       TEST_ASSERT_VAL("wrong exclude_hv", !evsel->attr.exclude_hv);
+       TEST_ASSERT_VAL("wrong exclude guest", !evsel->attr.exclude_guest);
+       TEST_ASSERT_VAL("wrong exclude host", evsel->attr.exclude_host);
+       TEST_ASSERT_VAL("wrong precise_ip", !evsel->attr.precise_ip);
+       TEST_ASSERT_VAL("wrong leader", evsel->leader == leader);
+
+       /* cycles */
+       evsel = perf_evsel__next(evsel);
+       TEST_ASSERT_VAL("wrong type", PERF_TYPE_HARDWARE == evsel->attr.type);
+       TEST_ASSERT_VAL("wrong config",
+                       PERF_COUNT_HW_CPU_CYCLES == evsel->attr.config);
+       TEST_ASSERT_VAL("wrong exclude_user", !evsel->attr.exclude_user);
+       TEST_ASSERT_VAL("wrong exclude_kernel", !evsel->attr.exclude_kernel);
+       TEST_ASSERT_VAL("wrong exclude_hv", !evsel->attr.exclude_hv);
+       TEST_ASSERT_VAL("wrong exclude guest", evsel->attr.exclude_guest);
+       TEST_ASSERT_VAL("wrong exclude host", !evsel->attr.exclude_host);
+       TEST_ASSERT_VAL("wrong precise_ip", !evsel->attr.precise_ip);
+       TEST_ASSERT_VAL("wrong leader", evsel->leader == NULL);
+
+       return 0;
+}
+
 struct test__event_st {
        const char *name;
        __u32 type;
@@ -632,6 +879,26 @@ static struct test__event_st test__events[] = {
                .name  = "mem:0:rw:kp",
                .check = test__checkevent_breakpoint_rw_modifier,
        },
+       [28] = {
+               .name  = "{instructions:k,cycles:upp}",
+               .check = test__group1,
+       },
+       [29] = {
+               .name  = "{faults:k,cache-references}:u,cycles:k",
+               .check = test__group2,
+       },
+       [30] = {
+               .name  = "group1{syscalls:sys_enter_open:H,cycles:kppp},group2{cycles,1:3}:G,instructions:u",
+               .check = test__group3,
+       },
+       [31] = {
+               .name  = "{cycles:u,instructions:kp}:p",
+               .check = test__group4,
+       },
+       [32] = {
+               .name  = "{cycles,instructions}:G,{cycles:G,instructions:G},cycles",
+               .check = test__group5,
+       },
 };
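
Each checker above drives its assertions through the TEST_ASSERT_VAL helper defined earlier in this test file; a minimal sketch of that pattern (the exact pr_debug wording is an assumption):

        #define TEST_ASSERT_VAL(text, cond)                                  \
        do {                                                                 \
                if (!(cond)) {                                               \
                        pr_debug("FAILED %s:%d %s\n", __FILE__, __LINE__, text); \
                        return -1;                                           \
                }                                                            \
        } while (0)
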
 
 static struct test__event_st test__events_pmu[] = {
@@ -658,9 +925,6 @@ static struct test__term test__terms[] = {
        },
 };
 
-#define TEST__TERMS_CNT (sizeof(test__terms) / \
-                        sizeof(struct test__term))
-
 static int test_event(struct test__event_st *e)
 {
        struct perf_evlist *evlist;
@@ -685,19 +949,19 @@ static int test_event(struct test__event_st *e)
 
 static int test_events(struct test__event_st *events, unsigned cnt)
 {
-       int ret = 0;
+       int ret1, ret2 = 0;
        unsigned i;
 
        for (i = 0; i < cnt; i++) {
                struct test__event_st *e = &events[i];
 
                pr_debug("running test %d '%s'\n", i, e->name);
-               ret = test_event(e);
-               if (ret)
-                       break;
+               ret1 = test_event(e);
+               if (ret1)
+                       ret2 = ret1;
        }
 
-       return ret;
+       return ret2;
 }
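
The reworked bookkeeping above no longer aborts at the first failing entry: every test runs, and a failing status is carried back to the caller. The same pattern in isolation, with run_one_test as a hypothetical stand-in for test_event():

        int ret1, ret2 = 0;
        unsigned i;

        for (i = 0; i < cnt; i++) {
                ret1 = run_one_test(i);  /* hypothetical per-test runner */
                if (ret1)
                        ret2 = ret1;     /* remember the failure, keep going */
        }
        return ret2;
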
 
 static int test_term(struct test__term *t)
@@ -758,13 +1022,13 @@ static int test_pmu(void)
 
 int parse_events__test(void)
 {
-       int ret;
+       int ret1, ret2 = 0;
 
 #define TEST_EVENTS(tests)                             \
 do {                                                   \
-       ret = test_events(tests, ARRAY_SIZE(tests));    \
-       if (ret)                                        \
-               return ret;                             \
+       ret1 = test_events(tests, ARRAY_SIZE(tests));   \
+       if (!ret2)                                      \
+               ret2 = ret1;                            \
 } while (0)
 
        TEST_EVENTS(test__events);
@@ -772,5 +1036,9 @@ do {                                                       \
        if (test_pmu())
                TEST_EVENTS(test__events_pmu);
 
-       return test_terms(test__terms, ARRAY_SIZE(test__terms));
+       ret1 = test_terms(test__terms, ARRAY_SIZE(test__terms));
+       if (!ret2)
+               ret2 = ret1;
+
+       return ret2;
 }
index 74a5af4d33ec5fc4720639c7634d37cf94e54a92..b24630398b92b1ac7f863a73e3e0f2440769e6bc 100644 (file)
@@ -551,7 +551,7 @@ static int config_attr(struct perf_event_attr *attr,
 }
 
 int parse_events_add_numeric(struct list_head **list, int *idx,
-                            unsigned long type, unsigned long config,
+                            u32 type, u64 config,
                             struct list_head *head_config)
 {
        struct perf_event_attr attr;
@@ -611,26 +611,65 @@ int parse_events_add_pmu(struct list_head **list, int *idx,
                         pmu_event_name(head_config));
 }
 
+int parse_events__modifier_group(struct list_head *list,
+                                char *event_mod)
+{
+       return parse_events__modifier_event(list, event_mod, true);
+}
+
+void parse_events__set_leader(char *name, struct list_head *list)
+{
+       struct perf_evsel *leader;
+
+       __perf_evlist__set_leader(list);
+       leader = list_entry(list->next, struct perf_evsel, node);
+       leader->group_name = name ? strdup(name) : NULL;
+}
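
parse_events__set_leader() relies on __perf_evlist__set_leader() from the evlist code to wire up the leader pointers; a plausible sketch of that helper, consistent with the leader assertions in the tests above (members point at the first evsel, the leader itself keeps a NULL leader):

        static void sketch_set_leader(struct list_head *list)
        {
                struct perf_evsel *evsel, *leader;

                leader = list_entry(list->next, struct perf_evsel, node);
                list_for_each_entry(evsel, list, node) {
                        if (evsel != leader)
                                evsel->leader = leader;
                }
        }
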
+
 void parse_events_update_lists(struct list_head *list_event,
                               struct list_head *list_all)
 {
        /*
         * Called for single event definition. Update the
-        * 'all event' list, and reinit the 'signle event'
+        * 'all event' list, and reinit the 'single event'
         * list, for next event definition.
         */
        list_splice_tail(list_event, list_all);
        free(list_event);
 }
 
-int parse_events_modifier(struct list_head *list, char *str)
+struct event_modifier {
+       int eu;
+       int ek;
+       int eh;
+       int eH;
+       int eG;
+       int precise;
+       int exclude_GH;
+};
+
+static int get_event_modifier(struct event_modifier *mod, char *str,
+                              struct perf_evsel *evsel)
 {
-       struct perf_evsel *evsel;
-       int exclude = 0, exclude_GH = 0;
-       int eu = 0, ek = 0, eh = 0, eH = 0, eG = 0, precise = 0;
+       int eu = evsel ? evsel->attr.exclude_user : 0;
+       int ek = evsel ? evsel->attr.exclude_kernel : 0;
+       int eh = evsel ? evsel->attr.exclude_hv : 0;
+       int eH = evsel ? evsel->attr.exclude_host : 0;
+       int eG = evsel ? evsel->attr.exclude_guest : 0;
+       int precise = evsel ? evsel->attr.precise_ip : 0;
 
-       if (str == NULL)
-               return 0;
+       int exclude = eu | ek | eh;
+       int exclude_GH = evsel ? evsel->exclude_GH : 0;
+
+       /*
+        * We are here for a group where 'GH' was not set as an event
+        * modifier; whatever event/group modifier is given overrides
+        * the default 'GH' setup.
+        */
+       if (evsel && !exclude_GH)
+               eH = eG = 0;
+
+       memset(mod, 0, sizeof(*mod));
 
        while (*str) {
                if (*str == 'u') {
@@ -674,13 +713,51 @@ int parse_events_modifier(struct list_head *list, char *str)
        if (precise > 3)
                return -EINVAL;
 
+       mod->eu = eu;
+       mod->ek = ek;
+       mod->eh = eh;
+       mod->eH = eH;
+       mod->eG = eG;
+       mod->precise = precise;
+       mod->exclude_GH = exclude_GH;
+       return 0;
+}
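
Concretely, the modifier strings exercised by the tests above map onto struct event_modifier as follows (the values are pinned down by the test__group* assertions earlier in this patch):

        struct event_modifier mod;
        char mod_kppp[] = "kppp", mod_g[] = "G";

        get_event_modifier(&mod, mod_kppp, NULL);
        /* 'k' selects kernel only: mod.eu == 1, mod.ek == 0, mod.eh == 1;
         * three 'p's give mod.precise == 3 */

        get_event_modifier(&mod, mod_g, NULL);
        /* 'G' selects guest only: mod.eG == 0, mod.eH == 1, mod.exclude_GH == 1 */
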
+
+int parse_events__modifier_event(struct list_head *list, char *str, bool add)
+{
+       struct perf_evsel *evsel;
+       struct event_modifier mod;
+
+       if (str == NULL)
+               return 0;
+
+       if (!add && get_event_modifier(&mod, str, NULL))
+               return -EINVAL;
+
        list_for_each_entry(evsel, list, node) {
-               evsel->attr.exclude_user   = eu;
-               evsel->attr.exclude_kernel = ek;
-               evsel->attr.exclude_hv     = eh;
-               evsel->attr.precise_ip     = precise;
-               evsel->attr.exclude_host   = eH;
-               evsel->attr.exclude_guest  = eG;
+
+               if (add && get_event_modifier(&mod, str, evsel))
+                       return -EINVAL;
+
+               evsel->attr.exclude_user   = mod.eu;
+               evsel->attr.exclude_kernel = mod.ek;
+               evsel->attr.exclude_hv     = mod.eh;
+               evsel->attr.precise_ip     = mod.precise;
+               evsel->attr.exclude_host   = mod.eH;
+               evsel->attr.exclude_guest  = mod.eG;
+               evsel->exclude_GH          = mod.exclude_GH;
+       }
+
+       return 0;
+}
+
+int parse_events_name(struct list_head *list, char *name)
+{
+       struct perf_evsel *evsel;
+
+       list_for_each_entry(evsel, list, node) {
+               if (!evsel->name)
+                       evsel->name = strdup(name);
        }
 
        return 0;
@@ -769,7 +846,7 @@ int parse_filter(const struct option *opt, const char *str,
        struct perf_evsel *last = NULL;
 
        if (evlist->nr_entries > 0)
-               last = list_entry(evlist->entries.prev, struct perf_evsel, node);
+               last = perf_evlist__last(evlist);
 
        if (last == NULL || last->attr.type != PERF_TYPE_TRACEPOINT) {
                fprintf(stderr,
@@ -799,7 +876,8 @@ static const char * const event_type_descriptors[] = {
  * Print the events from <debugfs_mount_point>/tracing/events
  */
 
-void print_tracepoint_events(const char *subsys_glob, const char *event_glob)
+void print_tracepoint_events(const char *subsys_glob, const char *event_glob,
+                            bool name_only)
 {
        DIR *sys_dir, *evt_dir;
        struct dirent *sys_next, *evt_next, sys_dirent, evt_dirent;
@@ -829,6 +907,11 @@ void print_tracepoint_events(const char *subsys_glob, const char *event_glob)
                            !strglobmatch(evt_dirent.d_name, event_glob))
                                continue;
 
+                       if (name_only) {
+                               printf("%s:%s ", sys_dirent.d_name, evt_dirent.d_name);
+                               continue;
+                       }
+
                        snprintf(evt_path, MAXPATHLEN, "%s:%s",
                                 sys_dirent.d_name, evt_dirent.d_name);
                        printf("  %-50s [%s]\n", evt_path,
@@ -906,7 +989,7 @@ void print_events_type(u8 type)
                __print_events_type(type, event_symbols_hw, PERF_COUNT_HW_MAX);
 }
 
-int print_hwcache_events(const char *event_glob)
+int print_hwcache_events(const char *event_glob, bool name_only)
 {
        unsigned int type, op, i, printed = 0;
        char name[64];
@@ -923,8 +1006,11 @@ int print_hwcache_events(const char *event_glob)
                                if (event_glob != NULL && !strglobmatch(name, event_glob))
                                        continue;
 
-                               printf("  %-50s [%s]\n", name,
-                                       event_type_descriptors[PERF_TYPE_HW_CACHE]);
+                               if (name_only)
+                                       printf("%s ", name);
+                               else
+                                       printf("  %-50s [%s]\n", name,
+                                              event_type_descriptors[PERF_TYPE_HW_CACHE]);
                                ++printed;
                        }
                }
@@ -934,7 +1020,8 @@ int print_hwcache_events(const char *event_glob)
 }
 
 static void print_symbol_events(const char *event_glob, unsigned type,
-                               struct event_symbol *syms, unsigned max)
+                               struct event_symbol *syms, unsigned max,
+                               bool name_only)
 {
        unsigned i, printed = 0;
        char name[MAX_NAME_LEN];
@@ -946,6 +1033,11 @@ static void print_symbol_events(const char *event_glob, unsigned type,
                      (syms->alias && strglobmatch(syms->alias, event_glob))))
                        continue;
 
+               if (name_only) {
+                       printf("%s ", syms->symbol);
+                       continue;
+               }
+
                if (strlen(syms->alias))
                        snprintf(name, MAX_NAME_LEN, "%s OR %s", syms->symbol, syms->alias);
                else
@@ -963,39 +1055,42 @@ static void print_symbol_events(const char *event_glob, unsigned type,
 /*
  * Print the help text for the event symbols:
  */
-void print_events(const char *event_glob)
+void print_events(const char *event_glob, bool name_only)
 {
-
-       printf("\n");
-       printf("List of pre-defined events (to be used in -e):\n");
+       if (!name_only) {
+               printf("\n");
+               printf("List of pre-defined events (to be used in -e):\n");
+       }
 
        print_symbol_events(event_glob, PERF_TYPE_HARDWARE,
-                           event_symbols_hw, PERF_COUNT_HW_MAX);
+                           event_symbols_hw, PERF_COUNT_HW_MAX, name_only);
 
        print_symbol_events(event_glob, PERF_TYPE_SOFTWARE,
-                           event_symbols_sw, PERF_COUNT_SW_MAX);
+                           event_symbols_sw, PERF_COUNT_SW_MAX, name_only);
 
-       print_hwcache_events(event_glob);
+       print_hwcache_events(event_glob, name_only);
 
        if (event_glob != NULL)
                return;
 
-       printf("\n");
-       printf("  %-50s [%s]\n",
-              "rNNN",
-              event_type_descriptors[PERF_TYPE_RAW]);
-       printf("  %-50s [%s]\n",
-              "cpu/t1=v1[,t2=v2,t3 ...]/modifier",
-              event_type_descriptors[PERF_TYPE_RAW]);
-       printf("   (see 'perf list --help' on how to encode it)\n");
-       printf("\n");
-
-       printf("  %-50s [%s]\n",
-                       "mem:<addr>[:access]",
+       if (!name_only) {
+               printf("\n");
+               printf("  %-50s [%s]\n",
+                      "rNNN",
+                      event_type_descriptors[PERF_TYPE_RAW]);
+               printf("  %-50s [%s]\n",
+                      "cpu/t1=v1[,t2=v2,t3 ...]/modifier",
+                      event_type_descriptors[PERF_TYPE_RAW]);
+               printf("   (see 'perf list --help' on how to encode it)\n");
+               printf("\n");
+
+               printf("  %-50s [%s]\n",
+                      "mem:<addr>[:access]",
                        event_type_descriptors[PERF_TYPE_BREAKPOINT]);
-       printf("\n");
+               printf("\n");
+       }
 
-       print_tracepoint_events(NULL, NULL);
+       print_tracepoint_events(NULL, NULL, name_only);
 }
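
With name_only set, these printers emit bare, space-separated event names on one line instead of the formatted help listing, which suits machine consumption such as shell completion; a hypothetical caller:

        /* dump bare names, e.g. for a completion script */
        print_events(NULL, true);
        print_tracepoint_events(NULL, NULL, true);
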
 
 int parse_events__is_hardcoded_term(struct parse_events__term *term)
@@ -1005,7 +1100,7 @@ int parse_events__is_hardcoded_term(struct parse_events__term *term)
 
 static int new_term(struct parse_events__term **_term, int type_val,
                    int type_term, char *config,
-                   char *str, long num)
+                   char *str, u64 num)
 {
        struct parse_events__term *term;
 
@@ -1034,7 +1129,7 @@ static int new_term(struct parse_events__term **_term, int type_val,
 }
 
 int parse_events__term_num(struct parse_events__term **term,
-                          int type_term, char *config, long num)
+                          int type_term, char *config, u64 num)
 {
        return new_term(term, PARSE_EVENTS__TERM_TYPE_NUM, type_term,
                        config, NULL, num);
index ee9c218a193c8d8c8b20f893da976d262931e172..c356e443448dbd0f5521bc7cce1835a400e9dda7 100644 (file)
@@ -55,7 +55,7 @@ struct parse_events__term {
        char *config;
        union {
                char *str;
-               long  num;
+               u64  num;
        } val;
        int type_val;
        int type_term;
@@ -73,17 +73,19 @@ struct parse_events_data__terms {
 
 int parse_events__is_hardcoded_term(struct parse_events__term *term);
 int parse_events__term_num(struct parse_events__term **_term,
-                          int type_term, char *config, long num);
+                          int type_term, char *config, u64 num);
 int parse_events__term_str(struct parse_events__term **_term,
                           int type_term, char *config, char *str);
 int parse_events__term_clone(struct parse_events__term **new,
                             struct parse_events__term *term);
 void parse_events__free_terms(struct list_head *terms);
-int parse_events_modifier(struct list_head *list, char *str);
+int parse_events__modifier_event(struct list_head *list, char *str, bool add);
+int parse_events__modifier_group(struct list_head *list, char *event_mod);
+int parse_events_name(struct list_head *list, char *name);
 int parse_events_add_tracepoint(struct list_head **list, int *idx,
                                char *sys, char *event);
 int parse_events_add_numeric(struct list_head **list, int *idx,
-                            unsigned long type, unsigned long config,
+                            u32 type, u64 config,
                             struct list_head *head_config);
 int parse_events_add_cache(struct list_head **list, int *idx,
                           char *type, char *op_result1, char *op_result2);
@@ -91,15 +93,17 @@ int parse_events_add_breakpoint(struct list_head **list, int *idx,
                                void *ptr, char *type);
 int parse_events_add_pmu(struct list_head **list, int *idx,
                         char *pmu , struct list_head *head_config);
+void parse_events__set_leader(char *name, struct list_head *list);
 void parse_events_update_lists(struct list_head *list_event,
                               struct list_head *list_all);
 void parse_events_error(void *data, void *scanner, char const *msg);
 int parse_events__test(void);
 
-void print_events(const char *event_glob);
+void print_events(const char *event_glob, bool name_only);
 void print_events_type(u8 type);
-void print_tracepoint_events(const char *subsys_glob, const char *event_glob);
-int print_hwcache_events(const char *event_glob);
+void print_tracepoint_events(const char *subsys_glob, const char *event_glob,
+                            bool name_only);
+int print_hwcache_events(const char *event_glob, bool name_only);
 extern int is_valid_tracepoint(const char *event_string);
 
 extern int valid_debugfs_mount(const char *debugfs);
index 384ca74c6b22d442ad74b23d70dd3598a4e7208a..f5e28dc68270e82a6d9b374a2282d04d1acb1186 100644 (file)
@@ -15,10 +15,10 @@ YYSTYPE *parse_events_get_lval(yyscan_t yyscanner);
 
 static int __value(YYSTYPE *yylval, char *str, int base, int token)
 {
-       long num;
+       u64 num;
 
        errno = 0;
-       num = strtoul(str, NULL, base);
+       num = strtoull(str, NULL, base);
        if (errno)
                return PE_ERROR;
 
@@ -70,6 +70,12 @@ static int term(yyscan_t scanner, int type)
 %}
 
 %x mem
+%s config
+%x event
+
+group          [^,{}/]*[{][^}]*[}][^,{}/]*
+event_pmu      [^,{}/]+[/][^/]*[/][^,{}/]*
+event          [^,{}/]+
 
 num_dec                [0-9]+
 num_hex                0x[a-fA-F0-9]+
@@ -84,7 +90,13 @@ modifier_bp  [rwx]{1,3}
        {
                int start_token;
 
-               start_token = (int) parse_events_get_extra(yyscanner);
+               start_token = parse_events_get_extra(yyscanner);
+
+               if (start_token == PE_START_TERMS)
+                       BEGIN(config);
+               else if (start_token == PE_START_EVENTS)
+                       BEGIN(event);
+
                if (start_token) {
                        parse_events_set_extra(NULL, yyscanner);
                        return start_token;
@@ -92,6 +104,26 @@ modifier_bp [rwx]{1,3}
          }
 %}
 
+<event>{
+
+{group}                {
+                       BEGIN(INITIAL); yyless(0);
+               }
+
+{event_pmu}    |
+{event}                {
+                       str(yyscanner, PE_EVENT_NAME);
+                       BEGIN(INITIAL); yyless(0);
+                       return PE_EVENT_NAME;
+               }
+
+.              |
+<<EOF>>                {
+                       BEGIN(INITIAL); yyless(0);
+               }
+
+}
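
The new <event> start condition never consumes input for good: each pattern rewinds with yyless(0) and drops back to INITIAL, so the same text is re-lexed for its real tokens. A sketch of how "group1{cycles}" flows through (illustrative, not literal lexer output):

        /*
         * <event>: "group1{cycles}" matches {group} -> no token, rewind, INITIAL
         * INITIAL: PE_NAME("group1"), then '{' switches back to <event>
         * <event>: "cycles" matches {event} -> emit PE_EVENT_NAME, rewind, INITIAL
         * INITIAL: "cycles" lexes as the hardware symbol, then '}' closes the group
         */
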
+
 cpu-cycles|cycles                              { return sym(yyscanner, PERF_TYPE_HARDWARE, PERF_COUNT_HW_CPU_CYCLES); }
 stalled-cycles-frontend|idle-cycles-frontend   { return sym(yyscanner, PERF_TYPE_HARDWARE, PERF_COUNT_HW_STALLED_CYCLES_FRONTEND); }
 stalled-cycles-backend|idle-cycles-backend     { return sym(yyscanner, PERF_TYPE_HARDWARE, PERF_COUNT_HW_STALLED_CYCLES_BACKEND); }
@@ -127,18 +159,16 @@ speculative-read|speculative-load |
 refs|Reference|ops|access              |
 misses|miss                            { return str(yyscanner, PE_NAME_CACHE_OP_RESULT); }
 
-       /*
-        * These are event config hardcoded term names to be specified
-        * within xxx/.../ syntax. So far we dont clash with other names,
-        * so we can put them here directly. In case the we have a conflict
-        * in future, this needs to go into '//' condition block.
-        */
+<config>{
 config                 { return term(yyscanner, PARSE_EVENTS__TERM_TYPE_CONFIG); }
 config1                        { return term(yyscanner, PARSE_EVENTS__TERM_TYPE_CONFIG1); }
 config2                        { return term(yyscanner, PARSE_EVENTS__TERM_TYPE_CONFIG2); }
 name                   { return term(yyscanner, PARSE_EVENTS__TERM_TYPE_NAME); }
 period                 { return term(yyscanner, PARSE_EVENTS__TERM_TYPE_SAMPLE_PERIOD); }
 branch_type            { return term(yyscanner, PARSE_EVENTS__TERM_TYPE_BRANCH_SAMPLE_TYPE); }
+,                      { return ','; }
+"/"                    { BEGIN(INITIAL); return '/'; }
+}
 
 mem:                   { BEGIN(mem); return PE_PREFIX_MEM; }
 r{num_raw_hex}         { return raw(yyscanner); }
@@ -147,10 +177,12 @@ r{num_raw_hex}            { return raw(yyscanner); }
 
 {modifier_event}       { return str(yyscanner, PE_MODIFIER_EVENT); }
 {name}                 { return str(yyscanner, PE_NAME); }
-"/"                    { return '/'; }
+"/"                    { BEGIN(config); return '/'; }
 -                      { return '-'; }
-,                      { return ','; }
+,                      { BEGIN(event); return ','; }
 :                      { return ':'; }
+"{"                    { BEGIN(event); return '{'; }
+"}"                    { return '}'; }
 =                      { return '='; }
 \n                     { }
 
index 2bc5fbff2b5d2b101ac4b65cb9623c5d7ff24326..42d9a17b83b1c526a5582d642453f31d64f33a97 100644 (file)
@@ -27,10 +27,11 @@ do { \
 
 %token PE_START_EVENTS PE_START_TERMS
 %token PE_VALUE PE_VALUE_SYM_HW PE_VALUE_SYM_SW PE_RAW PE_TERM
+%token PE_EVENT_NAME
 %token PE_NAME
 %token PE_MODIFIER_EVENT PE_MODIFIER_BP
 %token PE_NAME_CACHE_TYPE PE_NAME_CACHE_OP_RESULT
-%token PE_PREFIX_MEM PE_PREFIX_RAW
+%token PE_PREFIX_MEM PE_PREFIX_RAW PE_PREFIX_GROUP
 %token PE_ERROR
 %type <num> PE_VALUE
 %type <num> PE_VALUE_SYM_HW
@@ -42,6 +43,7 @@ do { \
 %type <str> PE_NAME_CACHE_OP_RESULT
 %type <str> PE_MODIFIER_EVENT
 %type <str> PE_MODIFIER_BP
+%type <str> PE_EVENT_NAME
 %type <num> value_sym
 %type <head> event_config
 %type <term> event_term
@@ -53,44 +55,125 @@ do { \
 %type <head> event_legacy_numeric
 %type <head> event_legacy_raw
 %type <head> event_def
+%type <head> event_mod
+%type <head> event_name
+%type <head> event
+%type <head> events
+%type <head> group_def
+%type <head> group
+%type <head> groups
 
 %union
 {
        char *str;
-       unsigned long num;
+       u64 num;
        struct list_head *head;
        struct parse_events__term *term;
 }
 %%
 
 start:
-PE_START_EVENTS events
+PE_START_EVENTS start_events
 |
-PE_START_TERMS  terms
+PE_START_TERMS  start_terms
+
+start_events: groups
+{
+       struct parse_events_data__events *data = _data;
+
+       parse_events_update_lists($1, &data->list);
+}
+
+groups:
+groups ',' group
+{
+       struct list_head *list  = $1;
+       struct list_head *group = $3;
+
+       parse_events_update_lists(group, list);
+       $$ = list;
+}
+|
+groups ',' event
+{
+       struct list_head *list  = $1;
+       struct list_head *event = $3;
+
+       parse_events_update_lists(event, list);
+       $$ = list;
+}
+|
+group
+|
+event
+
+group:
+group_def ':' PE_MODIFIER_EVENT
+{
+       struct list_head *list = $1;
+
+       ABORT_ON(parse_events__modifier_group(list, $3));
+       $$ = list;
+}
+|
+group_def
+
+group_def:
+PE_NAME '{' events '}'
+{
+       struct list_head *list = $3;
+
+       parse_events__set_leader($1, list);
+       $$ = list;
+}
+|
+'{' events '}'
+{
+       struct list_head *list = $2;
+
+       parse_events__set_leader(NULL, list);
+       $$ = list;
+}
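
Reading the new rules together, a named group with a group-level modifier derives as follows (a sketch of the parse for "group1{cycles,instructions}:G"):

        /*
         * groups    -> group
         * group     -> group_def ':' PE_MODIFIER_EVENT("G")
         * group_def -> PE_NAME("group1") '{' events '}'
         * events    -> event("cycles") ',' event("instructions")
         *
         * group_def installs "group1" as the leader name via
         * parse_events__set_leader(); the ':G' then lands on every
         * member through parse_events__modifier_group().
         */
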
 
 events:
-events ',' event | event
+events ',' event
+{
+       struct list_head *event = $3;
+       struct list_head *list  = $1;
 
-event:
-event_def PE_MODIFIER_EVENT
+       parse_events_update_lists(event, list);
+       $$ = list;
+}
+|
+event
+
+event: event_mod
+
+event_mod:
+event_name PE_MODIFIER_EVENT
 {
-       struct parse_events_data__events *data = _data;
+       struct list_head *list = $1;
 
        /*
         * Apply modifier on all events added by single event definition
         * (there could be more events added for multiple tracepoint
         * definitions via '*?').
         */
-       ABORT_ON(parse_events_modifier($1, $2));
-       parse_events_update_lists($1, &data->list);
+       ABORT_ON(parse_events__modifier_event(list, $2, false));
+       $$ = list;
 }
 |
-event_def
-{
-       struct parse_events_data__events *data = _data;
+event_name
 
-       parse_events_update_lists($1, &data->list);
+event_name:
+PE_EVENT_NAME event_def
+{
+       ABORT_ON(parse_events_name($2, $1));
+       free($1);
+       $$ = $2;
 }
+|
+event_def
 
 event_def: event_pmu |
           event_legacy_symbol |
@@ -207,7 +290,7 @@ PE_VALUE ':' PE_VALUE
        struct parse_events_data__events *data = _data;
        struct list_head *list = NULL;
 
-       ABORT_ON(parse_events_add_numeric(&list, &data->idx, $1, $3, NULL));
+       ABORT_ON(parse_events_add_numeric(&list, &data->idx, (u32)$1, $3, NULL));
        $$ = list;
 }
 
@@ -222,7 +305,7 @@ PE_RAW
        $$ = list;
 }
 
-terms: event_config
+start_terms: event_config
 {
        struct parse_events_data__terms *data = _data;
        data->terms = $1;
@@ -282,7 +365,7 @@ PE_TERM '=' PE_NAME
 {
        struct parse_events__term *term;
 
-       ABORT_ON(parse_events__term_str(&term, $1, NULL, $3));
+       ABORT_ON(parse_events__term_str(&term, (int)$1, NULL, $3));
        $$ = term;
 }
 |
@@ -290,7 +373,7 @@ PE_TERM '=' PE_VALUE
 {
        struct parse_events__term *term;
 
-       ABORT_ON(parse_events__term_num(&term, $1, NULL, $3));
+       ABORT_ON(parse_events__term_num(&term, (int)$1, NULL, $3));
        $$ = term;
 }
 |
@@ -298,7 +381,7 @@ PE_TERM
 {
        struct parse_events__term *term;
 
-       ABORT_ON(parse_events__term_num(&term, $1, NULL, 1));
+       ABORT_ON(parse_events__term_num(&term, (int)$1, NULL, 1));
        $$ = term;
 }
 
diff --git a/tools/perf/util/perf_regs.h b/tools/perf/util/perf_regs.h
new file mode 100644 (file)
index 0000000..9bd6c4e
--- /dev/null
@@ -0,0 +1,14 @@
+#ifndef __PERF_REGS_H
+#define __PERF_REGS_H
+
+#ifndef NO_PERF_REGS
+#include <perf_regs.h>
+#else
+#define PERF_REGS_MASK 0
+
+static inline const char *perf_reg_name(int id __used)
+{
+       return NULL;
+}
+#endif /* NO_PERF_REGS */
+#endif /* __PERF_REGS_H */
index 67715a42cd6dc377e0056dff9590404734199d14..6631d828db3dc131dad22c02489bf47efe039070 100644 (file)
@@ -10,6 +10,8 @@
 #include "pmu.h"
 #include "parse-events.h"
 
+#define EVENT_SOURCE_DEVICE_PATH "/bus/event_source/devices/"
+
 int perf_pmu_parse(struct list_head *list, char *name);
 extern FILE *perf_pmu_in;
 
@@ -69,7 +71,7 @@ static int pmu_format(char *name, struct list_head *format)
                return -1;
 
        snprintf(path, PATH_MAX,
-                "%s/bus/event_source/devices/%s/format", sysfs, name);
+                "%s" EVENT_SOURCE_DEVICE_PATH "%s/format", sysfs, name);
 
        if (stat(path, &st) < 0)
                return 0;       /* no error if format does not exist */
@@ -206,7 +208,7 @@ static int pmu_type(char *name, __u32 *type)
                return -1;
 
        snprintf(path, PATH_MAX,
-                "%s/bus/event_source/devices/%s/type", sysfs, name);
+                "%s" EVENT_SOURCE_DEVICE_PATH "%s/type", sysfs, name);
 
        if (stat(path, &st) < 0)
                return -1;
@@ -222,6 +224,35 @@ static int pmu_type(char *name, __u32 *type)
        return ret;
 }
 
+/* Add all pmus in sysfs to pmu list: */
+static void pmu_read_sysfs(void)
+{
+       char path[PATH_MAX];
+       const char *sysfs;
+       DIR *dir;
+       struct dirent *dent;
+
+       sysfs = sysfs_find_mountpoint();
+       if (!sysfs)
+               return;
+
+       snprintf(path, PATH_MAX,
+                "%s" EVENT_SOURCE_DEVICE_PATH, sysfs);
+
+       dir = opendir(path);
+       if (!dir)
+               return;
+
+       while ((dent = readdir(dir))) {
+               if (!strcmp(dent->d_name, ".") || !strcmp(dent->d_name, ".."))
+                       continue;
+               /* add to static LIST_HEAD(pmus): */
+               perf_pmu__find(dent->d_name);
+       }
+
+       closedir(dir);
+}
+
 static struct perf_pmu *pmu_lookup(char *name)
 {
        struct perf_pmu *pmu;
@@ -267,6 +298,21 @@ static struct perf_pmu *pmu_find(char *name)
        return NULL;
 }
 
+struct perf_pmu *perf_pmu__scan(struct perf_pmu *pmu)
+{
+       /*
+        * PMU iterator: if pmu is NULL, start at the beginning;
+        * otherwise return the next PMU. Returns NULL at the end.
+        */
+       if (!pmu) {
+               pmu_read_sysfs();
+               pmu = list_prepare_entry(pmu, &pmus, list);
+       }
+       list_for_each_entry_continue(pmu, &pmus, list)
+               return pmu;
+       return NULL;
+}
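
perf_pmu__scan() is meant to be called in a loop, feeding the previous return value back in; a hypothetical walk over every PMU found in sysfs:

        struct perf_pmu *pmu = NULL;

        while ((pmu = perf_pmu__scan(pmu)) != NULL)
                printf("%s\n", pmu->name);
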
+
 struct perf_pmu *perf_pmu__find(char *name)
 {
        struct perf_pmu *pmu;
index 535f2c5258ab05a8110727704c98791c850dc588..47f68d3cc5d1e51354ae7f2df5f06cfa50372b2c 100644 (file)
@@ -46,5 +46,7 @@ int perf_pmu__new_format(struct list_head *list, char *name,
                         int config, unsigned long *bits);
 void perf_pmu__set_format(unsigned long *bits, long from, long to);
 
+struct perf_pmu *perf_pmu__scan(struct perf_pmu *pmu);
+
 int perf_pmu__test(void);
 #endif /* __PMU_H */
index 0688bfb6d280a3b8ae3b8c0e3a474f51b9a404c2..27187f0b71f092e3a406dec7a7ace05762612c3f 100644 (file)
@@ -627,7 +627,7 @@ static PyObject *pyrf_evsel__open(struct pyrf_evsel *pevsel,
         * This will group just the fds for this single evsel, to group
         * multiple events, use evlist.open().
         */
-       if (perf_evsel__open(evsel, cpus, threads, group, NULL) < 0) {
+       if (perf_evsel__open(evsel, cpus, threads) < 0) {
                PyErr_SetFromErrno(PyExc_OSError);
                return NULL;
        }
@@ -824,7 +824,10 @@ static PyObject *pyrf_evlist__open(struct pyrf_evlist *pevlist,
        if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|OOii", kwlist, &group))
                return NULL;
 
-       if (perf_evlist__open(evlist, group) < 0) {
+       if (group)
+               perf_evlist__set_leader(evlist);
+
+       if (perf_evlist__open(evlist) < 0) {
                PyErr_SetFromErrno(PyExc_OSError);
                return NULL;
        }
index 02dfa19a467fcde82abfb0e0802a26a5579c9890..d28001016fb59c99c46f0e516beb7da3bafea903 100644 (file)
@@ -237,16 +237,16 @@ static void define_event_symbols(struct event_format *event,
                define_event_symbols(event, ev_name, args->next);
 }
 
-static inline
-struct event_format *find_cache_event(struct pevent *pevent, int type)
+static inline struct event_format *find_cache_event(struct perf_evsel *evsel)
 {
        static char ev_name[256];
        struct event_format *event;
+       int type = evsel->attr.config;
 
        if (events[type])
                return events[type];
 
-       events[type] = event = pevent_find_event(pevent, type);
+       events[type] = event = evsel->tp_format;
        if (!event)
                return NULL;
 
@@ -258,22 +258,21 @@ struct event_format *find_cache_event(struct pevent *pevent, int type)
 }
 
 static void perl_process_tracepoint(union perf_event *perf_event __unused,
-                                   struct pevent *pevent,
                                    struct perf_sample *sample,
                                    struct perf_evsel *evsel,
                                    struct machine *machine __unused,
-                                   struct thread *thread)
+                                   struct addr_location *al)
 {
        struct format_field *field;
        static char handler[256];
        unsigned long long val;
        unsigned long s, ns;
        struct event_format *event;
-       int type;
        int pid;
        int cpu = sample->cpu;
        void *data = sample->raw_data;
        unsigned long long nsecs = sample->time;
+       struct thread *thread = al->thread;
        char *comm = thread->comm;
 
        dSP;
@@ -281,13 +280,11 @@ static void perl_process_tracepoint(union perf_event *perf_event __unused,
        if (evsel->attr.type != PERF_TYPE_TRACEPOINT)
                return;
 
-       type = trace_parse_common_type(pevent, data);
-
-       event = find_cache_event(pevent, type);
+       event = find_cache_event(evsel);
        if (!event)
-               die("ug! no event found for type %d", type);
+               die("ug! no event found for type %d", evsel->attr.config);
 
-       pid = trace_parse_common_pid(pevent, data);
+       pid = raw_field_value(event, "common_pid", data);
 
        sprintf(handler, "%s::%s", event->system, event->name);
 
@@ -320,7 +317,7 @@ static void perl_process_tracepoint(union perf_event *perf_event __unused,
                                offset = field->offset;
                        XPUSHs(sv_2mortal(newSVpv((char *)data + offset, 0)));
                } else { /* FIELD_IS_NUMERIC */
-                       val = read_size(pevent, data + field->offset,
+                       val = read_size(event, data + field->offset,
                                        field->size);
                        if (field->flags & FIELD_IS_SIGNED) {
                                XPUSHs(sv_2mortal(newSViv(val)));
@@ -349,11 +346,11 @@ static void perl_process_tracepoint(union perf_event *perf_event __unused,
        LEAVE;
 }
 
-static void perl_process_event_generic(union perf_event *pevent __unused,
+static void perl_process_event_generic(union perf_event *event,
                                       struct perf_sample *sample,
-                                      struct perf_evsel *evsel __unused,
+                                      struct perf_evsel *evsel,
                                       struct machine *machine __unused,
-                                      struct thread *thread __unused)
+                                      struct addr_location *al __unused)
 {
        dSP;
 
@@ -363,7 +360,7 @@ static void perl_process_event_generic(union perf_event *pevent __unused,
        ENTER;
        SAVETMPS;
        PUSHMARK(SP);
-       XPUSHs(sv_2mortal(newSVpvn((const char *)pevent, pevent->header.size)));
+       XPUSHs(sv_2mortal(newSVpvn((const char *)event, event->header.size)));
        XPUSHs(sv_2mortal(newSVpvn((const char *)&evsel->attr, sizeof(evsel->attr))));
        XPUSHs(sv_2mortal(newSVpvn((const char *)sample, sizeof(*sample))));
        XPUSHs(sv_2mortal(newSVpvn((const char *)sample->raw_data, sample->raw_size)));
@@ -376,14 +373,13 @@ static void perl_process_event_generic(union perf_event *pevent __unused,
 }
 
 static void perl_process_event(union perf_event *event,
-                              struct pevent *pevent,
                               struct perf_sample *sample,
                               struct perf_evsel *evsel,
                               struct machine *machine,
-                              struct thread *thread)
+                              struct addr_location *al)
 {
-       perl_process_tracepoint(event, pevent, sample, evsel, machine, thread);
-       perl_process_event_generic(event, sample, evsel, machine, thread);
+       perl_process_tracepoint(event, sample, evsel, machine, al);
+       perl_process_event_generic(event, sample, evsel, machine, al);
 }
 
 static void run_start_sub(void)
index ce4d1b0c38626d800defaf373b231ae9171f3904..afba097291838228b986069b49661b154af4fa8e 100644 (file)
 #include <errno.h>
 
 #include "../../perf.h"
+#include "../evsel.h"
 #include "../util.h"
 #include "../event.h"
 #include "../thread.h"
 #include "../trace-event.h"
 
 PyMODINIT_FUNC initperf_trace_context(void);
 
@@ -194,16 +196,21 @@ static void define_event_symbols(struct event_format *event,
                define_event_symbols(event, ev_name, args->next);
 }
 
-static inline
-struct event_format *find_cache_event(struct pevent *pevent, int type)
+static inline struct event_format *find_cache_event(struct perf_evsel *evsel)
 {
        static char ev_name[256];
        struct event_format *event;
+       int type = evsel->attr.config;
 
+       /*
+        * XXX: Do we really need to cache this, now that we already have
+        * evsel->tp_format cached? Need to re-read this "cache" routine,
+        * which also calls define_event_symbols() :-\
+        */
        if (events[type])
                return events[type];
 
-       events[type] = event = pevent_find_event(pevent, type);
+       events[type] = event = evsel->tp_format;
        if (!event)
                return NULL;
 
@@ -214,12 +221,11 @@ struct event_format *find_cache_event(struct pevent *pevent, int type)
        return event;
 }
 
-static void python_process_event(union perf_event *perf_event __unused,
-                                struct pevent *pevent,
+static void python_process_tracepoint(union perf_event *perf_event __unused,
                                 struct perf_sample *sample,
-                                struct perf_evsel *evsel __unused,
+                                struct perf_evsel *evsel,
                                 struct machine *machine __unused,
-                                struct thread *thread)
+                                struct addr_location *al)
 {
        PyObject *handler, *retval, *context, *t, *obj, *dict = NULL;
        static char handler_name[256];
@@ -228,24 +234,22 @@ static void python_process_event(union perf_event *perf_event __unused,
        unsigned long s, ns;
        struct event_format *event;
        unsigned n = 0;
-       int type;
        int pid;
        int cpu = sample->cpu;
        void *data = sample->raw_data;
        unsigned long long nsecs = sample->time;
+       struct thread *thread = al->thread;
        char *comm = thread->comm;
 
        t = PyTuple_New(MAX_FIELDS);
        if (!t)
                Py_FatalError("couldn't create Python tuple");
 
-       type = trace_parse_common_type(pevent, data);
-
-       event = find_cache_event(pevent, type);
+       event = find_cache_event(evsel);
        if (!event)
-               die("ug! no event found for type %d", type);
+               die("ug! no event found for type %d", (int)evsel->attr.config);
 
-       pid = trace_parse_common_pid(pevent, data);
+       pid = raw_field_value(event, "common_pid", data);
 
        sprintf(handler_name, "%s__%s", event->system, event->name);
 
@@ -290,7 +294,7 @@ static void python_process_event(union perf_event *perf_event __unused,
                                offset = field->offset;
                        obj = PyString_FromString((char *)data + offset);
                } else { /* FIELD_IS_NUMERIC */
-                       val = read_size(pevent, data + field->offset,
+                       val = read_size(event, data + field->offset,
                                        field->size);
                        if (field->flags & FIELD_IS_SIGNED) {
                                if ((long long)val >= LONG_MIN &&
@@ -335,6 +339,83 @@ static void python_process_event(union perf_event *perf_event __unused,
        Py_DECREF(t);
 }
 
+static void python_process_general_event(union perf_event *perf_event __unused,
+                                        struct perf_sample *sample,
+                                        struct perf_evsel *evsel,
+                                        struct machine *machine __unused,
+                                        struct addr_location *al)
+{
+       PyObject *handler, *retval, *t, *dict;
+       static char handler_name[64];
+       unsigned n = 0;
+       struct thread *thread = al->thread;
+
+       /*
+        * Use MAX_FIELDS to keep the function expandable, though
+        * currently the tuple holds only one item.
+        */
+       t = PyTuple_New(MAX_FIELDS);
+       if (!t)
+               Py_FatalError("couldn't create Python tuple");
+
+       dict = PyDict_New();
+       if (!dict)
+               Py_FatalError("couldn't create Python dictionary");
+
+       snprintf(handler_name, sizeof(handler_name), "%s", "process_event");
+
+       handler = PyDict_GetItemString(main_dict, handler_name);
+       if (!handler || !PyCallable_Check(handler))
+               goto exit;
+
+       PyDict_SetItemString(dict, "ev_name", PyString_FromString(perf_evsel__name(evsel)));
+       PyDict_SetItemString(dict, "attr", PyString_FromStringAndSize(
+                       (const char *)&evsel->attr, sizeof(evsel->attr)));
+       PyDict_SetItemString(dict, "sample", PyString_FromStringAndSize(
+                       (const char *)sample, sizeof(*sample)));
+       PyDict_SetItemString(dict, "raw_buf", PyString_FromStringAndSize(
+                       (const char *)sample->raw_data, sample->raw_size));
+       PyDict_SetItemString(dict, "comm",
+                       PyString_FromString(thread->comm));
+       if (al->map) {
+               PyDict_SetItemString(dict, "dso",
+                       PyString_FromString(al->map->dso->name));
+       }
+       if (al->sym) {
+               PyDict_SetItemString(dict, "symbol",
+                       PyString_FromString(al->sym->name));
+       }
+
+       PyTuple_SetItem(t, n++, dict);
+       if (_PyTuple_Resize(&t, n) == -1)
+               Py_FatalError("error resizing Python tuple");
+
+       retval = PyObject_CallObject(handler, t);
+       if (retval == NULL)
+               handler_call_die(handler_name);
+exit:
+       Py_DECREF(dict);
+       Py_DECREF(t);
+}
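
For script authors, the dictionary built above defines what a Python process_event(param_dict) handler receives; summarized as a comment (keys taken from the PyDict_SetItemString() calls):

        /*
         * param_dict keys: "ev_name", "attr" (raw perf_event_attr bytes),
         * "sample" (raw perf_sample bytes), "raw_buf", "comm", plus "dso"
         * and "symbol" when the sample resolved to a map and symbol.
         */
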
+
+static void python_process_event(union perf_event *perf_event,
+                                struct perf_sample *sample,
+                                struct perf_evsel *evsel,
+                                struct machine *machine,
+                                struct addr_location *al)
+{
+       switch (evsel->attr.type) {
+       case PERF_TYPE_TRACEPOINT:
+               python_process_tracepoint(perf_event, sample, evsel,
+                                         machine, al);
+               break;
+       /* Reserve for future process_hw/sw/raw APIs */
+       default:
+               python_process_general_event(perf_event, sample, evsel,
+                                            machine, al);
+       }
+}
+
 static int run_start_sub(void)
 {
        PyObject *handler, *retval;
index 2437fb0b463a21566ad9c8679861456e4f2b3f05..f7bb7ae328dae16dad78600dc70dd9bba3b4b0ac 100644 (file)
@@ -15,6 +15,8 @@
 #include "util.h"
 #include "cpumap.h"
 #include "event-parse.h"
+#include "perf_regs.h"
+#include "unwind.h"
 
 static int perf_session__open(struct perf_session *self, bool force)
 {
@@ -288,10 +290,11 @@ struct branch_info *machine__resolve_bstack(struct machine *self,
        return bi;
 }
 
-int machine__resolve_callchain(struct machine *self,
-                              struct thread *thread,
-                              struct ip_callchain *chain,
-                              struct symbol **parent)
+static int machine__resolve_callchain_sample(struct machine *machine,
+                                            struct thread *thread,
+                                            struct ip_callchain *chain,
+                                            struct symbol **parent)
 {
        u8 cpumode = PERF_RECORD_MISC_USER;
        unsigned int i;
@@ -316,11 +319,14 @@ int machine__resolve_callchain(struct machine *self,
                if (ip >= PERF_CONTEXT_MAX) {
                        switch (ip) {
                        case PERF_CONTEXT_HV:
-                               cpumode = PERF_RECORD_MISC_HYPERVISOR;  break;
+                               cpumode = PERF_RECORD_MISC_HYPERVISOR;
+                               break;
                        case PERF_CONTEXT_KERNEL:
-                               cpumode = PERF_RECORD_MISC_KERNEL;      break;
+                               cpumode = PERF_RECORD_MISC_KERNEL;
+                               break;
                        case PERF_CONTEXT_USER:
-                               cpumode = PERF_RECORD_MISC_USER;        break;
+                               cpumode = PERF_RECORD_MISC_USER;
+                               break;
                        default:
                                pr_debug("invalid callchain context: "
                                         "%"PRId64"\n", (s64) ip);
@@ -335,7 +341,7 @@ int machine__resolve_callchain(struct machine *self,
                }
 
                al.filtered = false;
-               thread__find_addr_location(thread, self, cpumode,
+               thread__find_addr_location(thread, machine, cpumode,
                                           MAP__FUNCTION, ip, &al, NULL);
                if (al.sym != NULL) {
                        if (sort__has_parent && !*parent &&
@@ -354,6 +360,40 @@ int machine__resolve_callchain(struct machine *self,
        return 0;
 }
 
+static int unwind_entry(struct unwind_entry *entry, void *arg)
+{
+       struct callchain_cursor *cursor = arg;
+       return callchain_cursor_append(cursor, entry->ip,
+                                      entry->map, entry->sym);
+}
+
+int machine__resolve_callchain(struct machine *machine,
+                              struct perf_evsel *evsel,
+                              struct thread *thread,
+                              struct perf_sample *sample,
+                              struct symbol **parent)
+{
+       int ret;
+
+       callchain_cursor_reset(&callchain_cursor);
+
+       ret = machine__resolve_callchain_sample(machine, thread,
+                                               sample->callchain, parent);
+       if (ret)
+               return ret;
+
+       /* Can we do dwarf post unwind? */
+       if (!((evsel->attr.sample_type & PERF_SAMPLE_REGS_USER) &&
+             (evsel->attr.sample_type & PERF_SAMPLE_STACK_USER)))
+               return 0;
+
+       return unwind__get_entries(unwind_entry, &callchain_cursor, machine,
+                                  thread, evsel->attr.sample_regs_user,
+                                  sample);
+}
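
The DWARF post-unwind pass only runs when the sample carries both the user registers and a user stack snapshot; since both are single sample_type bits, the guard above is equivalent to:

        u64 need = PERF_SAMPLE_REGS_USER | PERF_SAMPLE_STACK_USER;

        if ((evsel->attr.sample_type & need) != need)
                return 0;   /* keep only the plain callchain resolved above */
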
+
 static int process_event_synth_tracing_data_stub(union perf_event *event __used,
                                                 struct perf_session *session __used)
 {
@@ -860,6 +900,34 @@ static void branch_stack__printf(struct perf_sample *sample)
                        sample->branch_stack->entries[i].to);
 }
 
+static void regs_dump__printf(u64 mask, u64 *regs)
+{
+       unsigned rid, i = 0;
+
+       for_each_set_bit(rid, (unsigned long *) &mask, sizeof(mask) * 8) {
+               u64 val = regs[i++];
+
+               printf(".... %-5s 0x%" PRIx64 "\n",
+                      perf_reg_name(rid), val);
+       }
+}
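
Note the packing convention regs_dump__printf() decodes: register values are stored consecutively in bit order of the mask, not indexed by register id. For example:

        /*
         * mask = 0x5 (bits 0 and 2 set) prints
         *   .... <reg 0>  regs[0]
         *   .... <reg 2>  regs[1]
         */
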
+
+static void regs_user__printf(struct perf_sample *sample, u64 mask)
+{
+       struct regs_dump *user_regs = &sample->user_regs;
+
+       if (user_regs->regs) {
+               printf("... user regs: mask 0x%" PRIx64 "\n", mask);
+               regs_dump__printf(mask, user_regs->regs);
+       }
+}
+
+static void stack_user__printf(struct stack_dump *dump)
+{
+       printf("... ustack: size %" PRIu64 ", offset 0x%x\n",
+              dump->size, dump->offset);
+}
+
 static void perf_session__print_tstamp(struct perf_session *session,
                                       union perf_event *event,
                                       struct perf_sample *sample)
@@ -897,7 +965,7 @@ static void dump_event(struct perf_session *session, union perf_event *event,
               event->header.size, perf_event__name(event->header.type));
 }
 
-static void dump_sample(struct perf_session *session, union perf_event *event,
+static void dump_sample(struct perf_evsel *evsel, union perf_event *event,
                        struct perf_sample *sample)
 {
        u64 sample_type;
@@ -909,13 +977,19 @@ static void dump_sample(struct perf_session *session, union perf_event *event,
               event->header.misc, sample->pid, sample->tid, sample->ip,
               sample->period, sample->addr);
 
-       sample_type = perf_evlist__sample_type(session->evlist);
+       sample_type = evsel->attr.sample_type;
 
        if (sample_type & PERF_SAMPLE_CALLCHAIN)
                callchain__printf(sample);
 
        if (sample_type & PERF_SAMPLE_BRANCH_STACK)
                branch_stack__printf(sample);
+
+       if (sample_type & PERF_SAMPLE_REGS_USER)
+               regs_user__printf(sample, evsel->attr.sample_regs_user);
+
+       if (sample_type & PERF_SAMPLE_STACK_USER)
+               stack_user__printf(&sample->user_stack);
 }
 
 static struct machine *
@@ -973,7 +1047,7 @@ static int perf_session_deliver_event(struct perf_session *session,
 
        switch (event->header.type) {
        case PERF_RECORD_SAMPLE:
-               dump_sample(session, event, sample);
+               dump_sample(evsel, event, sample);
                if (evsel == NULL) {
                        ++session->hists.stats.nr_unknown_id;
                        return 0;
@@ -1498,9 +1572,9 @@ struct perf_evsel *perf_session__find_first_evtype(struct perf_session *session,
        return NULL;
 }
 
-void perf_event__print_ip(union perf_event *event, struct perf_sample *sample,
-                         struct machine *machine, int print_sym,
-                         int print_dso, int print_symoffset)
+void perf_evsel__print_ip(struct perf_evsel *evsel, union perf_event *event,
+                         struct perf_sample *sample, struct machine *machine,
+                         int print_sym, int print_dso, int print_symoffset)
 {
        struct addr_location al;
        struct callchain_cursor_node *node;
@@ -1514,8 +1588,9 @@ void perf_event__print_ip(union perf_event *event, struct perf_sample *sample,
 
        if (symbol_conf.use_callchain && sample->callchain) {
 
-               if (machine__resolve_callchain(machine, al.thread,
-                                               sample->callchain, NULL) != 0) {
+               if (machine__resolve_callchain(machine, evsel, al.thread,
+                                              sample, NULL) != 0) {
                        if (verbose)
                                error("Failed to resolve callchain. Skipping\n");
                        return;
index 1f7ec87db7d7369083895d596a979db460865d23..176a60902f569abe5156df13ea7c4a3a9ceff461 100644 (file)
@@ -129,9 +129,9 @@ size_t perf_session__fprintf_nr_events(struct perf_session *session, FILE *fp);
 struct perf_evsel *perf_session__find_first_evtype(struct perf_session *session,
                                            unsigned int type);
 
-void perf_event__print_ip(union perf_event *event, struct perf_sample *sample,
-                         struct machine *machine, int print_sym,
-                         int print_dso, int print_symoffset);
+void perf_evsel__print_ip(struct perf_evsel *evsel, union perf_event *event,
+                         struct perf_sample *sample, struct machine *machine,
+                         int print_sym, int print_dso, int print_symoffset);
 
 int perf_session__cpu_bitmap(struct perf_session *session,
                             const char *cpu_list, unsigned long *cpu_bitmap);
diff --git a/tools/perf/util/symbol-elf.c b/tools/perf/util/symbol-elf.c
new file mode 100644
index 0000000..db0cc92
--- /dev/null
+++ b/tools/perf/util/symbol-elf.c
@@ -0,0 +1,843 @@
+#include <libelf.h>
+#include <gelf.h>
+#include <elf.h>
+#include <fcntl.h>
+#include <stdio.h>
+#include <errno.h>
+#include <string.h>
+#include <unistd.h>
+#include <inttypes.h>
+
+#include "symbol.h"
+#include "debug.h"
+
+#ifndef NT_GNU_BUILD_ID
+#define NT_GNU_BUILD_ID 3
+#endif
+
+/**
+ * elf_symtab__for_each_symbol - iterate through all the symbols
+ *
+ * @syms: struct elf_symtab instance to iterate
+ * @nr_syms: number of symbols in @syms
+ * @idx: uint32_t index variable
+ * @sym: GElf_Sym iterator
+ */
+#define elf_symtab__for_each_symbol(syms, nr_syms, idx, sym) \
+       for (idx = 0, gelf_getsym(syms, idx, &sym);\
+            idx < nr_syms; \
+            idx++, gelf_getsym(syms, idx, &sym))
+
+static inline uint8_t elf_sym__type(const GElf_Sym *sym)
+{
+       return GELF_ST_TYPE(sym->st_info);
+}
+
+static inline int elf_sym__is_function(const GElf_Sym *sym)
+{
+       return elf_sym__type(sym) == STT_FUNC &&
+              sym->st_name != 0 &&
+              sym->st_shndx != SHN_UNDEF;
+}
+
+static inline bool elf_sym__is_object(const GElf_Sym *sym)
+{
+       return elf_sym__type(sym) == STT_OBJECT &&
+               sym->st_name != 0 &&
+               sym->st_shndx != SHN_UNDEF;
+}
+
+static inline int elf_sym__is_label(const GElf_Sym *sym)
+{
+       return elf_sym__type(sym) == STT_NOTYPE &&
+               sym->st_name != 0 &&
+               sym->st_shndx != SHN_UNDEF &&
+               sym->st_shndx != SHN_ABS;
+}
+
+static bool elf_sym__is_a(GElf_Sym *sym, enum map_type type)
+{
+       switch (type) {
+       case MAP__FUNCTION:
+               return elf_sym__is_function(sym);
+       case MAP__VARIABLE:
+               return elf_sym__is_object(sym);
+       default:
+               return false;
+       }
+}
+
+static inline const char *elf_sym__name(const GElf_Sym *sym,
+                                       const Elf_Data *symstrs)
+{
+       return symstrs->d_buf + sym->st_name;
+}
+
+static inline const char *elf_sec__name(const GElf_Shdr *shdr,
+                                       const Elf_Data *secstrs)
+{
+       return secstrs->d_buf + shdr->sh_name;
+}
+
+static inline int elf_sec__is_text(const GElf_Shdr *shdr,
+                                       const Elf_Data *secstrs)
+{
+       return strstr(elf_sec__name(shdr, secstrs), "text") != NULL;
+}
+
+static inline bool elf_sec__is_data(const GElf_Shdr *shdr,
+                                   const Elf_Data *secstrs)
+{
+       return strstr(elf_sec__name(shdr, secstrs), "data") != NULL;
+}
+
+static bool elf_sec__is_a(GElf_Shdr *shdr, Elf_Data *secstrs,
+                         enum map_type type)
+{
+       switch (type) {
+       case MAP__FUNCTION:
+               return elf_sec__is_text(shdr, secstrs);
+       case MAP__VARIABLE:
+               return elf_sec__is_data(shdr, secstrs);
+       default:
+               return false;
+       }
+}
+
+static size_t elf_addr_to_index(Elf *elf, GElf_Addr addr)
+{
+       Elf_Scn *sec = NULL;
+       GElf_Shdr shdr;
+       size_t cnt = 1;
+
+       while ((sec = elf_nextscn(elf, sec)) != NULL) {
+               gelf_getshdr(sec, &shdr);
+
+               if ((addr >= shdr.sh_addr) &&
+                   (addr < (shdr.sh_addr + shdr.sh_size)))
+                       return cnt;
+
+               ++cnt;
+       }
+
+       return -1;
+}
+
+static Elf_Scn *elf_section_by_name(Elf *elf, GElf_Ehdr *ep,
+                                   GElf_Shdr *shp, const char *name,
+                                   size_t *idx)
+{
+       Elf_Scn *sec = NULL;
+       size_t cnt = 1;
+
+       /* Elf is corrupted/truncated, avoid calling elf_strptr. */
+       if (!elf_rawdata(elf_getscn(elf, ep->e_shstrndx), NULL))
+               return NULL;
+
+       while ((sec = elf_nextscn(elf, sec)) != NULL) {
+               char *str;
+
+               gelf_getshdr(sec, shp);
+               str = elf_strptr(elf, ep->e_shstrndx, shp->sh_name);
+               if (!strcmp(name, str)) {
+                       if (idx)
+                               *idx = cnt;
+                       break;
+               }
+               ++cnt;
+       }
+
+       return sec;
+}
+
+#define elf_section__for_each_rel(reldata, pos, pos_mem, idx, nr_entries) \
+       for (idx = 0, pos = gelf_getrel(reldata, 0, &pos_mem); \
+            idx < nr_entries; \
+            ++idx, pos = gelf_getrel(reldata, idx, &pos_mem))
+
+#define elf_section__for_each_rela(reldata, pos, pos_mem, idx, nr_entries) \
+       for (idx = 0, pos = gelf_getrela(reldata, 0, &pos_mem); \
+            idx < nr_entries; \
+            ++idx, pos = gelf_getrela(reldata, idx, &pos_mem))
+
+/*
+ * We need to check if we have a .dynsym, so that we can handle the
+ * .plt, synthesizing its symbols, that aren't on the symtabs (be it
+ * .dynsym or .symtab).
+ * And always look at the original dso, not at debuginfo packages, that
+ * have the PLT data stripped out (shdr_rel_plt.sh_type == SHT_NOBITS).
+ */
+int dso__synthesize_plt_symbols(struct dso *dso, struct symsrc *ss, struct map *map,
+                               symbol_filter_t filter)
+{
+       uint32_t nr_rel_entries, idx;
+       GElf_Sym sym;
+       u64 plt_offset;
+       GElf_Shdr shdr_plt;
+       struct symbol *f;
+       GElf_Shdr shdr_rel_plt, shdr_dynsym;
+       Elf_Data *reldata, *syms, *symstrs;
+       Elf_Scn *scn_plt_rel, *scn_symstrs, *scn_dynsym;
+       size_t dynsym_idx;
+       GElf_Ehdr ehdr;
+       char sympltname[1024];
+       Elf *elf;
+       int nr = 0, symidx, err = 0;
+
+       if (!ss->dynsym)
+               return 0;
+
+       elf = ss->elf;
+       ehdr = ss->ehdr;
+
+       scn_dynsym = ss->dynsym;
+       shdr_dynsym = ss->dynshdr;
+       dynsym_idx = ss->dynsym_idx;
+
+       if (scn_dynsym == NULL)
+               goto out_elf_end;
+
+       scn_plt_rel = elf_section_by_name(elf, &ehdr, &shdr_rel_plt,
+                                         ".rela.plt", NULL);
+       if (scn_plt_rel == NULL) {
+               scn_plt_rel = elf_section_by_name(elf, &ehdr, &shdr_rel_plt,
+                                                 ".rel.plt", NULL);
+               if (scn_plt_rel == NULL)
+                       goto out_elf_end;
+       }
+
+       err = -1;
+
+       if (shdr_rel_plt.sh_link != dynsym_idx)
+               goto out_elf_end;
+
+       if (elf_section_by_name(elf, &ehdr, &shdr_plt, ".plt", NULL) == NULL)
+               goto out_elf_end;
+
+       /*
+        * Fetch the relocation section to find the idxes to the GOT
+        * and the symbols in the .dynsym they refer to.
+        */
+       reldata = elf_getdata(scn_plt_rel, NULL);
+       if (reldata == NULL)
+               goto out_elf_end;
+
+       syms = elf_getdata(scn_dynsym, NULL);
+       if (syms == NULL)
+               goto out_elf_end;
+
+       scn_symstrs = elf_getscn(elf, shdr_dynsym.sh_link);
+       if (scn_symstrs == NULL)
+               goto out_elf_end;
+
+       symstrs = elf_getdata(scn_symstrs, NULL);
+       if (symstrs == NULL)
+               goto out_elf_end;
+
+       if (symstrs->d_size == 0)
+               goto out_elf_end;
+
+       nr_rel_entries = shdr_rel_plt.sh_size / shdr_rel_plt.sh_entsize;
+       plt_offset = shdr_plt.sh_offset;
+
+       if (shdr_rel_plt.sh_type == SHT_RELA) {
+               GElf_Rela pos_mem, *pos;
+
+               elf_section__for_each_rela(reldata, pos, pos_mem, idx,
+                                          nr_rel_entries) {
+                       symidx = GELF_R_SYM(pos->r_info);
+                       plt_offset += shdr_plt.sh_entsize;
+                       gelf_getsym(syms, symidx, &sym);
+                       snprintf(sympltname, sizeof(sympltname),
+                                "%s@plt", elf_sym__name(&sym, symstrs));
+
+                       f = symbol__new(plt_offset, shdr_plt.sh_entsize,
+                                       STB_GLOBAL, sympltname);
+                       if (!f)
+                               goto out_elf_end;
+
+                       if (filter && filter(map, f))
+                               symbol__delete(f);
+                       else {
+                               symbols__insert(&dso->symbols[map->type], f);
+                               ++nr;
+                       }
+               }
+       } else if (shdr_rel_plt.sh_type == SHT_REL) {
+               GElf_Rel pos_mem, *pos;
+               elf_section__for_each_rel(reldata, pos, pos_mem, idx,
+                                         nr_rel_entries) {
+                       symidx = GELF_R_SYM(pos->r_info);
+                       plt_offset += shdr_plt.sh_entsize;
+                       gelf_getsym(syms, symidx, &sym);
+                       snprintf(sympltname, sizeof(sympltname),
+                                "%s@plt", elf_sym__name(&sym, symstrs));
+
+                       f = symbol__new(plt_offset, shdr_plt.sh_entsize,
+                                       STB_GLOBAL, sympltname);
+                       if (!f)
+                               goto out_elf_end;
+
+                       if (filter && filter(map, f))
+                               symbol__delete(f);
+                       else {
+                               symbols__insert(&dso->symbols[map->type], f);
+                               ++nr;
+                       }
+               }
+       }
+
+       err = 0;
+out_elf_end:
+       if (err == 0)
+               return nr;
+       pr_debug("%s: problems reading %s PLT info.\n",
+                __func__, dso->long_name);
+       return 0;
+}
+
+/*
+ * Align offset to 4 bytes as needed for note name and descriptor data.
+ */
+#define NOTE_ALIGN(n) (((n) + 3) & -4U)
+
+static int elf_read_build_id(Elf *elf, void *bf, size_t size)
+{
+       int err = -1;
+       GElf_Ehdr ehdr;
+       GElf_Shdr shdr;
+       Elf_Data *data;
+       Elf_Scn *sec;
+       Elf_Kind ek;
+       void *ptr;
+
+       if (size < BUILD_ID_SIZE)
+               goto out;
+
+       ek = elf_kind(elf);
+       if (ek != ELF_K_ELF)
+               goto out;
+
+       if (gelf_getehdr(elf, &ehdr) == NULL) {
+               pr_err("%s: cannot get elf header.\n", __func__);
+               goto out;
+       }
+
+       /*
+        * Check following sections for notes:
+        *   '.note.gnu.build-id'
+        *   '.notes'
+        *   '.note' (VDSO specific)
+        */
+       do {
+               sec = elf_section_by_name(elf, &ehdr, &shdr,
+                                         ".note.gnu.build-id", NULL);
+               if (sec)
+                       break;
+
+               sec = elf_section_by_name(elf, &ehdr, &shdr,
+                                         ".notes", NULL);
+               if (sec)
+                       break;
+
+               sec = elf_section_by_name(elf, &ehdr, &shdr,
+                                         ".note", NULL);
+               if (sec)
+                       break;
+
+               return err;
+
+       } while (0);
+
+       data = elf_getdata(sec, NULL);
+       if (data == NULL)
+               goto out;
+
+       ptr = data->d_buf;
+       while (ptr < (data->d_buf + data->d_size)) {
+               GElf_Nhdr *nhdr = ptr;
+               size_t namesz = NOTE_ALIGN(nhdr->n_namesz),
+                      descsz = NOTE_ALIGN(nhdr->n_descsz);
+               const char *name;
+
+               ptr += sizeof(*nhdr);
+               name = ptr;
+               ptr += namesz;
+               if (nhdr->n_type == NT_GNU_BUILD_ID &&
+                   nhdr->n_namesz == sizeof("GNU")) {
+                       if (memcmp(name, "GNU", sizeof("GNU")) == 0) {
+                               size_t sz = min(size, descsz);
+                               memcpy(bf, ptr, sz);
+                               memset(bf + sz, 0, size - sz);
+                               err = descsz;
+                               break;
+                       }
+               }
+               ptr += descsz;
+       }
+
+out:
+       return err;
+}
+
+int filename__read_build_id(const char *filename, void *bf, size_t size)
+{
+       int fd, err = -1;
+       Elf *elf;
+
+       if (size < BUILD_ID_SIZE)
+               goto out;
+
+       fd = open(filename, O_RDONLY);
+       if (fd < 0)
+               goto out;
+
+       elf = elf_begin(fd, PERF_ELF_C_READ_MMAP, NULL);
+       if (elf == NULL) {
+               pr_debug2("%s: cannot read %s ELF file.\n", __func__, filename);
+               goto out_close;
+       }
+
+       err = elf_read_build_id(elf, bf, size);
+
+       elf_end(elf);
+out_close:
+       close(fd);
+out:
+       return err;
+}
+
+int sysfs__read_build_id(const char *filename, void *build_id, size_t size)
+{
+       int fd, err = -1;
+
+       if (size < BUILD_ID_SIZE)
+               goto out;
+
+       fd = open(filename, O_RDONLY);
+       if (fd < 0)
+               goto out;
+
+       while (1) {
+               char bf[BUFSIZ];
+               GElf_Nhdr nhdr;
+               size_t namesz, descsz;
+
+               if (read(fd, &nhdr, sizeof(nhdr)) != sizeof(nhdr))
+                       break;
+
+               namesz = NOTE_ALIGN(nhdr.n_namesz);
+               descsz = NOTE_ALIGN(nhdr.n_descsz);
+               if (nhdr.n_type == NT_GNU_BUILD_ID &&
+                   nhdr.n_namesz == sizeof("GNU")) {
+                       if (read(fd, bf, namesz) != (ssize_t)namesz)
+                               break;
+                       if (memcmp(bf, "GNU", sizeof("GNU")) == 0) {
+                               size_t sz = min(descsz, size);
+                               if (read(fd, build_id, sz) == (ssize_t)sz) {
+                                       memset(build_id + sz, 0, size - sz);
+                                       err = 0;
+                                       break;
+                               }
+                       } else if (read(fd, bf, descsz) != (ssize_t)descsz)
+                               break;
+               } else {
+                       int n = namesz + descsz;
+                       if (read(fd, bf, n) != n)
+                               break;
+               }
+       }
+       close(fd);
+out:
+       return err;
+}
+
+int filename__read_debuglink(const char *filename, char *debuglink,
+                            size_t size)
+{
+       int fd, err = -1;
+       Elf *elf;
+       GElf_Ehdr ehdr;
+       GElf_Shdr shdr;
+       Elf_Data *data;
+       Elf_Scn *sec;
+       Elf_Kind ek;
+
+       fd = open(filename, O_RDONLY);
+       if (fd < 0)
+               goto out;
+
+       elf = elf_begin(fd, PERF_ELF_C_READ_MMAP, NULL);
+       if (elf == NULL) {
+               pr_debug2("%s: cannot read %s ELF file.\n", __func__, filename);
+               goto out_close;
+       }
+
+       ek = elf_kind(elf);
+       if (ek != ELF_K_ELF)
+               goto out_close;
+
+       if (gelf_getehdr(elf, &ehdr) == NULL) {
+               pr_err("%s: cannot get elf header.\n", __func__);
+               goto out_close;
+       }
+
+       sec = elf_section_by_name(elf, &ehdr, &shdr,
+                                 ".gnu_debuglink", NULL);
+       if (sec == NULL)
+               goto out_close;
+
+       data = elf_getdata(sec, NULL);
+       if (data == NULL)
+               goto out_close;
+
+       /* the start of this section is a zero-terminated string */
+       strncpy(debuglink, data->d_buf, size);
+       err = 0;
+
+       elf_end(elf);
+
+out_close:
+       close(fd);
+out:
+       return err;
+}
+
+static int dso__swap_init(struct dso *dso, unsigned char eidata)
+{
+       static unsigned int const endian = 1;
+
+       dso->needs_swap = DSO_SWAP__NO;
+
+       switch (eidata) {
+       case ELFDATA2LSB:
+               /* We are big endian, DSO is little endian. */
+               if (*(unsigned char const *)&endian != 1)
+                       dso->needs_swap = DSO_SWAP__YES;
+               break;
+
+       case ELFDATA2MSB:
+               /* We are little endian, DSO is big endian. */
+               if (*(unsigned char const *)&endian != 0)
+                       dso->needs_swap = DSO_SWAP__YES;
+               break;
+
+       default:
+               pr_err("unrecognized DSO data encoding %d\n", eidata);
+               return -EINVAL;
+       }
+
+       return 0;
+}
+
+bool symsrc__possibly_runtime(struct symsrc *ss)
+{
+       return ss->dynsym || ss->opdsec;
+}
+
+bool symsrc__has_symtab(struct symsrc *ss)
+{
+       return ss->symtab != NULL;
+}
+
+void symsrc__destroy(struct symsrc *ss)
+{
+       free(ss->name);
+       elf_end(ss->elf);
+       close(ss->fd);
+}
+
+int symsrc__init(struct symsrc *ss, struct dso *dso, const char *name,
+                enum dso_binary_type type)
+{
+       int err = -1;
+       GElf_Ehdr ehdr;
+       Elf *elf;
+       int fd;
+
+       fd = open(name, O_RDONLY);
+       if (fd < 0)
+               return -1;
+
+       elf = elf_begin(fd, PERF_ELF_C_READ_MMAP, NULL);
+       if (elf == NULL) {
+               pr_debug("%s: cannot read %s ELF file.\n", __func__, name);
+               goto out_close;
+       }
+
+       if (gelf_getehdr(elf, &ehdr) == NULL) {
+               pr_debug("%s: cannot get elf header.\n", __func__);
+               goto out_elf_end;
+       }
+
+       if (dso__swap_init(dso, ehdr.e_ident[EI_DATA]))
+               goto out_elf_end;
+
+       /* Always reject images with a mismatched build-id: */
+       if (dso->has_build_id) {
+               u8 build_id[BUILD_ID_SIZE];
+
+               if (elf_read_build_id(elf, build_id, BUILD_ID_SIZE) < 0)
+                       goto out_elf_end;
+
+               if (!dso__build_id_equal(dso, build_id))
+                       goto out_elf_end;
+       }
+
+       ss->symtab = elf_section_by_name(elf, &ehdr, &ss->symshdr, ".symtab",
+                       NULL);
+       if (ss->symshdr.sh_type != SHT_SYMTAB)
+               ss->symtab = NULL;
+
+       ss->dynsym_idx = 0;
+       ss->dynsym = elf_section_by_name(elf, &ehdr, &ss->dynshdr, ".dynsym",
+                       &ss->dynsym_idx);
+       if (ss->dynshdr.sh_type != SHT_DYNSYM)
+               ss->dynsym = NULL;
+
+       ss->opdidx = 0;
+       ss->opdsec = elf_section_by_name(elf, &ehdr, &ss->opdshdr, ".opd",
+                       &ss->opdidx);
+       if (ss->opdshdr.sh_type != SHT_PROGBITS)
+               ss->opdsec = NULL;
+
+       if (dso->kernel == DSO_TYPE_USER) {
+               GElf_Shdr shdr;
+               ss->adjust_symbols = (ehdr.e_type == ET_EXEC ||
+                               elf_section_by_name(elf, &ehdr, &shdr,
+                                                    ".gnu.prelink_undo",
+                                                    NULL) != NULL);
+       } else {
+               ss->adjust_symbols = 0;
+       }
+
+       ss->name   = strdup(name);
+       if (!ss->name)
+               goto out_elf_end;
+
+       ss->elf    = elf;
+       ss->fd     = fd;
+       ss->ehdr   = ehdr;
+       ss->type   = type;
+
+       return 0;
+
+out_elf_end:
+       elf_end(elf);
+out_close:
+       close(fd);
+       return err;
+}
+
+int dso__load_sym(struct dso *dso, struct map *map,
+                 struct symsrc *syms_ss, struct symsrc *runtime_ss,
+                 symbol_filter_t filter, int kmodule)
+{
+       struct kmap *kmap = dso->kernel ? map__kmap(map) : NULL;
+       struct map *curr_map = map;
+       struct dso *curr_dso = dso;
+       Elf_Data *symstrs, *secstrs;
+       uint32_t nr_syms;
+       int err = -1;
+       uint32_t idx;
+       GElf_Ehdr ehdr;
+       GElf_Shdr shdr;
+       Elf_Data *syms, *opddata = NULL;
+       GElf_Sym sym;
+       Elf_Scn *sec, *sec_strndx;
+       Elf *elf;
+       int nr = 0;
+
+       dso->symtab_type = syms_ss->type;
+
+       if (!syms_ss->symtab) {
+               syms_ss->symtab  = syms_ss->dynsym;
+               syms_ss->symshdr = syms_ss->dynshdr;
+       }
+
+       elf = syms_ss->elf;
+       ehdr = syms_ss->ehdr;
+       sec = syms_ss->symtab;
+       shdr = syms_ss->symshdr;
+
+       if (runtime_ss->opdsec)
+               opddata = elf_rawdata(runtime_ss->opdsec, NULL);
+
+       syms = elf_getdata(sec, NULL);
+       if (syms == NULL)
+               goto out_elf_end;
+
+       sec = elf_getscn(elf, shdr.sh_link);
+       if (sec == NULL)
+               goto out_elf_end;
+
+       symstrs = elf_getdata(sec, NULL);
+       if (symstrs == NULL)
+               goto out_elf_end;
+
+       sec_strndx = elf_getscn(elf, ehdr.e_shstrndx);
+       if (sec_strndx == NULL)
+               goto out_elf_end;
+
+       secstrs = elf_getdata(sec_strndx, NULL);
+       if (secstrs == NULL)
+               goto out_elf_end;
+
+       nr_syms = shdr.sh_size / shdr.sh_entsize;
+
+       memset(&sym, 0, sizeof(sym));
+       dso->adjust_symbols = runtime_ss->adjust_symbols;
+       elf_symtab__for_each_symbol(syms, nr_syms, idx, sym) {
+               struct symbol *f;
+               const char *elf_name = elf_sym__name(&sym, symstrs);
+               char *demangled = NULL;
+               int is_label = elf_sym__is_label(&sym);
+               const char *section_name;
+               bool used_opd = false;
+
+               if (kmap && kmap->ref_reloc_sym && kmap->ref_reloc_sym->name &&
+                   strcmp(elf_name, kmap->ref_reloc_sym->name) == 0)
+                       kmap->ref_reloc_sym->unrelocated_addr = sym.st_value;
+
+               if (!is_label && !elf_sym__is_a(&sym, map->type))
+                       continue;
+
+               /* Reject ARM ELF "mapping symbols": these aren't unique and
+                * don't identify functions, so will confuse the profile
+                * output: */
+               if (ehdr.e_machine == EM_ARM) {
+                       if (!strcmp(elf_name, "$a") ||
+                           !strcmp(elf_name, "$d") ||
+                           !strcmp(elf_name, "$t"))
+                               continue;
+               }
+
+               if (runtime_ss->opdsec && sym.st_shndx == runtime_ss->opdidx) {
+                       u32 offset = sym.st_value - runtime_ss->opdshdr.sh_addr;
+                       u64 *opd = opddata->d_buf + offset;
+                       sym.st_value = DSO__SWAP(dso, u64, *opd);
+                       sym.st_shndx = elf_addr_to_index(runtime_ss->elf,
+                                       sym.st_value);
+                       used_opd = true;
+               }
+
+               sec = elf_getscn(runtime_ss->elf, sym.st_shndx);
+               if (!sec)
+                       goto out_elf_end;
+
+               gelf_getshdr(sec, &shdr);
+
+               if (is_label && !elf_sec__is_a(&shdr, secstrs, map->type))
+                       continue;
+
+               section_name = elf_sec__name(&shdr, secstrs);
+
+               /* On ARM, symbols for thumb functions have 1 added to
+                * the symbol address as a flag - remove it */
+               if ((ehdr.e_machine == EM_ARM) &&
+                   (map->type == MAP__FUNCTION) &&
+                   (sym.st_value & 1))
+                       --sym.st_value;
+
+               if (dso->kernel != DSO_TYPE_USER || kmodule) {
+                       char dso_name[PATH_MAX];
+
+                       if (strcmp(section_name,
+                                  (curr_dso->short_name +
+                                   dso->short_name_len)) == 0)
+                               goto new_symbol;
+
+                       if (strcmp(section_name, ".text") == 0) {
+                               curr_map = map;
+                               curr_dso = dso;
+                               goto new_symbol;
+                       }
+
+                       snprintf(dso_name, sizeof(dso_name),
+                                "%s%s", dso->short_name, section_name);
+
+                       curr_map = map_groups__find_by_name(kmap->kmaps, map->type, dso_name);
+                       if (curr_map == NULL) {
+                               u64 start = sym.st_value;
+
+                               if (kmodule)
+                                       start += map->start + shdr.sh_offset;
+
+                               curr_dso = dso__new(dso_name);
+                               if (curr_dso == NULL)
+                                       goto out_elf_end;
+                               curr_dso->kernel = dso->kernel;
+                               curr_dso->long_name = dso->long_name;
+                               curr_dso->long_name_len = dso->long_name_len;
+                               curr_map = map__new2(start, curr_dso,
+                                                    map->type);
+                               if (curr_map == NULL) {
+                                       dso__delete(curr_dso);
+                                       goto out_elf_end;
+                               }
+                               curr_map->map_ip = identity__map_ip;
+                               curr_map->unmap_ip = identity__map_ip;
+                               curr_dso->symtab_type = dso->symtab_type;
+                               map_groups__insert(kmap->kmaps, curr_map);
+                               dsos__add(&dso->node, curr_dso);
+                               dso__set_loaded(curr_dso, map->type);
+                       } else
+                               curr_dso = curr_map->dso;
+
+                       goto new_symbol;
+               }
+
+               if ((used_opd && runtime_ss->adjust_symbols)
+                               || (!used_opd && syms_ss->adjust_symbols)) {
+                       pr_debug4("%s: adjusting symbol: st_value: %#" PRIx64 " "
+                                 "sh_addr: %#" PRIx64 " sh_offset: %#" PRIx64 "\n", __func__,
+                                 (u64)sym.st_value, (u64)shdr.sh_addr,
+                                 (u64)shdr.sh_offset);
+                       sym.st_value -= shdr.sh_addr - shdr.sh_offset;
+               }
+               /*
+                * We need to figure out if the object was created from C++
+                * sources. DWARF DW_compile_unit has this, but we don't
+                * always have access to it...
+                */
+               demangled = bfd_demangle(NULL, elf_name, DMGL_PARAMS | DMGL_ANSI);
+               if (demangled != NULL)
+                       elf_name = demangled;
+new_symbol:
+               f = symbol__new(sym.st_value, sym.st_size,
+                               GELF_ST_BIND(sym.st_info), elf_name);
+               free(demangled);
+               if (!f)
+                       goto out_elf_end;
+
+               if (filter && filter(curr_map, f))
+                       symbol__delete(f);
+               else {
+                       symbols__insert(&curr_dso->symbols[curr_map->type], f);
+                       nr++;
+               }
+       }
+
+       /*
+        * For misannotated, zeroed, ASM function sizes.
+        */
+       if (nr > 0) {
+               symbols__fixup_duplicate(&dso->symbols[map->type]);
+               symbols__fixup_end(&dso->symbols[map->type]);
+               if (kmap) {
+                       /*
+                        * We need to fixup this here too because we create new
+                        * maps here, for things like vsyscall sections.
+                        */
+                       __map_groups__fixup_end(kmap->kmaps, map->type);
+               }
+       }
+       err = nr;
+out_elf_end:
+       return err;
+}
+
+void symbol__elf_init(void)
+{
+       elf_version(EV_CURRENT);
+}
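
The symsrc functions above replace the old open-coded fd/Elf juggling: a caller initializes one symsrc per candidate image, hands a symbol source and a runtime source to dso__load_sym(), and tears them down afterwards. A minimal sketch, assuming a single image serves as both sources; the driver function is hypothetical, not from this patch:

	static int load_one(struct dso *dso, struct map *map, const char *path,
			    enum dso_binary_type type, symbol_filter_t filter)
	{
		struct symsrc ss;
		int nr;

		if (symsrc__init(&ss, dso, path, type) < 0)
			return -1;

		/* the same image acts as syms_ss and runtime_ss; not a kmodule */
		nr = dso__load_sym(dso, map, &ss, &ss, filter, 0);
		if (nr > 0)
			nr += dso__synthesize_plt_symbols(dso, &ss, map, filter);

		symsrc__destroy(&ss);
		return nr;
	}
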
diff --git a/tools/perf/util/symbol-minimal.c b/tools/perf/util/symbol-minimal.c
new file mode 100644
index 0000000..6738ea1
--- /dev/null
+++ b/tools/perf/util/symbol-minimal.c
@@ -0,0 +1,304 @@
+#include "symbol.h"
+
+#include <elf.h>
+#include <stdio.h>
+#include <fcntl.h>
+#include <string.h>
+#include <byteswap.h>
+#include <sys/stat.h>
+
+
+static bool check_need_swap(int file_endian)
+{
+       const int data = 1;
+       u8 *check = (u8 *)&data;
+       int host_endian;
+
+       if (check[0] == 1)
+               host_endian = ELFDATA2LSB;
+       else
+               host_endian = ELFDATA2MSB;
+
+       return host_endian != file_endian;
+}
+
+#define NOTE_ALIGN(sz) (((sz) + 3) & ~3)
+
+#define NT_GNU_BUILD_ID        3
+
+static int read_build_id(void *note_data, size_t note_len, void *bf,
+                        size_t size, bool need_swap)
+{
+       struct {
+               u32 n_namesz;
+               u32 n_descsz;
+               u32 n_type;
+       } *nhdr;
+       void *ptr;
+
+       ptr = note_data;
+       while (ptr < (note_data + note_len)) {
+               const char *name;
+               size_t namesz, descsz;
+
+               nhdr = ptr;
+               if (need_swap) {
+                       nhdr->n_namesz = bswap_32(nhdr->n_namesz);
+                       nhdr->n_descsz = bswap_32(nhdr->n_descsz);
+                       nhdr->n_type = bswap_32(nhdr->n_type);
+               }
+
+               namesz = NOTE_ALIGN(nhdr->n_namesz);
+               descsz = NOTE_ALIGN(nhdr->n_descsz);
+
+               ptr += sizeof(*nhdr);
+               name = ptr;
+               ptr += namesz;
+               if (nhdr->n_type == NT_GNU_BUILD_ID &&
+                   nhdr->n_namesz == sizeof("GNU")) {
+                       if (memcmp(name, "GNU", sizeof("GNU")) == 0) {
+                               size_t sz = min(size, descsz);
+                               memcpy(bf, ptr, sz);
+                               memset(bf + sz, 0, size - sz);
+                               return 0;
+                       }
+               }
+               ptr += descsz;
+       }
+
+       return -1;
+}
+
+int filename__read_debuglink(const char *filename __used,
+                            char *debuglink __used, size_t size __used)
+{
+       return -1;
+}
+
+/*
+ * Just try the PT_NOTE headers, otherwise fail.
+ */
+int filename__read_build_id(const char *filename, void *bf, size_t size)
+{
+       FILE *fp;
+       int ret = -1;
+       bool need_swap = false;
+       u8 e_ident[EI_NIDENT];
+       size_t buf_size;
+       void *buf;
+       int i;
+
+       fp = fopen(filename, "r");
+       if (fp == NULL)
+               return -1;
+
+       if (fread(e_ident, sizeof(e_ident), 1, fp) != 1)
+               goto out;
+
+       if (memcmp(e_ident, ELFMAG, SELFMAG) ||
+           e_ident[EI_VERSION] != EV_CURRENT)
+               goto out;
+
+       need_swap = check_need_swap(e_ident[EI_DATA]);
+
+       /* for simplicity */
+       fseek(fp, 0, SEEK_SET);
+
+       if (e_ident[EI_CLASS] == ELFCLASS32) {
+               Elf32_Ehdr ehdr;
+               Elf32_Phdr *phdr;
+
+               if (fread(&ehdr, sizeof(ehdr), 1, fp) != 1)
+                       goto out;
+
+               if (need_swap) {
+                       ehdr.e_phoff = bswap_32(ehdr.e_phoff);
+                       ehdr.e_phentsize = bswap_16(ehdr.e_phentsize);
+                       ehdr.e_phnum = bswap_16(ehdr.e_phnum);
+               }
+
+               buf_size = ehdr.e_phentsize * ehdr.e_phnum;
+               buf = malloc(buf_size);
+               if (buf == NULL)
+                       goto out;
+
+               fseek(fp, ehdr.e_phoff, SEEK_SET);
+               if (fread(buf, buf_size, 1, fp) != 1)
+                       goto out_free;
+
+               for (i = 0, phdr = buf; i < ehdr.e_phnum; i++, phdr++) {
+                       void *tmp;
+
+                       if (need_swap) {
+                               phdr->p_type = bswap_32(phdr->p_type);
+                               phdr->p_offset = bswap_32(phdr->p_offset);
+                               phdr->p_filesz = bswap_32(phdr->p_filesz);
+                       }
+
+                       if (phdr->p_type != PT_NOTE)
+                               continue;
+
+                       buf_size = phdr->p_filesz;
+                       tmp = realloc(buf, buf_size);
+                       if (tmp == NULL)
+                               goto out_free;
+
+                       buf = tmp;
+                       fseek(fp, phdr->p_offset, SEEK_SET);
+                       if (fread(buf, buf_size, 1, fp) != 1)
+                               goto out_free;
+
+                       ret = read_build_id(buf, buf_size, bf, size, need_swap);
+                       if (ret == 0)
+                               ret = size;
+                       break;
+               }
+       } else {
+               Elf64_Ehdr ehdr;
+               Elf64_Phdr *phdr;
+
+               if (fread(&ehdr, sizeof(ehdr), 1, fp) != 1)
+                       goto out;
+
+               if (need_swap) {
+                       ehdr.e_phoff = bswap_64(ehdr.e_phoff);
+                       ehdr.e_phentsize = bswap_16(ehdr.e_phentsize);
+                       ehdr.e_phnum = bswap_16(ehdr.e_phnum);
+               }
+
+               buf_size = ehdr.e_phentsize * ehdr.e_phnum;
+               buf = malloc(buf_size);
+               if (buf == NULL)
+                       goto out;
+
+               fseek(fp, ehdr.e_phoff, SEEK_SET);
+               if (fread(buf, buf_size, 1, fp) != 1)
+                       goto out_free;
+
+               for (i = 0, phdr = buf; i < ehdr.e_phnum; i++, phdr++) {
+                       void *tmp;
+
+                       if (need_swap) {
+                               phdr->p_type = bswap_32(phdr->p_type);
+                               phdr->p_offset = bswap_64(phdr->p_offset);
+                               phdr->p_filesz = bswap_64(phdr->p_filesz);
+                       }
+
+                       if (phdr->p_type != PT_NOTE)
+                               continue;
+
+                       buf_size = phdr->p_filesz;
+                       tmp = realloc(buf, buf_size);
+                       if (tmp == NULL)
+                               goto out_free;
+
+                       buf = tmp;
+                       fseek(fp, phdr->p_offset, SEEK_SET);
+                       if (fread(buf, buf_size, 1, fp) != 1)
+                               goto out_free;
+
+                       ret = read_build_id(buf, buf_size, bf, size, need_swap);
+                       if (ret == 0)
+                               ret = size;
+                       break;
+               }
+       }
+out_free:
+       free(buf);
+out:
+       fclose(fp);
+       return ret;
+}
+
+int sysfs__read_build_id(const char *filename, void *build_id, size_t size)
+{
+       int fd;
+       int ret = -1;
+       struct stat stbuf;
+       size_t buf_size;
+       void *buf;
+
+       fd = open(filename, O_RDONLY);
+       if (fd < 0)
+               return -1;
+
+       if (fstat(fd, &stbuf) < 0)
+               goto out;
+
+       buf_size = stbuf.st_size;
+       buf = malloc(buf_size);
+       if (buf == NULL)
+               goto out;
+
+       if (read(fd, buf, buf_size) != (ssize_t) buf_size)
+               goto out_free;
+
+       ret = read_build_id(buf, buf_size, build_id, size, false);
+out_free:
+       free(buf);
+out:
+       close(fd);
+       return ret;
+}
+
+int symsrc__init(struct symsrc *ss, struct dso *dso __used, const char *name,
+                enum dso_binary_type type)
+{
+       int fd = open(name, O_RDONLY);
+       if (fd < 0)
+               return -1;
+
+       ss->name = strdup(name);
+       if (!ss->name)
+               goto out_close;
+
+       ss->fd = fd;
+       ss->type = type;
+
+       return 0;
+out_close:
+       close(fd);
+       return -1;
+}
+
+bool symsrc__possibly_runtime(struct symsrc *ss __used)
+{
+       /* Assume all sym sources could be a runtime image. */
+       return true;
+}
+
+bool symsrc__has_symtab(struct symsrc *ss __used)
+{
+       return false;
+}
+
+void symsrc__destroy(struct symsrc *ss)
+{
+       free(ss->name);
+       close(ss->fd);
+}
+
+int dso__synthesize_plt_symbols(struct dso *dso __used,
+                               struct symsrc *ss __used,
+                               struct map *map __used,
+                               symbol_filter_t filter __used)
+{
+       return 0;
+}
+
+int dso__load_sym(struct dso *dso, struct map *map __used, struct symsrc *ss,
+                 struct symsrc *runtime_ss __used,
+                 symbol_filter_t filter __used, int kmodule __used)
+{
+       u8 build_id[BUILD_ID_SIZE];
+
+       if (filename__read_build_id(ss->name, build_id, BUILD_ID_SIZE) > 0) {
+               dso__set_build_id(dso, build_id);
+               return 1;
+       }
+       return 0;
+}
+
+void symbol__elf_init(void)
+{
+}
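
Both readers round the note name and descriptor sizes up to 4 bytes, as the ELF note format requires. A worked example of that layout arithmetic for a GNU build-id note, assuming the usual 20-byte SHA-1 descriptor:

	/*
	 * n_namesz = 4  ("GNU" including NUL)  ->  NOTE_ALIGN(4)  = 4
	 * n_descsz = 20 (SHA-1 build-id)       ->  NOTE_ALIGN(20) = 20
	 *
	 * total note size = 3 * sizeof(u32) header + 4 + 20 = 36 bytes;
	 * a 5-byte name, by contrast, would pad out to NOTE_ALIGN(5) = 8.
	 */
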
diff --git a/tools/perf/util/symbol.c b/tools/perf/util/symbol.c
index 8b63b678e127ba5253477db6c65355e28ae49e66..753699a20bc857cdf5a677d9b40f353c85a00239 100644
@@ -15,8 +15,6 @@
 #include "symbol.h"
 #include "strlist.h"
 
-#include <libelf.h>
-#include <gelf.h>
 #include <elf.h>
 #include <limits.h>
 #include <sys/utsname.h>
 #define KSYM_NAME_LEN 256
 #endif
 
-#ifndef NT_GNU_BUILD_ID
-#define NT_GNU_BUILD_ID 3
-#endif
-
 static void dso_cache__free(struct rb_root *root);
-static bool dso__build_id_equal(const struct dso *dso, u8 *build_id);
-static int elf_read_build_id(Elf *elf, void *bf, size_t size);
-static void dsos__add(struct list_head *head, struct dso *dso);
-static struct map *map__new2(u64 start, struct dso *dso, enum map_type type);
 static int dso__load_kernel_sym(struct dso *dso, struct map *map,
                                symbol_filter_t filter);
 static int dso__load_guest_kernel_sym(struct dso *dso, struct map *map,
@@ -170,7 +160,7 @@ static int choose_best_symbol(struct symbol *syma, struct symbol *symb)
                return SYMBOL_B;
 }
 
-static void symbols__fixup_duplicate(struct rb_root *symbols)
+void symbols__fixup_duplicate(struct rb_root *symbols)
 {
        struct rb_node *nd;
        struct symbol *curr, *next;
@@ -199,7 +189,7 @@ again:
        }
 }
 
-static void symbols__fixup_end(struct rb_root *symbols)
+void symbols__fixup_end(struct rb_root *symbols)
 {
        struct rb_node *nd, *prevnd = rb_first(symbols);
        struct symbol *curr, *prev;
@@ -222,7 +212,7 @@ static void symbols__fixup_end(struct rb_root *symbols)
                curr->end = roundup(curr->start, 4096);
 }
 
-static void __map_groups__fixup_end(struct map_groups *mg, enum map_type type)
+void __map_groups__fixup_end(struct map_groups *mg, enum map_type type)
 {
        struct map *prev, *curr;
        struct rb_node *nd, *prevnd = rb_first(&mg->maps[type]);
@@ -252,8 +242,7 @@ static void map_groups__fixup_end(struct map_groups *mg)
                __map_groups__fixup_end(mg, i);
 }
 
-static struct symbol *symbol__new(u64 start, u64 len, u8 binding,
-                                 const char *name)
+struct symbol *symbol__new(u64 start, u64 len, u8 binding, const char *name)
 {
        size_t namelen = strlen(name) + 1;
        struct symbol *sym = calloc(1, (symbol_conf.priv_size +
@@ -390,7 +379,7 @@ void dso__set_build_id(struct dso *dso, void *build_id)
        dso->has_build_id = 1;
 }
 
-static void symbols__insert(struct rb_root *symbols, struct symbol *sym)
+void symbols__insert(struct rb_root *symbols, struct symbol *sym)
 {
        struct rb_node **p = &symbols->rb_node;
        struct rb_node *parent = NULL;
@@ -574,7 +563,7 @@ size_t dso__fprintf(struct dso *dso, enum map_type type, FILE *fp)
 
 int kallsyms__parse(const char *filename, void *arg,
                    int (*process_symbol)(void *arg, const char *name,
-                                         char type, u64 start, u64 end))
+                                         char type, u64 start))
 {
        char *line = NULL;
        size_t n;
@@ -614,13 +603,8 @@ int kallsyms__parse(const char *filename, void *arg,
                        break;
                }
 
-               /*
-                * module symbols are not sorted so we add all
-                * symbols with zero length and rely on
-                * symbols__fixup_end() to fix it up.
-                */
                err = process_symbol(arg, symbol_name,
-                                    symbol_type, start, start);
+                                    symbol_type, start);
                if (err)
                        break;
        }
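
With the end address dropped from the callback, parsers insert zero-length symbols and let symbols__fixup_end() close them afterwards (see the kallsyms consumer below). A minimal sketch of a callback under the new signature, using the tree's u64 type and __used annotation; the printer is hypothetical, not from this patch:

	#include <stdio.h>
	#include <inttypes.h>

	static int print_symbol(void *arg __used, const char *name,
				char type, u64 start)
	{
		printf("%016" PRIx64 " %c %s\n", start, type, name);
		return 0;
	}

	/* usage: kallsyms__parse("/proc/kallsyms", NULL, print_symbol); */
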
@@ -647,7 +631,7 @@ static u8 kallsyms2elf_type(char type)
 }
 
 static int map__process_kallsym_symbol(void *arg, const char *name,
-                                      char type, u64 start, u64 end)
+                                      char type, u64 start)
 {
        struct symbol *sym;
        struct process_kallsyms_args *a = arg;
@@ -656,8 +640,12 @@ static int map__process_kallsym_symbol(void *arg, const char *name,
        if (!symbol_type__is_a(type, a->map->type))
                return 0;
 
-       sym = symbol__new(start, end - start + 1,
-                         kallsyms2elf_type(type), name);
+       /*
+        * module symbols are not sorted so we add all
+        * symbols, setting length to 0, and rely on
+        * symbols__fixup_end() to fix it up.
+        */
+       sym = symbol__new(start, 0, kallsyms2elf_type(type), name);
        if (sym == NULL)
                return -ENOMEM;
        /*
@@ -904,556 +892,7 @@ out_failure:
        return -1;
 }
 
-/**
- * elf_symtab__for_each_symbol - iterate thru all the symbols
- *
- * @syms: struct elf_symtab instance to iterate
- * @idx: uint32_t idx
- * @sym: GElf_Sym iterator
- */
-#define elf_symtab__for_each_symbol(syms, nr_syms, idx, sym) \
-       for (idx = 0, gelf_getsym(syms, idx, &sym);\
-            idx < nr_syms; \
-            idx++, gelf_getsym(syms, idx, &sym))
-
-static inline uint8_t elf_sym__type(const GElf_Sym *sym)
-{
-       return GELF_ST_TYPE(sym->st_info);
-}
-
-static inline int elf_sym__is_function(const GElf_Sym *sym)
-{
-       return elf_sym__type(sym) == STT_FUNC &&
-              sym->st_name != 0 &&
-              sym->st_shndx != SHN_UNDEF;
-}
-
-static inline bool elf_sym__is_object(const GElf_Sym *sym)
-{
-       return elf_sym__type(sym) == STT_OBJECT &&
-               sym->st_name != 0 &&
-               sym->st_shndx != SHN_UNDEF;
-}
-
-static inline int elf_sym__is_label(const GElf_Sym *sym)
-{
-       return elf_sym__type(sym) == STT_NOTYPE &&
-               sym->st_name != 0 &&
-               sym->st_shndx != SHN_UNDEF &&
-               sym->st_shndx != SHN_ABS;
-}
-
-static inline const char *elf_sec__name(const GElf_Shdr *shdr,
-                                       const Elf_Data *secstrs)
-{
-       return secstrs->d_buf + shdr->sh_name;
-}
-
-static inline int elf_sec__is_text(const GElf_Shdr *shdr,
-                                       const Elf_Data *secstrs)
-{
-       return strstr(elf_sec__name(shdr, secstrs), "text") != NULL;
-}
-
-static inline bool elf_sec__is_data(const GElf_Shdr *shdr,
-                                   const Elf_Data *secstrs)
-{
-       return strstr(elf_sec__name(shdr, secstrs), "data") != NULL;
-}
-
-static inline const char *elf_sym__name(const GElf_Sym *sym,
-                                       const Elf_Data *symstrs)
-{
-       return symstrs->d_buf + sym->st_name;
-}
-
-static Elf_Scn *elf_section_by_name(Elf *elf, GElf_Ehdr *ep,
-                                   GElf_Shdr *shp, const char *name,
-                                   size_t *idx)
-{
-       Elf_Scn *sec = NULL;
-       size_t cnt = 1;
-
-       while ((sec = elf_nextscn(elf, sec)) != NULL) {
-               char *str;
-
-               gelf_getshdr(sec, shp);
-               str = elf_strptr(elf, ep->e_shstrndx, shp->sh_name);
-               if (!strcmp(name, str)) {
-                       if (idx)
-                               *idx = cnt;
-                       break;
-               }
-               ++cnt;
-       }
-
-       return sec;
-}
-
-#define elf_section__for_each_rel(reldata, pos, pos_mem, idx, nr_entries) \
-       for (idx = 0, pos = gelf_getrel(reldata, 0, &pos_mem); \
-            idx < nr_entries; \
-            ++idx, pos = gelf_getrel(reldata, idx, &pos_mem))
-
-#define elf_section__for_each_rela(reldata, pos, pos_mem, idx, nr_entries) \
-       for (idx = 0, pos = gelf_getrela(reldata, 0, &pos_mem); \
-            idx < nr_entries; \
-            ++idx, pos = gelf_getrela(reldata, idx, &pos_mem))
-
-/*
- * We need to check if we have a .dynsym, so that we can handle the
- * .plt, synthesizing its symbols, that aren't on the symtabs (be it
- * .dynsym or .symtab).
- * And always look at the original dso, not at debuginfo packages, that
- * have the PLT data stripped out (shdr_rel_plt.sh_type == SHT_NOBITS).
- */
-static int
-dso__synthesize_plt_symbols(struct dso *dso, char *name, struct map *map,
-                           symbol_filter_t filter)
-{
-       uint32_t nr_rel_entries, idx;
-       GElf_Sym sym;
-       u64 plt_offset;
-       GElf_Shdr shdr_plt;
-       struct symbol *f;
-       GElf_Shdr shdr_rel_plt, shdr_dynsym;
-       Elf_Data *reldata, *syms, *symstrs;
-       Elf_Scn *scn_plt_rel, *scn_symstrs, *scn_dynsym;
-       size_t dynsym_idx;
-       GElf_Ehdr ehdr;
-       char sympltname[1024];
-       Elf *elf;
-       int nr = 0, symidx, fd, err = 0;
-
-       fd = open(name, O_RDONLY);
-       if (fd < 0)
-               goto out;
-
-       elf = elf_begin(fd, PERF_ELF_C_READ_MMAP, NULL);
-       if (elf == NULL)
-               goto out_close;
-
-       if (gelf_getehdr(elf, &ehdr) == NULL)
-               goto out_elf_end;
-
-       scn_dynsym = elf_section_by_name(elf, &ehdr, &shdr_dynsym,
-                                        ".dynsym", &dynsym_idx);
-       if (scn_dynsym == NULL)
-               goto out_elf_end;
-
-       scn_plt_rel = elf_section_by_name(elf, &ehdr, &shdr_rel_plt,
-                                         ".rela.plt", NULL);
-       if (scn_plt_rel == NULL) {
-               scn_plt_rel = elf_section_by_name(elf, &ehdr, &shdr_rel_plt,
-                                                 ".rel.plt", NULL);
-               if (scn_plt_rel == NULL)
-                       goto out_elf_end;
-       }
-
-       err = -1;
-
-       if (shdr_rel_plt.sh_link != dynsym_idx)
-               goto out_elf_end;
-
-       if (elf_section_by_name(elf, &ehdr, &shdr_plt, ".plt", NULL) == NULL)
-               goto out_elf_end;
-
-       /*
-        * Fetch the relocation section to find the idxes to the GOT
-        * and the symbols in the .dynsym they refer to.
-        */
-       reldata = elf_getdata(scn_plt_rel, NULL);
-       if (reldata == NULL)
-               goto out_elf_end;
-
-       syms = elf_getdata(scn_dynsym, NULL);
-       if (syms == NULL)
-               goto out_elf_end;
-
-       scn_symstrs = elf_getscn(elf, shdr_dynsym.sh_link);
-       if (scn_symstrs == NULL)
-               goto out_elf_end;
-
-       symstrs = elf_getdata(scn_symstrs, NULL);
-       if (symstrs == NULL)
-               goto out_elf_end;
-
-       nr_rel_entries = shdr_rel_plt.sh_size / shdr_rel_plt.sh_entsize;
-       plt_offset = shdr_plt.sh_offset;
-
-       if (shdr_rel_plt.sh_type == SHT_RELA) {
-               GElf_Rela pos_mem, *pos;
-
-               elf_section__for_each_rela(reldata, pos, pos_mem, idx,
-                                          nr_rel_entries) {
-                       symidx = GELF_R_SYM(pos->r_info);
-                       plt_offset += shdr_plt.sh_entsize;
-                       gelf_getsym(syms, symidx, &sym);
-                       snprintf(sympltname, sizeof(sympltname),
-                                "%s@plt", elf_sym__name(&sym, symstrs));
-
-                       f = symbol__new(plt_offset, shdr_plt.sh_entsize,
-                                       STB_GLOBAL, sympltname);
-                       if (!f)
-                               goto out_elf_end;
-
-                       if (filter && filter(map, f))
-                               symbol__delete(f);
-                       else {
-                               symbols__insert(&dso->symbols[map->type], f);
-                               ++nr;
-                       }
-               }
-       } else if (shdr_rel_plt.sh_type == SHT_REL) {
-               GElf_Rel pos_mem, *pos;
-               elf_section__for_each_rel(reldata, pos, pos_mem, idx,
-                                         nr_rel_entries) {
-                       symidx = GELF_R_SYM(pos->r_info);
-                       plt_offset += shdr_plt.sh_entsize;
-                       gelf_getsym(syms, symidx, &sym);
-                       snprintf(sympltname, sizeof(sympltname),
-                                "%s@plt", elf_sym__name(&sym, symstrs));
-
-                       f = symbol__new(plt_offset, shdr_plt.sh_entsize,
-                                       STB_GLOBAL, sympltname);
-                       if (!f)
-                               goto out_elf_end;
-
-                       if (filter && filter(map, f))
-                               symbol__delete(f);
-                       else {
-                               symbols__insert(&dso->symbols[map->type], f);
-                               ++nr;
-                       }
-               }
-       }
-
-       err = 0;
-out_elf_end:
-       elf_end(elf);
-out_close:
-       close(fd);
-
-       if (err == 0)
-               return nr;
-out:
-       pr_debug("%s: problems reading %s PLT info.\n",
-                __func__, dso->long_name);
-       return 0;
-}
-
-static bool elf_sym__is_a(GElf_Sym *sym, enum map_type type)
-{
-       switch (type) {
-       case MAP__FUNCTION:
-               return elf_sym__is_function(sym);
-       case MAP__VARIABLE:
-               return elf_sym__is_object(sym);
-       default:
-               return false;
-       }
-}
-
-static bool elf_sec__is_a(GElf_Shdr *shdr, Elf_Data *secstrs,
-                         enum map_type type)
-{
-       switch (type) {
-       case MAP__FUNCTION:
-               return elf_sec__is_text(shdr, secstrs);
-       case MAP__VARIABLE:
-               return elf_sec__is_data(shdr, secstrs);
-       default:
-               return false;
-       }
-}
-
-static size_t elf_addr_to_index(Elf *elf, GElf_Addr addr)
-{
-       Elf_Scn *sec = NULL;
-       GElf_Shdr shdr;
-       size_t cnt = 1;
-
-       while ((sec = elf_nextscn(elf, sec)) != NULL) {
-               gelf_getshdr(sec, &shdr);
-
-               if ((addr >= shdr.sh_addr) &&
-                   (addr < (shdr.sh_addr + shdr.sh_size)))
-                       return cnt;
-
-               ++cnt;
-       }
-
-       return -1;
-}
-
-static int dso__swap_init(struct dso *dso, unsigned char eidata)
-{
-       static unsigned int const endian = 1;
-
-       dso->needs_swap = DSO_SWAP__NO;
-
-       switch (eidata) {
-       case ELFDATA2LSB:
-               /* We are big endian, DSO is little endian. */
-               if (*(unsigned char const *)&endian != 1)
-                       dso->needs_swap = DSO_SWAP__YES;
-               break;
-
-       case ELFDATA2MSB:
-               /* We are little endian, DSO is big endian. */
-               if (*(unsigned char const *)&endian != 0)
-                       dso->needs_swap = DSO_SWAP__YES;
-               break;
-
-       default:
-               pr_err("unrecognized DSO data encoding %d\n", eidata);
-               return -EINVAL;
-       }
-
-       return 0;
-}
-
-static int dso__load_sym(struct dso *dso, struct map *map, const char *name,
-                        int fd, symbol_filter_t filter, int kmodule,
-                        int want_symtab)
-{
-       struct kmap *kmap = dso->kernel ? map__kmap(map) : NULL;
-       struct map *curr_map = map;
-       struct dso *curr_dso = dso;
-       Elf_Data *symstrs, *secstrs;
-       uint32_t nr_syms;
-       int err = -1;
-       uint32_t idx;
-       GElf_Ehdr ehdr;
-       GElf_Shdr shdr, opdshdr;
-       Elf_Data *syms, *opddata = NULL;
-       GElf_Sym sym;
-       Elf_Scn *sec, *sec_strndx, *opdsec;
-       Elf *elf;
-       int nr = 0;
-       size_t opdidx = 0;
-
-       elf = elf_begin(fd, PERF_ELF_C_READ_MMAP, NULL);
-       if (elf == NULL) {
-               pr_debug("%s: cannot read %s ELF file.\n", __func__, name);
-               goto out_close;
-       }
-
-       if (gelf_getehdr(elf, &ehdr) == NULL) {
-               pr_debug("%s: cannot get elf header.\n", __func__);
-               goto out_elf_end;
-       }
-
-       if (dso__swap_init(dso, ehdr.e_ident[EI_DATA]))
-               goto out_elf_end;
-
-       /* Always reject images with a mismatched build-id: */
-       if (dso->has_build_id) {
-               u8 build_id[BUILD_ID_SIZE];
-
-               if (elf_read_build_id(elf, build_id, BUILD_ID_SIZE) < 0)
-                       goto out_elf_end;
-
-               if (!dso__build_id_equal(dso, build_id))
-                       goto out_elf_end;
-       }
-
-       sec = elf_section_by_name(elf, &ehdr, &shdr, ".symtab", NULL);
-       if (sec == NULL) {
-               if (want_symtab)
-                       goto out_elf_end;
-
-               sec = elf_section_by_name(elf, &ehdr, &shdr, ".dynsym", NULL);
-               if (sec == NULL)
-                       goto out_elf_end;
-       }
-
-       opdsec = elf_section_by_name(elf, &ehdr, &opdshdr, ".opd", &opdidx);
-       if (opdshdr.sh_type != SHT_PROGBITS)
-               opdsec = NULL;
-       if (opdsec)
-               opddata = elf_rawdata(opdsec, NULL);
-
-       syms = elf_getdata(sec, NULL);
-       if (syms == NULL)
-               goto out_elf_end;
-
-       sec = elf_getscn(elf, shdr.sh_link);
-       if (sec == NULL)
-               goto out_elf_end;
-
-       symstrs = elf_getdata(sec, NULL);
-       if (symstrs == NULL)
-               goto out_elf_end;
-
-       sec_strndx = elf_getscn(elf, ehdr.e_shstrndx);
-       if (sec_strndx == NULL)
-               goto out_elf_end;
-
-       secstrs = elf_getdata(sec_strndx, NULL);
-       if (secstrs == NULL)
-               goto out_elf_end;
-
-       nr_syms = shdr.sh_size / shdr.sh_entsize;
-
-       memset(&sym, 0, sizeof(sym));
-       if (dso->kernel == DSO_TYPE_USER) {
-               dso->adjust_symbols = (ehdr.e_type == ET_EXEC ||
-                               elf_section_by_name(elf, &ehdr, &shdr,
-                                                    ".gnu.prelink_undo",
-                                                    NULL) != NULL);
-       } else {
-               dso->adjust_symbols = 0;
-       }
-       elf_symtab__for_each_symbol(syms, nr_syms, idx, sym) {
-               struct symbol *f;
-               const char *elf_name = elf_sym__name(&sym, symstrs);
-               char *demangled = NULL;
-               int is_label = elf_sym__is_label(&sym);
-               const char *section_name;
-
-               if (kmap && kmap->ref_reloc_sym && kmap->ref_reloc_sym->name &&
-                   strcmp(elf_name, kmap->ref_reloc_sym->name) == 0)
-                       kmap->ref_reloc_sym->unrelocated_addr = sym.st_value;
-
-               if (!is_label && !elf_sym__is_a(&sym, map->type))
-                       continue;
-
-               /* Reject ARM ELF "mapping symbols": these aren't unique and
-                * don't identify functions, so will confuse the profile
-                * output: */
-               if (ehdr.e_machine == EM_ARM) {
-                       if (!strcmp(elf_name, "$a") ||
-                           !strcmp(elf_name, "$d") ||
-                           !strcmp(elf_name, "$t"))
-                               continue;
-               }
-
-               if (opdsec && sym.st_shndx == opdidx) {
-                       u32 offset = sym.st_value - opdshdr.sh_addr;
-                       u64 *opd = opddata->d_buf + offset;
-                       sym.st_value = DSO__SWAP(dso, u64, *opd);
-                       sym.st_shndx = elf_addr_to_index(elf, sym.st_value);
-               }
-
-               sec = elf_getscn(elf, sym.st_shndx);
-               if (!sec)
-                       goto out_elf_end;
-
-               gelf_getshdr(sec, &shdr);
-
-               if (is_label && !elf_sec__is_a(&shdr, secstrs, map->type))
-                       continue;
-
-               section_name = elf_sec__name(&shdr, secstrs);
-
-               /* On ARM, symbols for thumb functions have 1 added to
-                * the symbol address as a flag - remove it */
-               if ((ehdr.e_machine == EM_ARM) &&
-                   (map->type == MAP__FUNCTION) &&
-                   (sym.st_value & 1))
-                       --sym.st_value;
-
-               if (dso->kernel != DSO_TYPE_USER || kmodule) {
-                       char dso_name[PATH_MAX];
-
-                       if (strcmp(section_name,
-                                  (curr_dso->short_name +
-                                   dso->short_name_len)) == 0)
-                               goto new_symbol;
-
-                       if (strcmp(section_name, ".text") == 0) {
-                               curr_map = map;
-                               curr_dso = dso;
-                               goto new_symbol;
-                       }
-
-                       snprintf(dso_name, sizeof(dso_name),
-                                "%s%s", dso->short_name, section_name);
-
-                       curr_map = map_groups__find_by_name(kmap->kmaps, map->type, dso_name);
-                       if (curr_map == NULL) {
-                               u64 start = sym.st_value;
-
-                               if (kmodule)
-                                       start += map->start + shdr.sh_offset;
-
-                               curr_dso = dso__new(dso_name);
-                               if (curr_dso == NULL)
-                                       goto out_elf_end;
-                               curr_dso->kernel = dso->kernel;
-                               curr_dso->long_name = dso->long_name;
-                               curr_dso->long_name_len = dso->long_name_len;
-                               curr_map = map__new2(start, curr_dso,
-                                                    map->type);
-                               if (curr_map == NULL) {
-                                       dso__delete(curr_dso);
-                                       goto out_elf_end;
-                               }
-                               curr_map->map_ip = identity__map_ip;
-                               curr_map->unmap_ip = identity__map_ip;
-                               curr_dso->symtab_type = dso->symtab_type;
-                               map_groups__insert(kmap->kmaps, curr_map);
-                               dsos__add(&dso->node, curr_dso);
-                               dso__set_loaded(curr_dso, map->type);
-                       } else
-                               curr_dso = curr_map->dso;
-
-                       goto new_symbol;
-               }
-
-               if (curr_dso->adjust_symbols) {
-                       pr_debug4("%s: adjusting symbol: st_value: %#" PRIx64 " "
-                                 "sh_addr: %#" PRIx64 " sh_offset: %#" PRIx64 "\n", __func__,
-                                 (u64)sym.st_value, (u64)shdr.sh_addr,
-                                 (u64)shdr.sh_offset);
-                       sym.st_value -= shdr.sh_addr - shdr.sh_offset;
-               }
-               /*
-		 * We need to figure out if the object was created from C++ sources.
-		 * DWARF DW_compile_unit has this, but we don't always have access
-                * to it...
-                */
-               demangled = bfd_demangle(NULL, elf_name, DMGL_PARAMS | DMGL_ANSI);
-               if (demangled != NULL)
-                       elf_name = demangled;
-new_symbol:
-               f = symbol__new(sym.st_value, sym.st_size,
-                               GELF_ST_BIND(sym.st_info), elf_name);
-               free(demangled);
-               if (!f)
-                       goto out_elf_end;
-
-               if (filter && filter(curr_map, f))
-                       symbol__delete(f);
-               else {
-                       symbols__insert(&curr_dso->symbols[curr_map->type], f);
-                       nr++;
-               }
-       }
-
-       /*
-        * For misannotated, zeroed, ASM function sizes.
-        */
-       if (nr > 0) {
-               symbols__fixup_duplicate(&dso->symbols[map->type]);
-               symbols__fixup_end(&dso->symbols[map->type]);
-               if (kmap) {
-                       /*
-                        * We need to fixup this here too because we create new
-                        * maps here, for things like vsyscall sections.
-                        */
-                       __map_groups__fixup_end(kmap->kmaps, map->type);
-               }
-       }
-       err = nr;
-out_elf_end:
-       elf_end(elf);
-out_close:
-       return err;
-}
-
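The large block deleted above is the old in-line ELF loader; judging by the prototypes added to symbol.h below (dso__load_sym(), dso__synthesize_plt_symbols(), the symsrc helpers) and the new NO_LIBELF_SUPPORT guards, it is being moved into a libelf-specific object rather than dropped.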
-static bool dso__build_id_equal(const struct dso *dso, u8 *build_id)
+bool dso__build_id_equal(const struct dso *dso, u8 *build_id)
 {
        return memcmp(dso->build_id, build_id, sizeof(dso->build_id)) == 0;
 }
@@ -1480,216 +919,11 @@ bool __dsos__read_build_ids(struct list_head *head, bool with_hits)
        return have_build_id;
 }
 
-/*
- * Align offset to 4 bytes as needed for note name and descriptor data.
- */
-#define NOTE_ALIGN(n) (((n) + 3) & -4U)
-
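The macro rounds its argument up to the next multiple of four, the padding ELF applies to both note names and descriptors. A self-contained spot-check of that arithmetic (values invented for illustration):

#include <assert.h>

#define NOTE_ALIGN(n) (((n) + 3) & -4U)

int main(void)
{
	/* already-aligned values are unchanged ... */
	assert(NOTE_ALIGN(0) == 0);
	assert(NOTE_ALIGN(4) == 4);
	/* ... everything else rounds up to the next multiple of 4 */
	assert(NOTE_ALIGN(1) == 4);
	assert(NOTE_ALIGN(5) == 8);
	assert(NOTE_ALIGN(sizeof("GNU")) == 4);	/* "GNU" + NUL = 4 bytes */
	return 0;
}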
-static int elf_read_build_id(Elf *elf, void *bf, size_t size)
-{
-       int err = -1;
-       GElf_Ehdr ehdr;
-       GElf_Shdr shdr;
-       Elf_Data *data;
-       Elf_Scn *sec;
-       Elf_Kind ek;
-       void *ptr;
-
-       if (size < BUILD_ID_SIZE)
-               goto out;
-
-       ek = elf_kind(elf);
-       if (ek != ELF_K_ELF)
-               goto out;
-
-       if (gelf_getehdr(elf, &ehdr) == NULL) {
-               pr_err("%s: cannot get elf header.\n", __func__);
-               goto out;
-       }
-
-       /*
-        * Check following sections for notes:
-        *   '.note.gnu.build-id'
-        *   '.notes'
-        *   '.note' (VDSO specific)
-        */
-       do {
-               sec = elf_section_by_name(elf, &ehdr, &shdr,
-                                         ".note.gnu.build-id", NULL);
-               if (sec)
-                       break;
-
-               sec = elf_section_by_name(elf, &ehdr, &shdr,
-                                         ".notes", NULL);
-               if (sec)
-                       break;
-
-               sec = elf_section_by_name(elf, &ehdr, &shdr,
-                                         ".note", NULL);
-               if (sec)
-                       break;
-
-               return err;
-
-       } while (0);
-
-       data = elf_getdata(sec, NULL);
-       if (data == NULL)
-               goto out;
-
-       ptr = data->d_buf;
-       while (ptr < (data->d_buf + data->d_size)) {
-               GElf_Nhdr *nhdr = ptr;
-               size_t namesz = NOTE_ALIGN(nhdr->n_namesz),
-                      descsz = NOTE_ALIGN(nhdr->n_descsz);
-               const char *name;
-
-               ptr += sizeof(*nhdr);
-               name = ptr;
-               ptr += namesz;
-               if (nhdr->n_type == NT_GNU_BUILD_ID &&
-                   nhdr->n_namesz == sizeof("GNU")) {
-                       if (memcmp(name, "GNU", sizeof("GNU")) == 0) {
-                               size_t sz = min(size, descsz);
-                               memcpy(bf, ptr, sz);
-                               memset(bf + sz, 0, size - sz);
-                               err = descsz;
-                               break;
-                       }
-               }
-               ptr += descsz;
-       }
-
-out:
-       return err;
-}
-
-int filename__read_build_id(const char *filename, void *bf, size_t size)
-{
-       int fd, err = -1;
-       Elf *elf;
-
-       if (size < BUILD_ID_SIZE)
-               goto out;
-
-       fd = open(filename, O_RDONLY);
-       if (fd < 0)
-               goto out;
-
-       elf = elf_begin(fd, PERF_ELF_C_READ_MMAP, NULL);
-       if (elf == NULL) {
-               pr_debug2("%s: cannot read %s ELF file.\n", __func__, filename);
-               goto out_close;
-       }
-
-       err = elf_read_build_id(elf, bf, size);
-
-       elf_end(elf);
-out_close:
-       close(fd);
-out:
-       return err;
-}
-
-int sysfs__read_build_id(const char *filename, void *build_id, size_t size)
-{
-       int fd, err = -1;
-
-       if (size < BUILD_ID_SIZE)
-               goto out;
-
-       fd = open(filename, O_RDONLY);
-       if (fd < 0)
-               goto out;
-
-       while (1) {
-               char bf[BUFSIZ];
-               GElf_Nhdr nhdr;
-               size_t namesz, descsz;
-
-               if (read(fd, &nhdr, sizeof(nhdr)) != sizeof(nhdr))
-                       break;
-
-               namesz = NOTE_ALIGN(nhdr.n_namesz);
-               descsz = NOTE_ALIGN(nhdr.n_descsz);
-               if (nhdr.n_type == NT_GNU_BUILD_ID &&
-                   nhdr.n_namesz == sizeof("GNU")) {
-                       if (read(fd, bf, namesz) != (ssize_t)namesz)
-                               break;
-                       if (memcmp(bf, "GNU", sizeof("GNU")) == 0) {
-                               size_t sz = min(descsz, size);
-                               if (read(fd, build_id, sz) == (ssize_t)sz) {
-                                       memset(build_id + sz, 0, size - sz);
-                                       err = 0;
-                                       break;
-                               }
-                       } else if (read(fd, bf, descsz) != (ssize_t)descsz)
-                               break;
-               } else {
-                       int n = namesz + descsz;
-                       if (read(fd, bf, n) != n)
-                               break;
-               }
-       }
-       close(fd);
-out:
-       return err;
-}
-
-static int filename__read_debuglink(const char *filename,
-                                   char *debuglink, size_t size)
-{
-       int fd, err = -1;
-       Elf *elf;
-       GElf_Ehdr ehdr;
-       GElf_Shdr shdr;
-       Elf_Data *data;
-       Elf_Scn *sec;
-       Elf_Kind ek;
-
-       fd = open(filename, O_RDONLY);
-       if (fd < 0)
-               goto out;
-
-       elf = elf_begin(fd, PERF_ELF_C_READ_MMAP, NULL);
-       if (elf == NULL) {
-               pr_debug2("%s: cannot read %s ELF file.\n", __func__, filename);
-               goto out_close;
-       }
-
-       ek = elf_kind(elf);
-       if (ek != ELF_K_ELF)
-               goto out_close;
-
-       if (gelf_getehdr(elf, &ehdr) == NULL) {
-               pr_err("%s: cannot get elf header.\n", __func__);
-               goto out_close;
-       }
-
-       sec = elf_section_by_name(elf, &ehdr, &shdr,
-                                 ".gnu_debuglink", NULL);
-       if (sec == NULL)
-               goto out_close;
-
-       data = elf_getdata(sec, NULL);
-       if (data == NULL)
-               goto out_close;
-
-       /* the start of this section is a zero-terminated string */
-       strncpy(debuglink, data->d_buf, size);
-
-       elf_end(elf);
-
-out_close:
-       close(fd);
-out:
-       return err;
-}
-
 char dso__symtab_origin(const struct dso *dso)
 {
        static const char origin[] = {
                [DSO_BINARY_TYPE__KALLSYMS]             = 'k',
+               [DSO_BINARY_TYPE__VMLINUX]              = 'v',
                [DSO_BINARY_TYPE__JAVA_JIT]             = 'j',
                [DSO_BINARY_TYPE__DEBUGLINK]            = 'l',
                [DSO_BINARY_TYPE__BUILD_ID_CACHE]       = 'B',
@@ -1700,6 +934,7 @@ char dso__symtab_origin(const struct dso *dso)
                [DSO_BINARY_TYPE__SYSTEM_PATH_KMODULE]  = 'K',
                [DSO_BINARY_TYPE__GUEST_KALLSYMS]       = 'g',
                [DSO_BINARY_TYPE__GUEST_KMODULE]        = 'G',
+               [DSO_BINARY_TYPE__GUEST_VMLINUX]        = 'V',
        };
 
        if (dso == NULL || dso->symtab_type == DSO_BINARY_TYPE__NOT_FOUND)
@@ -1775,7 +1010,9 @@ int dso__binary_type_file(struct dso *dso, enum dso_binary_type type,
 
        default:
        case DSO_BINARY_TYPE__KALLSYMS:
+       case DSO_BINARY_TYPE__VMLINUX:
        case DSO_BINARY_TYPE__GUEST_KALLSYMS:
+       case DSO_BINARY_TYPE__GUEST_VMLINUX:
        case DSO_BINARY_TYPE__JAVA_JIT:
        case DSO_BINARY_TYPE__NOT_FOUND:
                ret = -1;
@@ -1789,11 +1026,12 @@ int dso__load(struct dso *dso, struct map *map, symbol_filter_t filter)
 {
        char *name;
        int ret = -1;
-       int fd;
        u_int i;
        struct machine *machine;
        char *root_dir = (char *) "";
-       int want_symtab;
+       int ss_pos = 0;
+       struct symsrc ss_[2];
+       struct symsrc *syms_ss = NULL, *runtime_ss = NULL;
 
        dso__set_loaded(dso, map->type);
 
@@ -1835,54 +1073,69 @@ int dso__load(struct dso *dso, struct map *map, symbol_filter_t filter)
                root_dir = machine->root_dir;
 
        /* Iterate over candidate debug images.
-        * On the first pass, only load images if they have a full symtab.
-        * Failing that, do a second pass where we accept .dynsym also
+        * Keep track of "interesting" ones (those which have a symtab, dynsym,
+        * and/or opd section) for processing.
         */
-       want_symtab = 1;
-restart:
        for (i = 0; i < DSO_BINARY_TYPE__SYMTAB_CNT; i++) {
+               struct symsrc *ss = &ss_[ss_pos];
+               bool next_slot = false;
 
-               dso->symtab_type = binary_type_symtab[i];
+               enum dso_binary_type symtab_type = binary_type_symtab[i];
 
-               if (dso__binary_type_file(dso, dso->symtab_type,
+               if (dso__binary_type_file(dso, symtab_type,
                                          root_dir, name, PATH_MAX))
                        continue;
 
                /* Name is now the name of the next image to try */
-               fd = open(name, O_RDONLY);
-               if (fd < 0)
+               if (symsrc__init(ss, dso, name, symtab_type) < 0)
                        continue;
 
-               ret = dso__load_sym(dso, map, name, fd, filter, 0,
-                                   want_symtab);
-               close(fd);
+               if (!syms_ss && symsrc__has_symtab(ss)) {
+                       syms_ss = ss;
+                       next_slot = true;
+               }
 
-               /*
-                * Some people seem to have debuginfo files _WITHOUT_ debug
-                * info!?!?
-                */
-               if (!ret)
-                       continue;
+               if (!runtime_ss && symsrc__possibly_runtime(ss)) {
+                       runtime_ss = ss;
+                       next_slot = true;
+               }
 
-               if (ret > 0) {
-                       int nr_plt;
+               if (next_slot) {
+                       ss_pos++;
 
-                       nr_plt = dso__synthesize_plt_symbols(dso, name, map, filter);
-                       if (nr_plt > 0)
-                               ret += nr_plt;
-                       break;
+                       if (syms_ss && runtime_ss)
+                               break;
                }
+
        }
 
-       /*
-        * If we wanted a full symtab but no image had one,
-        * relax our requirements and repeat the search.
-        */
-       if (ret <= 0 && want_symtab) {
-               want_symtab = 0;
-               goto restart;
+       if (!runtime_ss && !syms_ss)
+               goto out_free;
+
+       if (runtime_ss && !syms_ss) {
+               syms_ss = runtime_ss;
+       }
+
+       /* We'll have to hope for the best */
+       if (!runtime_ss && syms_ss)
+               runtime_ss = syms_ss;
+
+       if (syms_ss)
+               ret = dso__load_sym(dso, map, syms_ss, runtime_ss, filter, 0);
+       else
+               ret = -1;
+
+       if (ret > 0) {
+               int nr_plt;
+
+               nr_plt = dso__synthesize_plt_symbols(dso, runtime_ss, map, filter);
+               if (nr_plt > 0)
+                       ret += nr_plt;
        }
 
+       for (; ss_pos > 0; ss_pos--)
+               symsrc__destroy(&ss_[ss_pos - 1]);
+out_free:
        free(name);
        if (ret < 0 && strstr(dso->name, " (deleted)") != NULL)
                return 0;
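The loop above keeps at most two symsrcs open (ss_[2]) because a single image can fill both roles, and a slot only advances when the image is "interesting". A minimal sketch of classifying one candidate with the new API, assuming perf's internal headers; probe_image() and its return convention are invented for illustration:

#include "symbol.h"

/* Sketch: 0 = unusable, 1 = runtime-only image (.dynsym and/or .opd),
 * 2 = carries a full .symtab and can serve as syms_ss. */
static int probe_image(struct dso *dso, const char *path,
		       enum dso_binary_type type)
{
	struct symsrc ss;
	int kind = 0;

	if (symsrc__init(&ss, dso, path, type) < 0)
		return 0;		/* not a readable ELF image */

	if (symsrc__has_symtab(&ss))
		kind = 2;
	else if (symsrc__possibly_runtime(&ss))
		kind = 1;

	symsrc__destroy(&ss);
	return kind;
}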
@@ -2030,25 +1283,6 @@ static int machine__set_modules_path(struct machine *machine)
        return map_groups__set_modules_path_dir(&machine->kmaps, modules_path);
 }
 
-/*
- * Constructor variant for modules (where we know from /proc/modules where
- * they are loaded) and for vmlinux, where only after we load all the
- * symbols we'll know where it starts and ends.
- */
-static struct map *map__new2(u64 start, struct dso *dso, enum map_type type)
-{
-       struct map *map = calloc(1, (sizeof(*map) +
-                                    (dso->kernel ? sizeof(struct kmap) : 0)));
-       if (map != NULL) {
-               /*
-                * ->end will be filled after we load all the symbols
-                */
-               map__init(map, type, start, 0, 0, dso);
-       }
-
-       return map;
-}
-
 struct map *machine__new_module(struct machine *machine, u64 start,
                                const char *filename)
 {
@@ -2141,22 +1375,30 @@ out_failure:
 int dso__load_vmlinux(struct dso *dso, struct map *map,
                      const char *vmlinux, symbol_filter_t filter)
 {
-       int err = -1, fd;
+       int err = -1;
+       struct symsrc ss;
        char symfs_vmlinux[PATH_MAX];
+       enum dso_binary_type symtab_type;
 
        snprintf(symfs_vmlinux, sizeof(symfs_vmlinux), "%s%s",
                 symbol_conf.symfs, vmlinux);
-       fd = open(symfs_vmlinux, O_RDONLY);
-       if (fd < 0)
+
+       if (dso->kernel == DSO_TYPE_GUEST_KERNEL)
+               symtab_type = DSO_BINARY_TYPE__GUEST_VMLINUX;
+       else
+               symtab_type = DSO_BINARY_TYPE__VMLINUX;
+
+       if (symsrc__init(&ss, dso, symfs_vmlinux, symtab_type))
                return -1;
 
-       dso__set_long_name(dso, (char *)vmlinux);
-       dso__set_loaded(dso, map->type);
-       err = dso__load_sym(dso, map, symfs_vmlinux, fd, filter, 0, 0);
-       close(fd);
+       err = dso__load_sym(dso, map, &ss, &ss, filter, 0);
+       symsrc__destroy(&ss);
 
-       if (err > 0)
+       if (err > 0) {
+               dso__set_long_name(dso, (char *)vmlinux);
+               dso__set_loaded(dso, map->type);
                pr_debug("Using %s for symbols\n", symfs_vmlinux);
+       }
 
        return err;
 }
@@ -2173,10 +1415,8 @@ int dso__load_vmlinux_path(struct dso *dso, struct map *map,
        filename = dso__build_id_filename(dso, NULL, 0);
        if (filename != NULL) {
                err = dso__load_vmlinux(dso, map, filename, filter);
-               if (err > 0) {
-                       dso__set_long_name(dso, filename);
+               if (err > 0)
                        goto out;
-               }
                free(filename);
        }
 
@@ -2291,9 +1531,8 @@ do_kallsyms:
        free(kallsyms_allocated_filename);
 
        if (err > 0) {
+               dso__set_long_name(dso, strdup("[kernel.kallsyms]"));
 out_fixup:
-               if (kallsyms_filename != NULL)
-                       dso__set_long_name(dso, strdup("[kernel.kallsyms]"));
                map__fixup_start(map);
                map__fixup_end(map);
        }
@@ -2352,7 +1591,7 @@ out_try_fixup:
        return err;
 }
 
-static void dsos__add(struct list_head *head, struct dso *dso)
+void dsos__add(struct list_head *head, struct dso *dso)
 {
        list_add_tail(&dso->node, head);
 }
@@ -2516,7 +1755,7 @@ struct process_args {
 };
 
 static int symbol__in_kernel(void *arg, const char *name,
-                            char type __used, u64 start, u64 end __used)
+                            char type __used, u64 start)
 {
        struct process_args *args = arg;
 
@@ -2754,7 +1993,8 @@ int symbol__init(void)
 
        symbol_conf.priv_size = ALIGN(symbol_conf.priv_size, sizeof(u64));
 
-       elf_version(EV_CURRENT);
+       symbol__elf_init();
+
        if (symbol_conf.sort_by_name)
                symbol_conf.priv_size += (sizeof(struct symbol_name_rb_node) -
                                          sizeof(struct symbol));
index 1fe733a1e21f2acb31282c9df562b34b0a03346f..fc4b1e630fd9cc941e0367eea2fa829ec85e0b12 100644
 #include <stdio.h>
 #include <byteswap.h>
 
+#ifndef NO_LIBELF_SUPPORT
+#include <libelf.h>
+#include <gelf.h>
+#include <elf.h>
+#endif
+
 #ifdef HAVE_CPLUS_DEMANGLE
 extern char *cplus_demangle(const char *, int);
 
@@ -158,6 +164,8 @@ struct addr_location {
 enum dso_binary_type {
        DSO_BINARY_TYPE__KALLSYMS = 0,
        DSO_BINARY_TYPE__GUEST_KALLSYMS,
+       DSO_BINARY_TYPE__VMLINUX,
+       DSO_BINARY_TYPE__GUEST_VMLINUX,
        DSO_BINARY_TYPE__JAVA_JIT,
        DSO_BINARY_TYPE__DEBUGLINK,
        DSO_BINARY_TYPE__BUILD_ID_CACHE,
@@ -217,6 +225,36 @@ struct dso {
        char             name[0];
 };
 
+struct symsrc {
+       char *name;
+       int fd;
+       enum dso_binary_type type;
+
+#ifndef NO_LIBELF_SUPPORT
+       Elf *elf;
+       GElf_Ehdr ehdr;
+
+       Elf_Scn *opdsec;
+       size_t opdidx;
+       GElf_Shdr opdshdr;
+
+       Elf_Scn *symtab;
+       GElf_Shdr symshdr;
+
+       Elf_Scn *dynsym;
+       size_t dynsym_idx;
+       GElf_Shdr dynshdr;
+
+       bool adjust_symbols;
+#endif
+};
+
+void symsrc__destroy(struct symsrc *ss);
+int symsrc__init(struct symsrc *ss, struct dso *dso, const char *name,
+                enum dso_binary_type type);
+bool symsrc__has_symtab(struct symsrc *ss);
+bool symsrc__possibly_runtime(struct symsrc *ss);
+
 #define DSO__SWAP(dso, type, val)                      \
 ({                                                     \
        type ____r = val;                               \
@@ -254,6 +292,7 @@ static inline void dso__set_loaded(struct dso *dso, enum map_type type)
 
 void dso__sort_by_name(struct dso *dso, enum map_type type);
 
+void dsos__add(struct list_head *head, struct dso *dso);
 struct dso *__dsos__findnew(struct list_head *head, const char *name);
 
 int dso__load(struct dso *dso, struct map *map, symbol_filter_t filter);
@@ -283,6 +322,7 @@ size_t dso__fprintf(struct dso *dso, enum map_type type, FILE *fp);
 char dso__symtab_origin(const struct dso *dso);
 void dso__set_long_name(struct dso *dso, char *name);
 void dso__set_build_id(struct dso *dso, void *build_id);
+bool dso__build_id_equal(const struct dso *dso, u8 *build_id);
 void dso__read_running_kernel_build_id(struct dso *dso,
                                       struct machine *machine);
 struct map *dso__new_map(const char *name);
@@ -297,7 +337,9 @@ bool __dsos__read_build_ids(struct list_head *head, bool with_hits);
 int build_id__sprintf(const u8 *build_id, int len, char *bf);
 int kallsyms__parse(const char *filename, void *arg,
                    int (*process_symbol)(void *arg, const char *name,
-                                         char type, u64 start, u64 end));
+                                         char type, u64 start));
+int filename__read_debuglink(const char *filename, char *debuglink,
+                            size_t size);
 
 void machine__destroy_kernel_maps(struct machine *machine);
 int __machine__create_kernel_maps(struct machine *machine, struct dso *kernel);
@@ -309,6 +351,8 @@ void machines__destroy_guest_kernel_maps(struct rb_root *machines);
 
 int symbol__init(void);
 void symbol__exit(void);
+void symbol__elf_init(void);
+struct symbol *symbol__new(u64 start, u64 len, u8 binding, const char *name);
 size_t symbol__fprintf_symname_offs(const struct symbol *sym,
                                    const struct addr_location *al, FILE *fp);
 size_t symbol__fprintf_symname(const struct symbol *sym, FILE *fp);
@@ -326,4 +370,15 @@ ssize_t dso__data_read_addr(struct dso *dso, struct map *map,
                            struct machine *machine, u64 addr,
                            u8 *data, ssize_t size);
 int dso__test_data(void);
+int dso__load_sym(struct dso *dso, struct map *map, struct symsrc *syms_ss,
+                 struct symsrc *runtime_ss, symbol_filter_t filter,
+                 int kmodule);
+int dso__synthesize_plt_symbols(struct dso *dso, struct symsrc *ss,
+                               struct map *map, symbol_filter_t filter);
+
+void symbols__insert(struct rb_root *symbols, struct symbol *sym);
+void symbols__fixup_duplicate(struct rb_root *symbols);
+void symbols__fixup_end(struct rb_root *symbols);
+void __map_groups__fixup_end(struct map_groups *mg, enum map_type type);
+
 #endif /* __PERF_SYMBOL */
index 7eeebcee291c4c70a8e0bc0dc0371d107080a1d9..884dde9b9bc1334294bc42d7d55bea65ba23f364 100644
@@ -58,8 +58,7 @@ size_t perf_top__header_snprintf(struct perf_top *top, char *bf, size_t size)
        }
 
        if (top->evlist->nr_entries == 1) {
-               struct perf_evsel *first;
-               first = list_entry(top->evlist->entries.next, struct perf_evsel, node);
+               struct perf_evsel *first = perf_evlist__first(top->evlist);
                ret += SNPRINTF(bf + ret, size - ret, "%" PRIu64 "%s ",
                                (uint64_t)first->attr.sample_period,
                                top->freq ? "Hz" : "");
index 0715c843c2e73ad6fc61832854cb33571732a73f..a5a554efeb503cd8e0d97ad6f5919043d8efdadb 100644
@@ -162,25 +162,16 @@ int trace_parse_common_pid(struct pevent *pevent, void *data)
        return pevent_data_pid(pevent, &record);
 }
 
-unsigned long long read_size(struct pevent *pevent, void *ptr, int size)
+unsigned long long read_size(struct event_format *event, void *ptr, int size)
 {
-       return pevent_read_number(pevent, ptr, size);
+       return pevent_read_number(event->pevent, ptr, size);
 }
 
-void print_trace_event(struct pevent *pevent, int cpu, void *data, int size)
+void event_format__print(struct event_format *event,
+                        int cpu, void *data, int size)
 {
-       struct event_format *event;
        struct pevent_record record;
        struct trace_seq s;
-       int type;
-
-       type = trace_parse_common_type(pevent, data);
-
-       event = pevent_find_event(pevent, type);
-       if (!event) {
-               warning("ug! no event found for type %d", type);
-               return;
-       }
 
        memset(&record, 0, sizeof(record));
        record.cpu = cpu;
@@ -192,6 +183,19 @@ void print_trace_event(struct pevent *pevent, int cpu, void *data, int size)
        trace_seq_do_printf(&s);
 }
 
+void print_trace_event(struct pevent *pevent, int cpu, void *data, int size)
+{
+       int type = trace_parse_common_type(pevent, data);
+       struct event_format *event = pevent_find_event(pevent, type);
+
+       if (!event) {
+               warning("ug! no event found for type %d", type);
+               return;
+       }
+
+       event_format__print(event, cpu, data, size);
+}
+
 void print_event(struct pevent *pevent, int cpu, void *data, int size,
                 unsigned long long nsecs, char *comm)
 {
@@ -289,7 +293,7 @@ struct event_format *trace_find_next_event(struct pevent *pevent,
 {
        static int idx;
 
-       if (!pevent->events)
+       if (!pevent || !pevent->events)
                return NULL;
 
        if (!event) {
index 474aa7a7df43e178717c7808c2ee279c4d026fb2..302ff262494c740fcf5830e241a550444892f485 100644
@@ -36,11 +36,10 @@ static int stop_script_unsupported(void)
 }
 
 static void process_event_unsupported(union perf_event *event __unused,
-                                     struct pevent *pevent __unused,
                                      struct perf_sample *sample __unused,
                                      struct perf_evsel *evsel __unused,
                                      struct machine *machine __unused,
-                                     struct thread *thread __unused)
+                                     struct addr_location *al __unused)
 {
 }
 
index 8fef1d6687b73250a24e682b4c41a9fb29384421..a55fd37ffea1e50600f8af68797ded8411686128 100644
@@ -9,7 +9,6 @@ struct machine;
 struct perf_sample;
 union perf_event;
 struct perf_tool;
-struct thread;
 
 extern int header_page_size_size;
 extern int header_page_ts_size;
@@ -32,6 +31,8 @@ int bigendian(void);
 
 struct pevent *read_trace_init(int file_bigendian, int host_bigendian);
 void print_trace_event(struct pevent *pevent, int cpu, void *data, int size);
+void event_format__print(struct event_format *event,
+                        int cpu, void *data, int size);
 
 void print_event(struct pevent *pevent, int cpu, void *data, int size,
                 unsigned long long nsecs, char *comm);
@@ -56,7 +57,7 @@ int trace_parse_common_pid(struct pevent *pevent, void *data);
 
 struct event_format *trace_find_next_event(struct pevent *pevent,
                                           struct event_format *event);
-unsigned long long read_size(struct pevent *pevent, void *ptr, int size);
+unsigned long long read_size(struct event_format *event, void *ptr, int size);
 unsigned long long eval_flag(const char *flag);
 
 struct pevent_record *trace_read_data(struct pevent *pevent, int cpu);
@@ -74,16 +75,19 @@ struct tracing_data *tracing_data_get(struct list_head *pattrs,
 void tracing_data_put(struct tracing_data *tdata);
 
 
+struct addr_location;
+
+struct perf_session;
+
 struct scripting_ops {
        const char *name;
        int (*start_script) (const char *script, int argc, const char **argv);
        int (*stop_script) (void);
        void (*process_event) (union perf_event *event,
-                              struct pevent *pevent,
                               struct perf_sample *sample,
                               struct perf_evsel *evsel,
                               struct machine *machine,
-                              struct thread *thread);
+                              struct addr_location *al);
        int (*generate_script) (struct pevent *pevent, const char *outfile);
 };
 
diff --git a/tools/perf/util/unwind.c b/tools/perf/util/unwind.c
new file mode 100644
index 0000000..00a42aa
--- /dev/null
+++ b/tools/perf/util/unwind.c
@@ -0,0 +1,567 @@
+/*
+ * Post mortem Dwarf CFI based unwinding on top of regs and stack dumps.
+ *
+ * Lots of this code has been borrowed or heavily inspired from parts of
+ * the libunwind 0.99 code which are (amongst other contributors I may have
+ * forgotten):
+ *
+ * Copyright (C) 2002-2007 Hewlett-Packard Co
+ *     Contributed by David Mosberger-Tang <davidm@hpl.hp.com>
+ *
+ * And the bugs have been added by:
+ *
+ * Copyright (C) 2010, Frederic Weisbecker <fweisbec@gmail.com>
+ * Copyright (C) 2012, Jiri Olsa <jolsa@redhat.com>
+ *
+ */
+
+#include <elf.h>
+#include <gelf.h>
+#include <fcntl.h>
+#include <string.h>
+#include <unistd.h>
+#include <sys/mman.h>
+#include <linux/list.h>
+#include <libunwind.h>
+#include <libunwind-ptrace.h>
+#include "thread.h"
+#include "session.h"
+#include "perf_regs.h"
+#include "unwind.h"
+#include "util.h"
+
+extern int
+UNW_OBJ(dwarf_search_unwind_table) (unw_addr_space_t as,
+                                   unw_word_t ip,
+                                   unw_dyn_info_t *di,
+                                   unw_proc_info_t *pi,
+                                   int need_unwind_info, void *arg);
+
+#define dwarf_search_unwind_table UNW_OBJ(dwarf_search_unwind_table)
+
+#define DW_EH_PE_FORMAT_MASK   0x0f    /* format of the encoded value */
+#define DW_EH_PE_APPL_MASK     0x70    /* how the value is to be applied */
+
+/* Pointer-encoding formats: */
+#define DW_EH_PE_omit          0xff
+#define DW_EH_PE_ptr           0x00    /* pointer-sized unsigned value */
+#define DW_EH_PE_udata4                0x03    /* unsigned 32-bit value */
+#define DW_EH_PE_udata8                0x04    /* unsigned 64-bit value */
+#define DW_EH_PE_sdata4                0x0b    /* signed 32-bit value */
+#define DW_EH_PE_sdata8                0x0c    /* signed 64-bit value */
+
+/* Pointer-encoding application: */
+#define DW_EH_PE_absptr                0x00    /* absolute value */
+#define DW_EH_PE_pcrel         0x10    /* rel. to addr. of encoded value */
+
+/*
+ * The following are not documented by LSB v1.3, yet they are used by
+ * GCC; presumably they aren't documented by LSB since they aren't
+ * used on Linux:
+ */
+#define DW_EH_PE_funcrel       0x40    /* start-of-procedure-relative */
+#define DW_EH_PE_aligned       0x50    /* aligned pointer */
+
+/* Flags intentionally not handled, since they're not needed:
+ * #define DW_EH_PE_indirect      0x80
+ * #define DW_EH_PE_uleb128       0x01
+ * #define DW_EH_PE_udata2        0x02
+ * #define DW_EH_PE_sleb128       0x09
+ * #define DW_EH_PE_sdata2        0x0a
+ * #define DW_EH_PE_textrel       0x20
+ * #define DW_EH_PE_datarel       0x30
+ */
+
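The format and application halves of an encoding byte live in separate nibbles, so one byte decomposes cleanly under the two masks. A self-contained spot-check, using 0x1b (pcrel + sdata4, a combination GCC commonly emits; chosen here purely for illustration):

#include <assert.h>

#define DW_EH_PE_FORMAT_MASK	0x0f
#define DW_EH_PE_APPL_MASK	0x70
#define DW_EH_PE_sdata4		0x0b
#define DW_EH_PE_pcrel		0x10

int main(void)
{
	unsigned char enc = DW_EH_PE_pcrel | DW_EH_PE_sdata4;	/* 0x1b */

	/* how the value is applied: relative to the encoded value's address */
	assert((enc & DW_EH_PE_APPL_MASK) == DW_EH_PE_pcrel);
	/* how the value is stored: signed 32-bit */
	assert((enc & DW_EH_PE_FORMAT_MASK) == DW_EH_PE_sdata4);
	return 0;
}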
+struct unwind_info {
+       struct perf_sample      *sample;
+       struct machine          *machine;
+       struct thread           *thread;
+       u64                     sample_uregs;
+};
+
+#define dw_read(ptr, type, end) ({     \
+       type *__p = (type *) ptr;       \
+       type  __v;                      \
+       if ((__p + 1) > (type *) end)   \
+               return -EINVAL;         \
+       __v = *__p++;                   \
+       ptr = (typeof(ptr)) __p;        \
+       __v;                            \
+       })
+
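Note that dw_read() (and dw_read_encoded_value() further down) hides a return -EINVAL inside the macro body: a read that would run past end returns from the enclosing function, not just from the macro, so both are only usable inside functions that return int, as __dw_read_encoded_value() and unwind_spec_ehframe() below do.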
+static int __dw_read_encoded_value(u8 **p, u8 *end, u64 *val,
+                                  u8 encoding)
+{
+       u8 *cur = *p;
+       *val = 0;
+
+       switch (encoding) {
+       case DW_EH_PE_omit:
+               *val = 0;
+               goto out;
+       case DW_EH_PE_ptr:
+               *val = dw_read(cur, unsigned long, end);
+               goto out;
+       default:
+               break;
+       }
+
+       switch (encoding & DW_EH_PE_APPL_MASK) {
+       case DW_EH_PE_absptr:
+               break;
+       case DW_EH_PE_pcrel:
+               *val = (unsigned long) cur;
+               break;
+       default:
+               return -EINVAL;
+       }
+
+       if ((encoding & 0x07) == 0x00)
+               encoding |= DW_EH_PE_udata4;
+
+       switch (encoding & DW_EH_PE_FORMAT_MASK) {
+       case DW_EH_PE_sdata4:
+               *val += dw_read(cur, s32, end);
+               break;
+       case DW_EH_PE_udata4:
+               *val += dw_read(cur, u32, end);
+               break;
+       case DW_EH_PE_sdata8:
+               *val += dw_read(cur, s64, end);
+               break;
+       case DW_EH_PE_udata8:
+               *val += dw_read(cur, u64, end);
+               break;
+       default:
+               return -EINVAL;
+       }
+
+ out:
+       *p = cur;
+       return 0;
+}
+
+#define dw_read_encoded_value(ptr, end, enc) ({                        \
+       u64 __v;                                                \
+       if (__dw_read_encoded_value(&ptr, end, &__v, enc)) {    \
+               return -EINVAL;                                 \
+       }                                                       \
+       __v;                                                    \
+       })
+
+static Elf_Scn *elf_section_by_name(Elf *elf, GElf_Ehdr *ep,
+                                   GElf_Shdr *shp, const char *name)
+{
+       Elf_Scn *sec = NULL;
+
+       while ((sec = elf_nextscn(elf, sec)) != NULL) {
+               char *str;
+
+               gelf_getshdr(sec, shp);
+               str = elf_strptr(elf, ep->e_shstrndx, shp->sh_name);
+               if (!strcmp(name, str))
+                       break;
+       }
+
+       return sec;
+}
+
+static u64 elf_section_offset(int fd, const char *name)
+{
+       Elf *elf;
+       GElf_Ehdr ehdr;
+       GElf_Shdr shdr;
+       u64 offset = 0;
+
+       elf = elf_begin(fd, PERF_ELF_C_READ_MMAP, NULL);
+       if (elf == NULL)
+               return 0;
+
+       do {
+               if (gelf_getehdr(elf, &ehdr) == NULL)
+                       break;
+
+               if (!elf_section_by_name(elf, &ehdr, &shdr, name))
+                       break;
+
+               offset = shdr.sh_offset;
+       } while (0);
+
+       elf_end(elf);
+       return offset;
+}
+
+struct table_entry {
+       u32 start_ip_offset;
+       u32 fde_offset;
+};
+
+struct eh_frame_hdr {
+       unsigned char version;
+       unsigned char eh_frame_ptr_enc;
+       unsigned char fde_count_enc;
+       unsigned char table_enc;
+
+       /*
+        * The rest of the header is variable-length and consists of the
+        * following members:
+        *
+        *      encoded_t eh_frame_ptr;
+        *      encoded_t fde_count;
+        */
+
+       /* A single encoded pointer should not be more than 8 bytes. */
+       u64 enc[2];
+
+       /*
+        * struct {
+        *    encoded_t start_ip;
+        *    encoded_t fde_addr;
+        * } binary_search_table[fde_count];
+        */
+       char data[0];
+} __packed;
+
+static int unwind_spec_ehframe(struct dso *dso, struct machine *machine,
+                              u64 offset, u64 *table_data, u64 *segbase,
+                              u64 *fde_count)
+{
+       struct eh_frame_hdr hdr;
+       u8 *enc = (u8 *) &hdr.enc;
+       u8 *end = (u8 *) &hdr.data;
+       ssize_t r;
+
+       r = dso__data_read_offset(dso, machine, offset,
+                                 (u8 *) &hdr, sizeof(hdr));
+       if (r != sizeof(hdr))
+               return -EINVAL;
+
+	/* We don't need eh_frame_ptr, just skip it. */
+       dw_read_encoded_value(enc, end, hdr.eh_frame_ptr_enc);
+
+       *fde_count  = dw_read_encoded_value(enc, end, hdr.fde_count_enc);
+       *segbase    = offset;
+       *table_data = (enc - (u8 *) &hdr) + offset;
+       return 0;
+}
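On success, *table_data is a plain file offset: the section offset plus the four fixed header bytes plus whatever the two encoded pointers consumed (enc - &hdr). find_proc_info() below adds map->start to both it and *segbase to form the remote-table addresses libunwind expects.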
+
+static int read_unwind_spec(struct dso *dso, struct machine *machine,
+                           u64 *table_data, u64 *segbase, u64 *fde_count)
+{
+       int ret = -EINVAL, fd;
+       u64 offset;
+
+       fd = dso__data_fd(dso, machine);
+       if (fd < 0)
+               return -EINVAL;
+
+       offset = elf_section_offset(fd, ".eh_frame_hdr");
+       close(fd);
+
+       if (offset)
+               ret = unwind_spec_ehframe(dso, machine, offset,
+                                         table_data, segbase,
+                                         fde_count);
+
+       /* TODO .debug_frame check if eh_frame_hdr fails */
+       return ret;
+}
+
+static struct map *find_map(unw_word_t ip, struct unwind_info *ui)
+{
+       struct addr_location al;
+
+       thread__find_addr_map(ui->thread, ui->machine, PERF_RECORD_MISC_USER,
+                             MAP__FUNCTION, ip, &al);
+       return al.map;
+}
+
+static int
+find_proc_info(unw_addr_space_t as, unw_word_t ip, unw_proc_info_t *pi,
+              int need_unwind_info, void *arg)
+{
+       struct unwind_info *ui = arg;
+       struct map *map;
+       unw_dyn_info_t di;
+       u64 table_data, segbase, fde_count;
+
+       map = find_map(ip, ui);
+       if (!map || !map->dso)
+               return -EINVAL;
+
+       pr_debug("unwind: find_proc_info dso %s\n", map->dso->name);
+
+       if (read_unwind_spec(map->dso, ui->machine,
+                            &table_data, &segbase, &fde_count))
+               return -EINVAL;
+
+       memset(&di, 0, sizeof(di));
+       di.format   = UNW_INFO_FORMAT_REMOTE_TABLE;
+       di.start_ip = map->start;
+       di.end_ip   = map->end;
+       di.u.rti.segbase    = map->start + segbase;
+       di.u.rti.table_data = map->start + table_data;
+       di.u.rti.table_len  = fde_count * sizeof(struct table_entry)
+                             / sizeof(unw_word_t);
+       return dwarf_search_unwind_table(as, ip, &di, pi,
+                                        need_unwind_info, arg);
+}
+
+static int access_fpreg(unw_addr_space_t __used as, unw_regnum_t __used num,
+                       unw_fpreg_t __used *val, int __used __write,
+                       void __used *arg)
+{
+       pr_err("unwind: access_fpreg unsupported\n");
+       return -UNW_EINVAL;
+}
+
+static int get_dyn_info_list_addr(unw_addr_space_t __used as,
+                                 unw_word_t __used *dil_addr,
+                                 void __used *arg)
+{
+       return -UNW_ENOINFO;
+}
+
+static int resume(unw_addr_space_t __used as, unw_cursor_t __used *cu,
+                 void __used *arg)
+{
+       pr_err("unwind: resume unsupported\n");
+       return -UNW_EINVAL;
+}
+
+static int
+get_proc_name(unw_addr_space_t __used as, unw_word_t __used addr,
+               char __used *bufp, size_t __used buf_len,
+               unw_word_t __used *offp, void __used *arg)
+{
+       pr_err("unwind: get_proc_name unsupported\n");
+       return -UNW_EINVAL;
+}
+
+static int access_dso_mem(struct unwind_info *ui, unw_word_t addr,
+                         unw_word_t *data)
+{
+       struct addr_location al;
+       ssize_t size;
+
+       thread__find_addr_map(ui->thread, ui->machine, PERF_RECORD_MISC_USER,
+                             MAP__FUNCTION, addr, &al);
+       if (!al.map) {
+               pr_debug("unwind: no map for %lx\n", (unsigned long)addr);
+               return -1;
+       }
+
+       if (!al.map->dso)
+               return -1;
+
+       size = dso__data_read_addr(al.map->dso, al.map, ui->machine,
+                                  addr, (u8 *) data, sizeof(*data));
+
+       return !(size == sizeof(*data));
+}
+
+static int reg_value(unw_word_t *valp, struct regs_dump *regs, int id,
+                    u64 sample_regs)
+{
+       int i, idx = 0;
+
+       if (!(sample_regs & (1 << id)))
+               return -EINVAL;
+
+       for (i = 0; i < id; i++) {
+               if (sample_regs & (1 << i))
+                       idx++;
+       }
+
+       *valp = regs->regs[idx];
+       return 0;
+}
+
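reg_value() recovers a register from the packed dump by counting how many mask bits are set below its id; that popcount is the slot index. A standalone spot-check of the mapping (mask and register values invented for illustration):

#include <assert.h>
#include <stdint.h>

int main(void)
{
	uint64_t mask = 0x0b;			/* ids 0, 1 and 3 sampled */
	uint64_t regs[] = { 100, 101, 103 };	/* packed in id order */
	int id = 3, i, idx = 0;

	assert(mask & (1ULL << id));		/* id 3 is in the dump */
	for (i = 0; i < id; i++)		/* two lower bits set ... */
		if (mask & (1ULL << i))
			idx++;

	assert(idx == 2);			/* ... so id 3 sits in slot 2 */
	assert(regs[idx] == 103);
	return 0;
}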
+static int access_mem(unw_addr_space_t __used as,
+                     unw_word_t addr, unw_word_t *valp,
+                     int __write, void *arg)
+{
+       struct unwind_info *ui = arg;
+       struct stack_dump *stack = &ui->sample->user_stack;
+       unw_word_t start, end;
+       int offset;
+       int ret;
+
+       /* Don't support write, probably not needed. */
+       if (__write || !stack || !ui->sample->user_regs.regs) {
+               *valp = 0;
+               return 0;
+       }
+
+       ret = reg_value(&start, &ui->sample->user_regs, PERF_REG_SP,
+                       ui->sample_uregs);
+       if (ret)
+               return ret;
+
+       end = start + stack->size;
+
+       /* Check overflow. */
+       if (addr + sizeof(unw_word_t) < addr)
+               return -EINVAL;
+
+       if (addr < start || addr + sizeof(unw_word_t) >= end) {
+               ret = access_dso_mem(ui, addr, valp);
+               if (ret) {
+                       pr_debug("unwind: access_mem %p not inside range %p-%p\n",
+                               (void *)addr, (void *)start, (void *)end);
+                       *valp = 0;
+                       return ret;
+               }
+               return 0;
+       }
+
+       offset = addr - start;
+       *valp  = *(unw_word_t *)&stack->data[offset];
+       pr_debug("unwind: access_mem addr %p, val %lx, offset %d\n",
+                (void *)addr, (unsigned long)*valp, offset);
+       return 0;
+}
+
+static int access_reg(unw_addr_space_t __used as,
+                     unw_regnum_t regnum, unw_word_t *valp,
+                     int __write, void *arg)
+{
+       struct unwind_info *ui = arg;
+       int id, ret;
+
+       /* Don't support write, I suspect we don't need it. */
+       if (__write) {
+               pr_err("unwind: access_reg w %d\n", regnum);
+               return 0;
+       }
+
+       if (!ui->sample->user_regs.regs) {
+               *valp = 0;
+               return 0;
+       }
+
+       id = unwind__arch_reg_id(regnum);
+       if (id < 0)
+               return -EINVAL;
+
+       ret = reg_value(valp, &ui->sample->user_regs, id, ui->sample_uregs);
+       if (ret) {
+               pr_err("unwind: can't read reg %d\n", regnum);
+               return ret;
+       }
+
+       pr_debug("unwind: reg %d, val %lx\n", regnum, (unsigned long)*valp);
+       return 0;
+}
+
+static void put_unwind_info(unw_addr_space_t __used as,
+                           unw_proc_info_t *pi __used,
+                           void *arg __used)
+{
+       pr_debug("unwind: put_unwind_info called\n");
+}
+
+static int entry(u64 ip, struct thread *thread, struct machine *machine,
+                unwind_entry_cb_t cb, void *arg)
+{
+       struct unwind_entry e;
+       struct addr_location al;
+
+       thread__find_addr_location(thread, machine,
+                                  PERF_RECORD_MISC_USER,
+                                  MAP__FUNCTION, ip, &al, NULL);
+
+       e.ip = ip;
+       e.map = al.map;
+       e.sym = al.sym;
+
+       pr_debug("unwind: %s:ip = 0x%" PRIx64 " (0x%" PRIx64 ")\n",
+                al.sym ? al.sym->name : "''",
+                ip,
+                al.map ? al.map->map_ip(al.map, ip) : (u64) 0);
+
+       return cb(&e, arg);
+}
+
+static void display_error(int err)
+{
+       switch (err) {
+       case UNW_EINVAL:
+               pr_err("unwind: Only supports local.\n");
+               break;
+       case UNW_EUNSPEC:
+               pr_err("unwind: Unspecified error.\n");
+               break;
+       case UNW_EBADREG:
+               pr_err("unwind: Register unavailable.\n");
+               break;
+       default:
+               break;
+       }
+}
+
+static unw_accessors_t accessors = {
+       .find_proc_info         = find_proc_info,
+       .put_unwind_info        = put_unwind_info,
+       .get_dyn_info_list_addr = get_dyn_info_list_addr,
+       .access_mem             = access_mem,
+       .access_reg             = access_reg,
+       .access_fpreg           = access_fpreg,
+       .resume                 = resume,
+       .get_proc_name          = get_proc_name,
+};
+
+static int get_entries(struct unwind_info *ui, unwind_entry_cb_t cb,
+                      void *arg)
+{
+       unw_addr_space_t addr_space;
+       unw_cursor_t c;
+       int ret;
+
+       addr_space = unw_create_addr_space(&accessors, 0);
+       if (!addr_space) {
+               pr_err("unwind: Can't create unwind address space.\n");
+               return -ENOMEM;
+       }
+
+       ret = unw_init_remote(&c, addr_space, ui);
+       if (ret)
+               display_error(ret);
+
+       while (!ret && (unw_step(&c) > 0)) {
+               unw_word_t ip;
+
+               unw_get_reg(&c, UNW_REG_IP, &ip);
+               ret = entry(ip, ui->thread, ui->machine, cb, arg);
+       }
+
+       unw_destroy_addr_space(addr_space);
+       return ret;
+}
+
+int unwind__get_entries(unwind_entry_cb_t cb, void *arg,
+                       struct machine *machine, struct thread *thread,
+                       u64 sample_uregs, struct perf_sample *data)
+{
+       unw_word_t ip;
+       struct unwind_info ui = {
+               .sample       = data,
+               .sample_uregs = sample_uregs,
+               .thread       = thread,
+               .machine      = machine,
+       };
+       int ret;
+
+       if (!data->user_regs.regs)
+               return -EINVAL;
+
+       ret = reg_value(&ip, &data->user_regs, PERF_REG_IP, sample_uregs);
+       if (ret)
+               return ret;
+
+       ret = entry(ip, thread, machine, cb, arg);
+       if (ret)
+               return -ENOMEM;
+
+       return get_entries(&ui, cb, arg);
+}
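For completeness, the shape of a consumer: a minimal unwind_entry_cb_t sketch, assuming perf's usual internal headers; the machine, thread and sample handed to unwind__get_entries() would come from ordinary event processing:

#include <inttypes.h>
#include "unwind.h"
#include "debug.h"

/* Sketch: print each frame as it is produced; returning non-zero
 * from the callback stops the walk. */
static int print_entry(struct unwind_entry *entry, void *arg __used)
{
	pr_debug("  ip 0x%" PRIx64 " %s\n", entry->ip,
		 entry->sym ? entry->sym->name : "??");
	return 0;
}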
diff --git a/tools/perf/util/unwind.h b/tools/perf/util/unwind.h
new file mode 100644
index 0000000..919bd6a
--- /dev/null
+++ b/tools/perf/util/unwind.h
@@ -0,0 +1,34 @@
+#ifndef __UNWIND_H
+#define __UNWIND_H
+
+#include "types.h"
+#include "event.h"
+#include "symbol.h"
+
+struct unwind_entry {
+       struct map      *map;
+       struct symbol   *sym;
+       u64             ip;
+};
+
+typedef int (*unwind_entry_cb_t)(struct unwind_entry *entry, void *arg);
+
+#ifndef NO_LIBUNWIND_SUPPORT
+int unwind__get_entries(unwind_entry_cb_t cb, void *arg,
+                       struct machine *machine,
+                       struct thread *thread,
+                       u64 sample_uregs,
+                       struct perf_sample *data);
+int unwind__arch_reg_id(int regnum);
+#else
+static inline int
+unwind__get_entries(unwind_entry_cb_t cb __used, void *arg __used,
+                   struct machine *machine __used,
+                   struct thread *thread __used,
+                   u64 sample_uregs __used,
+                   struct perf_sample *data __used)
+{
+       return 0;
+}
+#endif /* NO_LIBUNWIND_SUPPORT */
+#endif /* __UNWIND_H */
index d03599fbe78bab080e59a096c3629b650bc12e84..1b8775c3707d483676c170bbf67d220d8ef1c14a 100644
@@ -1,6 +1,9 @@
 #include "../perf.h"
 #include "util.h"
 #include <sys/mman.h>
+#include <execinfo.h>
+#include <stdio.h>
+#include <stdlib.h>
 
 /*
  * XXX We need to find a better place for these things...
@@ -158,3 +161,19 @@ size_t hex_width(u64 v)
 
        return n;
 }
+
+/* Obtain a backtrace and print it to stdout. */
+void dump_stack(void)
+{
+       void *array[16];
+       size_t size = backtrace(array, ARRAY_SIZE(array));
+       char **strings = backtrace_symbols(array, size);
+       size_t i;
+
+	printf("Obtained %zu stack frames.\n", size);
+
+       for (i = 0; i < size; i++)
+               printf("%s\n", strings[i]);
+
+       free(strings);
+}
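Two glibc caveats worth noting here: backtrace_symbols() resolves names via the dynamic symbol table, so static functions tend to print as bare addresses unless the binary is linked with -rdynamic, and the returned strings live inside the single malloc'd block freed above, so they must not be freed individually.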
index b13c7331eaf8cbf89a8a1b6626238e44ab84d657..00a93a91a23542d04c29b906f7be866295bf7ab6 100644
@@ -266,4 +266,6 @@ size_t hex_width(u64 v);
 
 char *rtrim(char *s);
 
+void dump_stack(void);
+
 #endif
index bde8521d56bb37d8952a99a7d552cb6ae2b6e50e..96ce80a3743ba22e13935f238e9835aca8923326 100644
@@ -1,6 +1,8 @@
 ifeq ("$(origin O)", "command line")
-       OUTPUT := $(O)/
-       COMMAND_O := O=$(O)
+       dummy := $(if $(shell test -d $(O) || echo $(O)),$(error O=$(O) does not exist),)
+       ABSOLUTE_O := $(shell cd $(O) ; pwd)
+       OUTPUT := $(ABSOLUTE_O)/
+       COMMAND_O := O=$(ABSOLUTE_O)
 endif
 
 ifneq ($(OUTPUT),)