]> git.karo-electronics.de Git - karo-tx-linux.git/commitdiff
Merge tag 'alternatives_padding' of git://git.kernel.org/pub/scm/linux/kernel/git...
authorIngo Molnar <mingo@kernel.org>
Wed, 4 Mar 2015 05:33:49 +0000 (06:33 +0100)
committerIngo Molnar <mingo@kernel.org>
Wed, 4 Mar 2015 05:36:15 +0000 (06:36 +0100)
Pull alternative instructions framework improvements from Borislav Petkov:

 "A more involved rework of the alternatives framework to be able to
  pad instructions and thus make using the alternatives macros more
  straightforward and without having to figure out old and new instruction
  sizes but have the toolchain figure that out for us.

  Furthermore, it optimizes JMPs used so that fetch and decode can be
  relieved with smaller versions of the JMPs, where possible.

  Some stats:

    x86_64 defconfig:

    Alternatives sites total:               2478
    Total padding added (in Bytes):         6051

  The padding is currently done for:

    X86_FEATURE_ALWAYS
    X86_FEATURE_ERMS
    X86_FEATURE_LFENCE_RDTSC
    X86_FEATURE_MFENCE_RDTSC
    X86_FEATURE_SMAP

  This is with the latest version of the patchset. Of course, on each
  machine the alternatives sites actually being patched are a proper
  subset of the total number."

Signed-off-by: Ingo Molnar <mingo@kernel.org>
23 files changed:
arch/x86/include/asm/alternative-asm.h
arch/x86/include/asm/alternative.h
arch/x86/include/asm/apic.h
arch/x86/include/asm/barrier.h
arch/x86/include/asm/cpufeature.h
arch/x86/include/asm/processor.h
arch/x86/include/asm/smap.h
arch/x86/kernel/alternative.c
arch/x86/kernel/cpu/amd.c
arch/x86/kernel/entry_32.S
arch/x86/lib/clear_page_64.S
arch/x86/lib/copy_page_64.S
arch/x86/lib/copy_user_64.S
arch/x86/lib/memcpy_64.S
arch/x86/lib/memmove_64.S
arch/x86/lib/memset_64.S
arch/x86/um/asm/barrier.h
tools/perf/bench/mem-memcpy-x86-64-asm-def.h
tools/perf/bench/mem-memcpy-x86-64-asm.S
tools/perf/bench/mem-memcpy.c
tools/perf/bench/mem-memset-x86-64-asm-def.h
tools/perf/bench/mem-memset-x86-64-asm.S
tools/perf/util/include/asm/alternative-asm.h

index 372231c22a47a46b1417e5c6739d88eb927f89fd..524bddce0b7608813c8a85c590c3fb70f5bf6ce1 100644 (file)
        .endm
 #endif
 
-.macro altinstruction_entry orig alt feature orig_len alt_len
+.macro altinstruction_entry orig alt feature orig_len alt_len pad_len
        .long \orig - .
        .long \alt - .
        .word \feature
        .byte \orig_len
        .byte \alt_len
+       .byte \pad_len
+.endm
+
+.macro ALTERNATIVE oldinstr, newinstr, feature
+140:
+       \oldinstr
+141:
+       .skip -(((144f-143f)-(141b-140b)) > 0) * ((144f-143f)-(141b-140b)),0x90
+142:
+
+       .pushsection .altinstructions,"a"
+       altinstruction_entry 140b,143f,\feature,142b-140b,144f-143f,142b-141b
+       .popsection
+
+       .pushsection .altinstr_replacement,"ax"
+143:
+       \newinstr
+144:
+       .popsection
+.endm
+
+.macro ALTERNATIVE_2 oldinstr, newinstr1, feature1, newinstr2, feature2
+140:
+       \oldinstr
+141:
+       .skip -(((144f-143f)-(141b-140b)) > 0) * ((144f-143f)-(141b-140b)),0x90
+       .skip -(((145f-144f)-(144f-143f)-(141b-140b)) > 0) * ((145f-144f)-(144f-143f)-(141b-140b)),0x90
+142:
+
+       .pushsection .altinstructions,"a"
+       altinstruction_entry 140b,143f,\feature1,142b-140b,144f-143f,142b-141b
+       altinstruction_entry 140b,144f,\feature2,142b-140b,145f-144f,142b-141b
+       .popsection
+
+       .pushsection .altinstr_replacement,"ax"
+143:
+       \newinstr1
+144:
+       \newinstr2
+145:
+       .popsection
 .endm
 
 #endif  /*  __ASSEMBLY__  */
index 473bdbee378a10ac2030b586dc33d3be74de5a11..5aef6a97d80e53ac215782aa88d577658d726c2e 100644 (file)
@@ -48,8 +48,9 @@ struct alt_instr {
        s32 repl_offset;        /* offset to replacement instruction */
        u16 cpuid;              /* cpuid bit set for replacement */
        u8  instrlen;           /* length of original instruction */
-       u8  replacementlen;     /* length of new instruction, <= instrlen */
-};
+       u8  replacementlen;     /* length of new instruction */
+       u8  padlen;             /* length of build-time padding */
+} __packed;
 
 extern void alternative_instructions(void);
 extern void apply_alternatives(struct alt_instr *start, struct alt_instr *end);
@@ -76,50 +77,61 @@ static inline int alternatives_text_reserved(void *start, void *end)
 }
 #endif /* CONFIG_SMP */
 
-#define OLDINSTR(oldinstr)     "661:\n\t" oldinstr "\n662:\n"
+#define b_replacement(num)     "664"#num
+#define e_replacement(num)     "665"#num
 
-#define b_replacement(number)  "663"#number
-#define e_replacement(number)  "664"#number
+#define alt_end_marker         "663"
+#define alt_slen               "662b-661b"
+#define alt_pad_len            alt_end_marker"b-662b"
+#define alt_total_slen         alt_end_marker"b-661b"
+#define alt_rlen(num)          e_replacement(num)"f-"b_replacement(num)"f"
 
-#define alt_slen "662b-661b"
-#define alt_rlen(number) e_replacement(number)"f-"b_replacement(number)"f"
+#define __OLDINSTR(oldinstr, num)                                      \
+       "661:\n\t" oldinstr "\n662:\n"                                  \
+       ".skip -(((" alt_rlen(num) ")-(" alt_slen ")) > 0) * "          \
+               "((" alt_rlen(num) ")-(" alt_slen ")),0x90\n"
 
-#define ALTINSTR_ENTRY(feature, number)                                              \
+#define OLDINSTR(oldinstr, num)                                                \
+       __OLDINSTR(oldinstr, num)                                       \
+       alt_end_marker ":\n"
+
+/*
+ * Pad the second replacement alternative with additional NOPs if it is
+ * additionally longer than the first replacement alternative.
+ */
+#define OLDINSTR_2(oldinstr, num1, num2)                                       \
+       __OLDINSTR(oldinstr, num1)                                              \
+       ".skip -(((" alt_rlen(num2) ")-(" alt_rlen(num1) ")-(662b-661b)) > 0) * " \
+               "((" alt_rlen(num2) ")-(" alt_rlen(num1) ")-(662b-661b)),0x90\n"  \
+       alt_end_marker ":\n"
+
+#define ALTINSTR_ENTRY(feature, num)                                         \
        " .long 661b - .\n"                             /* label           */ \
-       " .long " b_replacement(number)"f - .\n"        /* new instruction */ \
+       " .long " b_replacement(num)"f - .\n"           /* new instruction */ \
        " .word " __stringify(feature) "\n"             /* feature bit     */ \
-       " .byte " alt_slen "\n"                         /* source len      */ \
-       " .byte " alt_rlen(number) "\n"                 /* replacement len */
+       " .byte " alt_total_slen "\n"                   /* source len      */ \
+       " .byte " alt_rlen(num) "\n"                    /* replacement len */ \
+       " .byte " alt_pad_len "\n"                      /* pad len */
 
-#define DISCARD_ENTRY(number)                          /* rlen <= slen */    \
-       " .byte 0xff + (" alt_rlen(number) ") - (" alt_slen ")\n"
-
-#define ALTINSTR_REPLACEMENT(newinstr, feature, number)        /* replacement */     \
-       b_replacement(number)":\n\t" newinstr "\n" e_replacement(number) ":\n\t"
+#define ALTINSTR_REPLACEMENT(newinstr, feature, num)   /* replacement */     \
+       b_replacement(num)":\n\t" newinstr "\n" e_replacement(num) ":\n\t"
 
 /* alternative assembly primitive: */
 #define ALTERNATIVE(oldinstr, newinstr, feature)                       \
-       OLDINSTR(oldinstr)                                              \
+       OLDINSTR(oldinstr, 1)                                           \
        ".pushsection .altinstructions,\"a\"\n"                         \
        ALTINSTR_ENTRY(feature, 1)                                      \
        ".popsection\n"                                                 \
-       ".pushsection .discard,\"aw\",@progbits\n"                      \
-       DISCARD_ENTRY(1)                                                \
-       ".popsection\n"                                                 \
        ".pushsection .altinstr_replacement, \"ax\"\n"                  \
        ALTINSTR_REPLACEMENT(newinstr, feature, 1)                      \
        ".popsection"
 
 #define ALTERNATIVE_2(oldinstr, newinstr1, feature1, newinstr2, feature2)\
-       OLDINSTR(oldinstr)                                              \
+       OLDINSTR_2(oldinstr, 1, 2)                                      \
        ".pushsection .altinstructions,\"a\"\n"                         \
        ALTINSTR_ENTRY(feature1, 1)                                     \
        ALTINSTR_ENTRY(feature2, 2)                                     \
        ".popsection\n"                                                 \
-       ".pushsection .discard,\"aw\",@progbits\n"                      \
-       DISCARD_ENTRY(1)                                                \
-       DISCARD_ENTRY(2)                                                \
-       ".popsection\n"                                                 \
        ".pushsection .altinstr_replacement, \"ax\"\n"                  \
        ALTINSTR_REPLACEMENT(newinstr1, feature1, 1)                    \
        ALTINSTR_REPLACEMENT(newinstr2, feature2, 2)                    \
@@ -146,6 +158,9 @@ static inline int alternatives_text_reserved(void *start, void *end)
 #define alternative(oldinstr, newinstr, feature)                       \
        asm volatile (ALTERNATIVE(oldinstr, newinstr, feature) : : : "memory")
 
+#define alternative_2(oldinstr, newinstr1, feature1, newinstr2, feature2) \
+       asm volatile(ALTERNATIVE_2(oldinstr, newinstr1, feature1, newinstr2, feature2) ::: "memory")
+
 /*
  * Alternative inline assembly with input.
  *
index efc3b22d896eb23b7e37cf9c720065c0b6b0c717..8118e94d50ab8f1d8efaf4dc9a45372007566a9b 100644 (file)
@@ -91,7 +91,7 @@ static inline void native_apic_mem_write(u32 reg, u32 v)
 {
        volatile u32 *addr = (volatile u32 *)(APIC_BASE + reg);
 
-       alternative_io("movl %0, %1", "xchgl %0, %1", X86_BUG_11AP,
+       alternative_io("movl %0, %P1", "xchgl %0, %P1", X86_BUG_11AP,
                       ASM_OUTPUT2("=r" (v), "=m" (*addr)),
                       ASM_OUTPUT2("0" (v), "m" (*addr)));
 }
index 2ab1eb33106eec42eff90d27b98cb698b5c4c835..959e45b81fe29192b0f1c97a65e028e7314f603d 100644 (file)
@@ -95,13 +95,11 @@ do {                                                                        \
  * Stop RDTSC speculation. This is needed when you need to use RDTSC
  * (or get_cycles or vread that possibly accesses the TSC) in a defined
  * code region.
- *
- * (Could use an alternative three way for this if there was one.)
  */
 static __always_inline void rdtsc_barrier(void)
 {
-       alternative(ASM_NOP3, "mfence", X86_FEATURE_MFENCE_RDTSC);
-       alternative(ASM_NOP3, "lfence", X86_FEATURE_LFENCE_RDTSC);
+       alternative_2("", "mfence", X86_FEATURE_MFENCE_RDTSC,
+                         "lfence", X86_FEATURE_LFENCE_RDTSC);
 }
 
 #endif /* _ASM_X86_BARRIER_H */
index d6428ea5d316fdf92450eef98310a1aec153a16a..0f7a5a1a8db28f0af95ee95a0422891155b9cc91 100644 (file)
@@ -419,6 +419,7 @@ static __always_inline __pure bool __static_cpu_has(u16 bit)
                         " .word %P0\n"         /* 1: do replace */
                         " .byte 2b - 1b\n"     /* source len */
                         " .byte 0\n"           /* replacement len */
+                        " .byte 0\n"           /* pad len */
                         ".previous\n"
                         /* skipping size check since replacement size = 0 */
                         : : "i" (X86_FEATURE_ALWAYS) : : t_warn);
@@ -433,6 +434,7 @@ static __always_inline __pure bool __static_cpu_has(u16 bit)
                         " .word %P0\n"         /* feature bit */
                         " .byte 2b - 1b\n"     /* source len */
                         " .byte 0\n"           /* replacement len */
+                        " .byte 0\n"           /* pad len */
                         ".previous\n"
                         /* skipping size check since replacement size = 0 */
                         : : "i" (bit) : : t_no);
@@ -458,6 +460,7 @@ static __always_inline __pure bool __static_cpu_has(u16 bit)
                             " .word %P1\n"             /* feature bit */
                             " .byte 2b - 1b\n"         /* source len */
                             " .byte 4f - 3f\n"         /* replacement len */
+                            " .byte 0\n"               /* pad len */
                             ".previous\n"
                             ".section .discard,\"aw\",@progbits\n"
                             " .byte 0xff + (4f-3f) - (2b-1b)\n" /* size check */
@@ -484,31 +487,30 @@ static __always_inline __pure bool __static_cpu_has(u16 bit)
 static __always_inline __pure bool _static_cpu_has_safe(u16 bit)
 {
 #ifdef CC_HAVE_ASM_GOTO
-/*
- * We need to spell the jumps to the compiler because, depending on the offset,
- * the replacement jump can be bigger than the original jump, and this we cannot
- * have. Thus, we force the jump to the widest, 4-byte, signed relative
- * offset even though the last would often fit in less bytes.
- */
-               asm_volatile_goto("1: .byte 0xe9\n .long %l[t_dynamic] - 2f\n"
+               asm_volatile_goto("1: jmp %l[t_dynamic]\n"
                         "2:\n"
+                        ".skip -(((5f-4f) - (2b-1b)) > 0) * "
+                                "((5f-4f) - (2b-1b)),0x90\n"
+                        "3:\n"
                         ".section .altinstructions,\"a\"\n"
                         " .long 1b - .\n"              /* src offset */
-                        " .long 3f - .\n"              /* repl offset */
+                        " .long 4f - .\n"              /* repl offset */
                         " .word %P1\n"                 /* always replace */
-                        " .byte 2b - 1b\n"             /* src len */
-                        " .byte 4f - 3f\n"             /* repl len */
+                        " .byte 3b - 1b\n"             /* src len */
+                        " .byte 5f - 4f\n"             /* repl len */
+                        " .byte 3b - 2b\n"             /* pad len */
                         ".previous\n"
                         ".section .altinstr_replacement,\"ax\"\n"
-                        "3: .byte 0xe9\n .long %l[t_no] - 2b\n"
-                        "4:\n"
+                        "4: jmp %l[t_no]\n"
+                        "5:\n"
                         ".previous\n"
                         ".section .altinstructions,\"a\"\n"
                         " .long 1b - .\n"              /* src offset */
                         " .long 0\n"                   /* no replacement */
                         " .word %P0\n"                 /* feature bit */
-                        " .byte 2b - 1b\n"             /* src len */
+                        " .byte 3b - 1b\n"             /* src len */
                         " .byte 0\n"                   /* repl len */
+                        " .byte 0\n"                   /* pad len */
                         ".previous\n"
                         : : "i" (bit), "i" (X86_FEATURE_ALWAYS)
                         : : t_dynamic, t_no);
@@ -528,6 +530,7 @@ static __always_inline __pure bool _static_cpu_has_safe(u16 bit)
                             " .word %P2\n"             /* always replace */
                             " .byte 2b - 1b\n"         /* source len */
                             " .byte 4f - 3f\n"         /* replacement len */
+                            " .byte 0\n"               /* pad len */
                             ".previous\n"
                             ".section .discard,\"aw\",@progbits\n"
                             " .byte 0xff + (4f-3f) - (2b-1b)\n" /* size check */
@@ -542,6 +545,7 @@ static __always_inline __pure bool _static_cpu_has_safe(u16 bit)
                             " .word %P1\n"             /* feature bit */
                             " .byte 4b - 3b\n"         /* src len */
                             " .byte 6f - 5f\n"         /* repl len */
+                            " .byte 0\n"               /* pad len */
                             ".previous\n"
                             ".section .discard,\"aw\",@progbits\n"
                             " .byte 0xff + (6f-5f) - (4b-3b)\n" /* size check */
index ec1c93588cefd0c4e3a4705c6d966e2555ba95aa..7be2c9a6caba9c4b916237c7113679a505ef2351 100644 (file)
@@ -761,10 +761,10 @@ extern char                       ignore_fpu_irq;
 #define ARCH_HAS_SPINLOCK_PREFETCH
 
 #ifdef CONFIG_X86_32
-# define BASE_PREFETCH         ASM_NOP4
+# define BASE_PREFETCH         ""
 # define ARCH_HAS_PREFETCH
 #else
-# define BASE_PREFETCH         "prefetcht0 (%1)"
+# define BASE_PREFETCH         "prefetcht0 %P1"
 #endif
 
 /*
@@ -775,10 +775,9 @@ extern char                        ignore_fpu_irq;
  */
 static inline void prefetch(const void *x)
 {
-       alternative_input(BASE_PREFETCH,
-                         "prefetchnta (%1)",
+       alternative_input(BASE_PREFETCH, "prefetchnta %P1",
                          X86_FEATURE_XMM,
-                         "r" (x));
+                         "m" (*(const char *)x));
 }
 
 /*
@@ -788,10 +787,9 @@ static inline void prefetch(const void *x)
  */
 static inline void prefetchw(const void *x)
 {
-       alternative_input(BASE_PREFETCH,
-                         "prefetchw (%1)",
-                         X86_FEATURE_3DNOW,
-                         "r" (x));
+       alternative_input(BASE_PREFETCH, "prefetchw %P1",
+                         X86_FEATURE_3DNOWPREFETCH,
+                         "m" (*(const char *)x));
 }
 
 static inline void spin_lock_prefetch(const void *x)
index 8d3120f4e27053b3fae6d6334fe7a3a6dee33e82..ba665ebd17bb8f22a39712dbf2a8c398b9e9207f 100644 (file)
 
 #ifdef CONFIG_X86_SMAP
 
-#define ASM_CLAC                                                       \
-       661: ASM_NOP3 ;                                                 \
-       .pushsection .altinstr_replacement, "ax" ;                      \
-       662: __ASM_CLAC ;                                               \
-       .popsection ;                                                   \
-       .pushsection .altinstructions, "a" ;                            \
-       altinstruction_entry 661b, 662b, X86_FEATURE_SMAP, 3, 3 ;       \
-       .popsection
-
-#define ASM_STAC                                                       \
-       661: ASM_NOP3 ;                                                 \
-       .pushsection .altinstr_replacement, "ax" ;                      \
-       662: __ASM_STAC ;                                               \
-       .popsection ;                                                   \
-       .pushsection .altinstructions, "a" ;                            \
-       altinstruction_entry 661b, 662b, X86_FEATURE_SMAP, 3, 3 ;       \
-       .popsection
+#define ASM_CLAC \
+       ALTERNATIVE "", __stringify(__ASM_CLAC), X86_FEATURE_SMAP
+
+#define ASM_STAC \
+       ALTERNATIVE "", __stringify(__ASM_STAC), X86_FEATURE_SMAP
 
 #else /* CONFIG_X86_SMAP */
 
 static __always_inline void clac(void)
 {
        /* Note: a barrier is implicit in alternative() */
-       alternative(ASM_NOP3, __stringify(__ASM_CLAC), X86_FEATURE_SMAP);
+       alternative("", __stringify(__ASM_CLAC), X86_FEATURE_SMAP);
 }
 
 static __always_inline void stac(void)
 {
        /* Note: a barrier is implicit in alternative() */
-       alternative(ASM_NOP3, __stringify(__ASM_STAC), X86_FEATURE_SMAP);
+       alternative("", __stringify(__ASM_STAC), X86_FEATURE_SMAP);
 }
 
 /* These macros can be used in asm() statements */
 #define ASM_CLAC \
-       ALTERNATIVE(ASM_NOP3, __stringify(__ASM_CLAC), X86_FEATURE_SMAP)
+       ALTERNATIVE("", __stringify(__ASM_CLAC), X86_FEATURE_SMAP)
 #define ASM_STAC \
-       ALTERNATIVE(ASM_NOP3, __stringify(__ASM_STAC), X86_FEATURE_SMAP)
+       ALTERNATIVE("", __stringify(__ASM_STAC), X86_FEATURE_SMAP)
 
 #else /* CONFIG_X86_SMAP */
 
index 703130f469ecf71978b9d67bf0fb50da9b31cbc9..af397cc98d05bba029f67e0dcb5e49783107d53f 100644 (file)
@@ -52,10 +52,25 @@ static int __init setup_noreplace_paravirt(char *str)
 __setup("noreplace-paravirt", setup_noreplace_paravirt);
 #endif
 
-#define DPRINTK(fmt, ...)                              \
-do {                                                   \
-       if (debug_alternative)                          \
-               printk(KERN_DEBUG fmt, ##__VA_ARGS__);  \
+#define DPRINTK(fmt, args...)                                          \
+do {                                                                   \
+       if (debug_alternative)                                          \
+               printk(KERN_DEBUG "%s: " fmt "\n", __func__, ##args);   \
+} while (0)
+
+#define DUMP_BYTES(buf, len, fmt, args...)                             \
+do {                                                                   \
+       if (unlikely(debug_alternative)) {                              \
+               int j;                                                  \
+                                                                       \
+               if (!(len))                                             \
+                       break;                                          \
+                                                                       \
+               printk(KERN_DEBUG fmt, ##args);                         \
+               for (j = 0; j < (len) - 1; j++)                         \
+                       printk(KERN_CONT "%02hhx ", buf[j]);            \
+               printk(KERN_CONT "%02hhx\n", buf[j]);                   \
+       }                                                               \
 } while (0)
 
 /*
@@ -243,12 +258,86 @@ extern struct alt_instr __alt_instructions[], __alt_instructions_end[];
 extern s32 __smp_locks[], __smp_locks_end[];
 void *text_poke_early(void *addr, const void *opcode, size_t len);
 
-/* Replace instructions with better alternatives for this CPU type.
-   This runs before SMP is initialized to avoid SMP problems with
-   self modifying code. This implies that asymmetric systems where
-   APs have less capabilities than the boot processor are not handled.
-   Tough. Make sure you disable such features by hand. */
+/*
+ * Are we looking at a near JMP with a 1 or 4-byte displacement.
+ */
+static inline bool is_jmp(const u8 opcode)
+{
+       return opcode == 0xeb || opcode == 0xe9;
+}
+
+static void __init_or_module
+recompute_jump(struct alt_instr *a, u8 *orig_insn, u8 *repl_insn, u8 *insnbuf)
+{
+       u8 *next_rip, *tgt_rip;
+       s32 n_dspl, o_dspl;
+       int repl_len;
+
+       if (a->replacementlen != 5)
+               return;
+
+       o_dspl = *(s32 *)(insnbuf + 1);
+
+       /* next_rip of the replacement JMP */
+       next_rip = repl_insn + a->replacementlen;
+       /* target rip of the replacement JMP */
+       tgt_rip  = next_rip + o_dspl;
+       n_dspl = tgt_rip - orig_insn;
+
+       DPRINTK("target RIP: %p, new_displ: 0x%x", tgt_rip, n_dspl);
+
+       if (tgt_rip - orig_insn >= 0) {
+               if (n_dspl - 2 <= 127)
+                       goto two_byte_jmp;
+               else
+                       goto five_byte_jmp;
+       /* negative offset */
+       } else {
+               if (((n_dspl - 2) & 0xff) == (n_dspl - 2))
+                       goto two_byte_jmp;
+               else
+                       goto five_byte_jmp;
+       }
+
+two_byte_jmp:
+       n_dspl -= 2;
+
+       insnbuf[0] = 0xeb;
+       insnbuf[1] = (s8)n_dspl;
+       add_nops(insnbuf + 2, 3);
+
+       repl_len = 2;
+       goto done;
+
+five_byte_jmp:
+       n_dspl -= 5;
 
+       insnbuf[0] = 0xe9;
+       *(s32 *)&insnbuf[1] = n_dspl;
+
+       repl_len = 5;
+
+done:
+
+       DPRINTK("final displ: 0x%08x, JMP 0x%lx",
+               n_dspl, (unsigned long)orig_insn + n_dspl + repl_len);
+}
+
+static void __init_or_module optimize_nops(struct alt_instr *a, u8 *instr)
+{
+       add_nops(instr + (a->instrlen - a->padlen), a->padlen);
+
+       DUMP_BYTES(instr, a->instrlen, "%p: [%d:%d) optimized NOPs: ",
+                  instr, a->instrlen - a->padlen, a->padlen);
+}
+
+/*
+ * Replace instructions with better alternatives for this CPU type. This runs
+ * before SMP is initialized to avoid SMP problems with self modifying code.
+ * This implies that asymmetric systems where APs have less capabilities than
+ * the boot processor are not handled. Tough. Make sure you disable such
+ * features by hand.
+ */
 void __init_or_module apply_alternatives(struct alt_instr *start,
                                         struct alt_instr *end)
 {
@@ -256,10 +345,10 @@ void __init_or_module apply_alternatives(struct alt_instr *start,
        u8 *instr, *replacement;
        u8 insnbuf[MAX_PATCH_LEN];
 
-       DPRINTK("%s: alt table %p -> %p\n", __func__, start, end);
+       DPRINTK("alt table %p -> %p", start, end);
        /*
         * The scan order should be from start to end. A later scanned
-        * alternative code can overwrite a previous scanned alternative code.
+        * alternative code can overwrite previously scanned alternative code.
         * Some kernel functions (e.g. memcpy, memset, etc) use this order to
         * patch code.
         *
@@ -267,29 +356,54 @@ void __init_or_module apply_alternatives(struct alt_instr *start,
         * order.
         */
        for (a = start; a < end; a++) {
+               int insnbuf_sz = 0;
+
                instr = (u8 *)&a->instr_offset + a->instr_offset;
                replacement = (u8 *)&a->repl_offset + a->repl_offset;
-               BUG_ON(a->replacementlen > a->instrlen);
                BUG_ON(a->instrlen > sizeof(insnbuf));
                BUG_ON(a->cpuid >= (NCAPINTS + NBUGINTS) * 32);
-               if (!boot_cpu_has(a->cpuid))
+               if (!boot_cpu_has(a->cpuid)) {
+                       if (a->padlen > 1)
+                               optimize_nops(a, instr);
+
                        continue;
+               }
+
+               DPRINTK("feat: %d*32+%d, old: (%p, len: %d), repl: (%p, len: %d)",
+                       a->cpuid >> 5,
+                       a->cpuid & 0x1f,
+                       instr, a->instrlen,
+                       replacement, a->replacementlen);
+
+               DUMP_BYTES(instr, a->instrlen, "%p: old_insn: ", instr);
+               DUMP_BYTES(replacement, a->replacementlen, "%p: rpl_insn: ", replacement);
 
                memcpy(insnbuf, replacement, a->replacementlen);
+               insnbuf_sz = a->replacementlen;
 
                /* 0xe8 is a relative jump; fix the offset. */
-               if (*insnbuf == 0xe8 && a->replacementlen == 5)
-                   *(s32 *)(insnbuf + 1) += replacement - instr;
+               if (*insnbuf == 0xe8 && a->replacementlen == 5) {
+                       *(s32 *)(insnbuf + 1) += replacement - instr;
+                       DPRINTK("Fix CALL offset: 0x%x, CALL 0x%lx",
+                               *(s32 *)(insnbuf + 1),
+                               (unsigned long)instr + *(s32 *)(insnbuf + 1) + 5);
+               }
+
+               if (a->replacementlen && is_jmp(replacement[0]))
+                       recompute_jump(a, instr, replacement, insnbuf);
 
-               add_nops(insnbuf + a->replacementlen,
-                        a->instrlen - a->replacementlen);
+               if (a->instrlen > a->replacementlen) {
+                       add_nops(insnbuf + a->replacementlen,
+                                a->instrlen - a->replacementlen);
+                       insnbuf_sz += a->instrlen - a->replacementlen;
+               }
+               DUMP_BYTES(insnbuf, insnbuf_sz, "%p: final_insn: ", instr);
 
-               text_poke_early(instr, insnbuf, a->instrlen);
+               text_poke_early(instr, insnbuf, insnbuf_sz);
        }
 }
 
 #ifdef CONFIG_SMP
-
 static void alternatives_smp_lock(const s32 *start, const s32 *end,
                                  u8 *text, u8 *text_end)
 {
@@ -371,8 +485,8 @@ void __init_or_module alternatives_smp_module_add(struct module *mod,
        smp->locks_end  = locks_end;
        smp->text       = text;
        smp->text_end   = text_end;
-       DPRINTK("%s: locks %p -> %p, text %p -> %p, name %s\n",
-               __func__, smp->locks, smp->locks_end,
+       DPRINTK("locks %p -> %p, text %p -> %p, name %s\n",
+               smp->locks, smp->locks_end,
                smp->text, smp->text_end, smp->name);
 
        list_add_tail(&smp->next, &smp_alt_modules);
@@ -440,7 +554,7 @@ int alternatives_text_reserved(void *start, void *end)
 
        return 0;
 }
-#endif
+#endif /* CONFIG_SMP */
 
 #ifdef CONFIG_PARAVIRT
 void __init_or_module apply_paravirt(struct paravirt_patch_site *start,
index a220239cea65ca99b3c52ebbfaed6ce973f0dec1..dd9e50500297ef596e549c17db2cbd5a630e9756 100644 (file)
@@ -711,6 +711,11 @@ static void init_amd(struct cpuinfo_x86 *c)
                set_cpu_bug(c, X86_BUG_AMD_APIC_C1E);
 
        rdmsr_safe(MSR_AMD64_PATCH_LEVEL, &c->microcode, &dummy);
+
+       /* 3DNow or LM implies PREFETCHW */
+       if (!cpu_has(c, X86_FEATURE_3DNOWPREFETCH))
+               if (cpu_has(c, X86_FEATURE_3DNOW) || cpu_has(c, X86_FEATURE_LM))
+                       set_cpu_cap(c, X86_FEATURE_3DNOWPREFETCH);
 }
 
 #ifdef CONFIG_X86_32
index 31e2d5bf3e38887ca06402bff6c647b9aa9a3c5c..7e0323cc0b7def209546af2dd99d1b26f7d2b53d 100644 (file)
@@ -816,15 +816,9 @@ ENTRY(simd_coprocessor_error)
        pushl_cfi $0
 #ifdef CONFIG_X86_INVD_BUG
        /* AMD 486 bug: invd from userspace calls exception 19 instead of #GP */
-661:   pushl_cfi $do_general_protection
-662:
-.section .altinstructions,"a"
-       altinstruction_entry 661b, 663f, X86_FEATURE_XMM, 662b-661b, 664f-663f
-.previous
-.section .altinstr_replacement,"ax"
-663:   pushl $do_simd_coprocessor_error
-664:
-.previous
+       ALTERNATIVE "pushl_cfi $do_general_protection", \
+                   "pushl $do_simd_coprocessor_error", \
+                   X86_FEATURE_XMM
 #else
        pushl_cfi $do_simd_coprocessor_error
 #endif
index f2145cfa12a66830e834718340c4cc88e64a731a..e67e579c93bdf7f3d0737565ea106edeeefb3b6d 100644 (file)
@@ -1,31 +1,35 @@
 #include <linux/linkage.h>
 #include <asm/dwarf2.h>
+#include <asm/cpufeature.h>
 #include <asm/alternative-asm.h>
 
 /*
- * Zero a page.        
- * rdi page
- */                    
-ENTRY(clear_page_c)
+ * Most CPUs support enhanced REP MOVSB/STOSB instructions. It is
+ * recommended to use this when possible and we do use them by default.
+ * If enhanced REP MOVSB/STOSB is not available, try to use fast string.
+ * Otherwise, use original.
+ */
+
+/*
+ * Zero a page.
+ * %rdi        - page
+ */
+ENTRY(clear_page)
        CFI_STARTPROC
+
+       ALTERNATIVE_2 "jmp clear_page_orig", "", X86_FEATURE_REP_GOOD, \
+                     "jmp clear_page_c_e", X86_FEATURE_ERMS
+
        movl $4096/8,%ecx
        xorl %eax,%eax
        rep stosq
        ret
        CFI_ENDPROC
-ENDPROC(clear_page_c)
+ENDPROC(clear_page)
 
-ENTRY(clear_page_c_e)
+ENTRY(clear_page_orig)
        CFI_STARTPROC
-       movl $4096,%ecx
-       xorl %eax,%eax
-       rep stosb
-       ret
-       CFI_ENDPROC
-ENDPROC(clear_page_c_e)
 
-ENTRY(clear_page)
-       CFI_STARTPROC
        xorl   %eax,%eax
        movl   $4096/64,%ecx
        .p2align 4
@@ -45,29 +49,13 @@ ENTRY(clear_page)
        nop
        ret
        CFI_ENDPROC
-.Lclear_page_end:
-ENDPROC(clear_page)
-
-       /*
-        * Some CPUs support enhanced REP MOVSB/STOSB instructions.
-        * It is recommended to use this when possible.
-        * If enhanced REP MOVSB/STOSB is not available, try to use fast string.
-        * Otherwise, use original function.
-        *
-        */
+ENDPROC(clear_page_orig)
 
-#include <asm/cpufeature.h>
-
-       .section .altinstr_replacement,"ax"
-1:     .byte 0xeb                                      /* jmp <disp8> */
-       .byte (clear_page_c - clear_page) - (2f - 1b)   /* offset */
-2:     .byte 0xeb                                      /* jmp <disp8> */
-       .byte (clear_page_c_e - clear_page) - (3f - 2b) /* offset */
-3:
-       .previous
-       .section .altinstructions,"a"
-       altinstruction_entry clear_page,1b,X86_FEATURE_REP_GOOD,\
-                            .Lclear_page_end-clear_page, 2b-1b
-       altinstruction_entry clear_page,2b,X86_FEATURE_ERMS,   \
-                            .Lclear_page_end-clear_page,3b-2b
-       .previous
+ENTRY(clear_page_c_e)
+       CFI_STARTPROC
+       movl $4096,%ecx
+       xorl %eax,%eax
+       rep stosb
+       ret
+       CFI_ENDPROC
+ENDPROC(clear_page_c_e)
index 176cca67212b7072687f42450ef56018c03ebaaa..8239dbcbf98455a99a125953392114f07a09aa74 100644 (file)
@@ -2,23 +2,26 @@
 
 #include <linux/linkage.h>
 #include <asm/dwarf2.h>
+#include <asm/cpufeature.h>
 #include <asm/alternative-asm.h>
 
+/*
+ * Some CPUs run faster using the string copy instructions (sane microcode).
+ * It is also a lot simpler. Use this when possible. But, don't use streaming
+ * copy unless the CPU indicates X86_FEATURE_REP_GOOD. Could vary the
+ * prefetch distance based on SMP/UP.
+ */
        ALIGN
-copy_page_rep:
+ENTRY(copy_page)
        CFI_STARTPROC
+       ALTERNATIVE "jmp copy_page_regs", "", X86_FEATURE_REP_GOOD
        movl    $4096/8, %ecx
        rep     movsq
        ret
        CFI_ENDPROC
-ENDPROC(copy_page_rep)
-
-/*
- *  Don't use streaming copy unless the CPU indicates X86_FEATURE_REP_GOOD.
- *  Could vary the prefetch distance based on SMP/UP.
-*/
+ENDPROC(copy_page)
 
-ENTRY(copy_page)
+ENTRY(copy_page_regs)
        CFI_STARTPROC
        subq    $2*8,   %rsp
        CFI_ADJUST_CFA_OFFSET 2*8
@@ -90,21 +93,5 @@ ENTRY(copy_page)
        addq    $2*8, %rsp
        CFI_ADJUST_CFA_OFFSET -2*8
        ret
-.Lcopy_page_end:
        CFI_ENDPROC
-ENDPROC(copy_page)
-
-       /* Some CPUs run faster using the string copy instructions.
-          It is also a lot simpler. Use this when possible */
-
-#include <asm/cpufeature.h>
-
-       .section .altinstr_replacement,"ax"
-1:     .byte 0xeb                                      /* jmp <disp8> */
-       .byte (copy_page_rep - copy_page) - (2f - 1b)   /* offset */
-2:
-       .previous
-       .section .altinstructions,"a"
-       altinstruction_entry copy_page, 1b, X86_FEATURE_REP_GOOD,       \
-               .Lcopy_page_end-copy_page, 2b-1b
-       .previous
+ENDPROC(copy_page_regs)
index dee945d555941a078f40049b24b8b4731aba6f2c..fa997dfaef242fa9abdb28c20658a939caf72697 100644 (file)
@@ -8,9 +8,6 @@
 
 #include <linux/linkage.h>
 #include <asm/dwarf2.h>
-
-#define FIX_ALIGNMENT 1
-
 #include <asm/current.h>
 #include <asm/asm-offsets.h>
 #include <asm/thread_info.h>
 #include <asm/asm.h>
 #include <asm/smap.h>
 
-/*
- * By placing feature2 after feature1 in altinstructions section, we logically
- * implement:
- * If CPU has feature2, jmp to alt2 is used
- * else if CPU has feature1, jmp to alt1 is used
- * else jmp to orig is used.
- */
-       .macro ALTERNATIVE_JUMP feature1,feature2,orig,alt1,alt2
-0:
-       .byte 0xe9      /* 32bit jump */
-       .long \orig-1f  /* by default jump to orig */
-1:
-       .section .altinstr_replacement,"ax"
-2:     .byte 0xe9                      /* near jump with 32bit immediate */
-       .long \alt1-1b /* offset */   /* or alternatively to alt1 */
-3:     .byte 0xe9                      /* near jump with 32bit immediate */
-       .long \alt2-1b /* offset */   /* or alternatively to alt2 */
-       .previous
-
-       .section .altinstructions,"a"
-       altinstruction_entry 0b,2b,\feature1,5,5
-       altinstruction_entry 0b,3b,\feature2,5,5
-       .previous
-       .endm
-
        .macro ALIGN_DESTINATION
-#ifdef FIX_ALIGNMENT
        /* check for bad alignment of destination */
        movl %edi,%ecx
        andl $7,%ecx
@@ -67,7 +38,6 @@
 
        _ASM_EXTABLE(100b,103b)
        _ASM_EXTABLE(101b,103b)
-#endif
        .endm
 
 /* Standard copy_to_user with segment limit checking */
@@ -79,9 +49,11 @@ ENTRY(_copy_to_user)
        jc bad_to_user
        cmpq TI_addr_limit(%rax),%rcx
        ja bad_to_user
-       ALTERNATIVE_JUMP X86_FEATURE_REP_GOOD,X86_FEATURE_ERMS, \
-               copy_user_generic_unrolled,copy_user_generic_string,    \
-               copy_user_enhanced_fast_string
+       ALTERNATIVE_2 "jmp copy_user_generic_unrolled",         \
+                     "jmp copy_user_generic_string",           \
+                     X86_FEATURE_REP_GOOD,                     \
+                     "jmp copy_user_enhanced_fast_string",     \
+                     X86_FEATURE_ERMS
        CFI_ENDPROC
 ENDPROC(_copy_to_user)
 
@@ -94,9 +66,11 @@ ENTRY(_copy_from_user)
        jc bad_from_user
        cmpq TI_addr_limit(%rax),%rcx
        ja bad_from_user
-       ALTERNATIVE_JUMP X86_FEATURE_REP_GOOD,X86_FEATURE_ERMS, \
-               copy_user_generic_unrolled,copy_user_generic_string,    \
-               copy_user_enhanced_fast_string
+       ALTERNATIVE_2 "jmp copy_user_generic_unrolled",         \
+                     "jmp copy_user_generic_string",           \
+                     X86_FEATURE_REP_GOOD,                     \
+                     "jmp copy_user_enhanced_fast_string",     \
+                     X86_FEATURE_ERMS
        CFI_ENDPROC
 ENDPROC(_copy_from_user)
 
index 89b53c9968e7d50c1dd129dd92272f1a47edefe5..b046664f5a1ccf37f3c94b2ed7d8d5b3298e89bf 100644 (file)
@@ -1,11 +1,19 @@
 /* Copyright 2002 Andi Kleen */
 
 #include <linux/linkage.h>
-
 #include <asm/cpufeature.h>
 #include <asm/dwarf2.h>
 #include <asm/alternative-asm.h>
 
+/*
+ * We build a jump to memcpy_orig by default which gets NOPped out on
+ * the majority of x86 CPUs which set REP_GOOD. In addition, CPUs which
+ * have the enhanced REP MOVSB/STOSB feature (ERMS), change those NOPs
+ * to a jmp to memcpy_erms which does the REP; MOVSB mem copy.
+ */
+
+.weak memcpy
+
 /*
  * memcpy - Copy a memory block.
  *
  * Output:
  * rax original destination
  */
+ENTRY(__memcpy)
+ENTRY(memcpy)
+       ALTERNATIVE_2 "jmp memcpy_orig", "", X86_FEATURE_REP_GOOD, \
+                     "jmp memcpy_erms", X86_FEATURE_ERMS
 
-/*
- * memcpy_c() - fast string ops (REP MOVSQ) based variant.
- *
- * This gets patched over the unrolled variant (below) via the
- * alternative instructions framework:
- */
-       .section .altinstr_replacement, "ax", @progbits
-.Lmemcpy_c:
        movq %rdi, %rax
        movq %rdx, %rcx
        shrq $3, %rcx
        movl %edx, %ecx
        rep movsb
        ret
-.Lmemcpy_e:
-       .previous
+ENDPROC(memcpy)
+ENDPROC(__memcpy)
 
 /*
- * memcpy_c_e() - enhanced fast string memcpy. This is faster and simpler than
- * memcpy_c. Use memcpy_c_e when possible.
- *
- * This gets patched over the unrolled variant (below) via the
- * alternative instructions framework:
+ * memcpy_erms() - enhanced fast string memcpy. This is faster and
+ * simpler than memcpy. Use memcpy_erms when possible.
  */
-       .section .altinstr_replacement, "ax", @progbits
-.Lmemcpy_c_e:
+ENTRY(memcpy_erms)
        movq %rdi, %rax
        movq %rdx, %rcx
        rep movsb
        ret
-.Lmemcpy_e_e:
-       .previous
-
-.weak memcpy
+ENDPROC(memcpy_erms)
 
-ENTRY(__memcpy)
-ENTRY(memcpy)
+ENTRY(memcpy_orig)
        CFI_STARTPROC
        movq %rdi, %rax
 
@@ -183,26 +179,4 @@ ENTRY(memcpy)
 .Lend:
        retq
        CFI_ENDPROC
-ENDPROC(memcpy)
-ENDPROC(__memcpy)
-
-       /*
-        * Some CPUs are adding enhanced REP MOVSB/STOSB feature
-        * If the feature is supported, memcpy_c_e() is the first choice.
-        * If enhanced rep movsb copy is not available, use fast string copy
-        * memcpy_c() when possible. This is faster and code is simpler than
-        * original memcpy().
-        * Otherwise, original memcpy() is used.
-        * In .altinstructions section, ERMS feature is placed after REG_GOOD
-         * feature to implement the right patch order.
-        *
-        * Replace only beginning, memcpy is used to apply alternatives,
-        * so it is silly to overwrite itself with nops - reboot is the
-        * only outcome...
-        */
-       .section .altinstructions, "a"
-       altinstruction_entry __memcpy,.Lmemcpy_c,X86_FEATURE_REP_GOOD,\
-                            .Lmemcpy_e-.Lmemcpy_c,.Lmemcpy_e-.Lmemcpy_c
-       altinstruction_entry __memcpy,.Lmemcpy_c_e,X86_FEATURE_ERMS, \
-                            .Lmemcpy_e_e-.Lmemcpy_c_e,.Lmemcpy_e_e-.Lmemcpy_c_e
-       .previous
+ENDPROC(memcpy_orig)
index 9c4b530575da6e9b70cb8505f1e4b391b6fdff02..0f8a0d0331b91715238f01e8f54ce74525fb0cc4 100644 (file)
@@ -5,7 +5,6 @@
  * This assembly file is re-written from memmove_64.c file.
  *     - Copyright 2011 Fenghua Yu <fenghua.yu@intel.com>
  */
-#define _STRING_C
 #include <linux/linkage.h>
 #include <asm/dwarf2.h>
 #include <asm/cpufeature.h>
@@ -44,6 +43,8 @@ ENTRY(__memmove)
        jg 2f
 
 .Lmemmove_begin_forward:
+       ALTERNATIVE "", "movq %rdx, %rcx; rep movsb; retq", X86_FEATURE_ERMS
+
        /*
         * movsq instruction have many startup latency
         * so we handle small size by general register.
@@ -207,21 +208,5 @@ ENTRY(__memmove)
 13:
        retq
        CFI_ENDPROC
-
-       .section .altinstr_replacement,"ax"
-.Lmemmove_begin_forward_efs:
-       /* Forward moving data. */
-       movq %rdx, %rcx
-       rep movsb
-       retq
-.Lmemmove_end_forward_efs:
-       .previous
-
-       .section .altinstructions,"a"
-       altinstruction_entry .Lmemmove_begin_forward,           \
-               .Lmemmove_begin_forward_efs,X86_FEATURE_ERMS,   \
-               .Lmemmove_end_forward-.Lmemmove_begin_forward,  \
-               .Lmemmove_end_forward_efs-.Lmemmove_begin_forward_efs
-       .previous
 ENDPROC(__memmove)
 ENDPROC(memmove)
index 6f44935c6a606a1607022e889cc60ea151e54b65..93118fb239762ba78efd754d20a756aed0221411 100644 (file)
@@ -5,19 +5,30 @@
 #include <asm/cpufeature.h>
 #include <asm/alternative-asm.h>
 
+.weak memset
+
 /*
  * ISO C memset - set a memory block to a byte value. This function uses fast
  * string to get better performance than the original function. The code is
  * simpler and shorter than the orignal function as well.
- *     
+ *
  * rdi   destination
- * rsi   value (char) 
- * rdx   count (bytes) 
- * 
+ * rsi   value (char)
+ * rdx   count (bytes)
+ *
  * rax   original destination
- */    
-       .section .altinstr_replacement, "ax", @progbits
-.Lmemset_c:
+ */
+ENTRY(memset)
+ENTRY(__memset)
+       /*
+        * Some CPUs support enhanced REP MOVSB/STOSB feature. It is recommended
+        * to use it when possible. If not available, use fast string instructions.
+        *
+        * Otherwise, use original memset function.
+        */
+       ALTERNATIVE_2 "jmp memset_orig", "", X86_FEATURE_REP_GOOD, \
+                     "jmp memset_erms", X86_FEATURE_ERMS
+
        movq %rdi,%r9
        movq %rdx,%rcx
        andl $7,%edx
@@ -31,8 +42,8 @@
        rep stosb
        movq %r9,%rax
        ret
-.Lmemset_e:
-       .previous
+ENDPROC(memset)
+ENDPROC(__memset)
 
 /*
  * ISO C memset - set a memory block to a byte value. This function uses
  *
  * rax   original destination
  */
-       .section .altinstr_replacement, "ax", @progbits
-.Lmemset_c_e:
+ENTRY(memset_erms)
        movq %rdi,%r9
        movb %sil,%al
        movq %rdx,%rcx
        rep stosb
        movq %r9,%rax
        ret
-.Lmemset_e_e:
-       .previous
-
-.weak memset
+ENDPROC(memset_erms)
 
-ENTRY(memset)
-ENTRY(__memset)
+ENTRY(memset_orig)
        CFI_STARTPROC
        movq %rdi,%r10
 
@@ -134,23 +140,4 @@ ENTRY(__memset)
        jmp .Lafter_bad_alignment
 .Lfinal:
        CFI_ENDPROC
-ENDPROC(memset)
-ENDPROC(__memset)
-
-       /* Some CPUs support enhanced REP MOVSB/STOSB feature.
-        * It is recommended to use this when possible.
-        *
-        * If enhanced REP MOVSB/STOSB feature is not available, use fast string
-        * instructions.
-        *
-        * Otherwise, use original memset function.
-        *
-        * In .altinstructions section, ERMS feature is placed after REG_GOOD
-         * feature to implement the right patch order.
-        */
-       .section .altinstructions,"a"
-       altinstruction_entry __memset,.Lmemset_c,X86_FEATURE_REP_GOOD,\
-                            .Lfinal-__memset,.Lmemset_e-.Lmemset_c
-       altinstruction_entry __memset,.Lmemset_c_e,X86_FEATURE_ERMS, \
-                            .Lfinal-__memset,.Lmemset_e_e-.Lmemset_c_e
-       .previous
+ENDPROC(memset_orig)
index 2d7d9a1f5b531ee399b7a0e6a1f9eb4a190fd820..8ffd2146fa6a0367be6710913b5c4161f1a8859b 100644 (file)
@@ -64,8 +64,8 @@
  */
 static inline void rdtsc_barrier(void)
 {
-       alternative(ASM_NOP3, "mfence", X86_FEATURE_MFENCE_RDTSC);
-       alternative(ASM_NOP3, "lfence", X86_FEATURE_LFENCE_RDTSC);
+       alternative_2("", "mfence", X86_FEATURE_MFENCE_RDTSC,
+                         "lfence", X86_FEATURE_LFENCE_RDTSC);
 }
 
 #endif
index d66ab799b35fd5cab83e5486368e40c39c2927ee..8c0c1a2770c8fe01bb479c97847540befe164011 100644 (file)
@@ -1,12 +1,12 @@
 
-MEMCPY_FN(__memcpy,
+MEMCPY_FN(memcpy_orig,
        "x86-64-unrolled",
        "unrolled memcpy() in arch/x86/lib/memcpy_64.S")
 
-MEMCPY_FN(memcpy_c,
+MEMCPY_FN(__memcpy,
        "x86-64-movsq",
        "movsq-based memcpy() in arch/x86/lib/memcpy_64.S")
 
-MEMCPY_FN(memcpy_c_e,
+MEMCPY_FN(memcpy_erms,
        "x86-64-movsb",
        "movsb-based memcpy() in arch/x86/lib/memcpy_64.S")
index fcd9cf00600a970677dd331cfc7f71de0d67d00e..e4c2c30143b95133913a2bc3569b2871b79aa75c 100644 (file)
@@ -1,8 +1,6 @@
 #define memcpy MEMCPY /* don't hide glibc's memcpy() */
 #define altinstr_replacement text
 #define globl p2align 4; .globl
-#define Lmemcpy_c globl memcpy_c; memcpy_c
-#define Lmemcpy_c_e globl memcpy_c_e; memcpy_c_e
 #include "../../../arch/x86/lib/memcpy_64.S"
 /*
  * We need to provide note.GNU-stack section, saying that we want
index db1d3a29d97fec67c47a4dd84eca2f7779ec1865..d3dfb7936dcdfd688d20ae680b3b0c0009cdeb55 100644 (file)
@@ -36,7 +36,7 @@ static const struct option options[] = {
                    "Specify length of memory to copy. "
                    "Available units: B, KB, MB, GB and TB (upper and lower)"),
        OPT_STRING('r', "routine", &routine, "default",
-                   "Specify routine to copy"),
+                   "Specify routine to copy, \"all\" runs all available routines"),
        OPT_INTEGER('i', "iterations", &iterations,
                    "repeat memcpy() invocation this number of times"),
        OPT_BOOLEAN('c', "cycle", &use_cycle,
@@ -135,55 +135,16 @@ struct bench_mem_info {
        const char *const *usage;
 };
 
-static int bench_mem_common(int argc, const char **argv,
-                    const char *prefix __maybe_unused,
-                    struct bench_mem_info *info)
+static void __bench_mem_routine(struct bench_mem_info *info, int r_idx, size_t len, double totallen)
 {
-       int i;
-       size_t len;
-       double totallen;
+       const struct routine *r = &info->routines[r_idx];
        double result_bps[2];
        u64 result_cycle[2];
 
-       argc = parse_options(argc, argv, options,
-                            info->usage, 0);
-
-       if (no_prefault && only_prefault) {
-               fprintf(stderr, "Invalid options: -o and -n are mutually exclusive\n");
-               return 1;
-       }
-
-       if (use_cycle)
-               init_cycle();
-
-       len = (size_t)perf_atoll((char *)length_str);
-       totallen = (double)len * iterations;
-
        result_cycle[0] = result_cycle[1] = 0ULL;
        result_bps[0] = result_bps[1] = 0.0;
 
-       if ((s64)len <= 0) {
-               fprintf(stderr, "Invalid length:%s\n", length_str);
-               return 1;
-       }
-
-       /* same to without specifying either of prefault and no-prefault */
-       if (only_prefault && no_prefault)
-               only_prefault = no_prefault = false;
-
-       for (i = 0; info->routines[i].name; i++) {
-               if (!strcmp(info->routines[i].name, routine))
-                       break;
-       }
-       if (!info->routines[i].name) {
-               printf("Unknown routine:%s\n", routine);
-               printf("Available routines...\n");
-               for (i = 0; info->routines[i].name; i++) {
-                       printf("\t%s ... %s\n",
-                              info->routines[i].name, info->routines[i].desc);
-               }
-               return 1;
-       }
+       printf("Routine %s (%s)\n", r->name, r->desc);
 
        if (bench_format == BENCH_FORMAT_DEFAULT)
                printf("# Copying %s Bytes ...\n\n", length_str);
@@ -191,28 +152,17 @@ static int bench_mem_common(int argc, const char **argv,
        if (!only_prefault && !no_prefault) {
                /* show both of results */
                if (use_cycle) {
-                       result_cycle[0] =
-                               info->do_cycle(&info->routines[i], len, false);
-                       result_cycle[1] =
-                               info->do_cycle(&info->routines[i], len, true);
+                       result_cycle[0] = info->do_cycle(r, len, false);
+                       result_cycle[1] = info->do_cycle(r, len, true);
                } else {
-                       result_bps[0] =
-                               info->do_gettimeofday(&info->routines[i],
-                                               len, false);
-                       result_bps[1] =
-                               info->do_gettimeofday(&info->routines[i],
-                                               len, true);
+                       result_bps[0]   = info->do_gettimeofday(r, len, false);
+                       result_bps[1]   = info->do_gettimeofday(r, len, true);
                }
        } else {
-               if (use_cycle) {
-                       result_cycle[pf] =
-                               info->do_cycle(&info->routines[i],
-                                               len, only_prefault);
-               } else {
-                       result_bps[pf] =
-                               info->do_gettimeofday(&info->routines[i],
-                                               len, only_prefault);
-               }
+               if (use_cycle)
+                       result_cycle[pf] = info->do_cycle(r, len, only_prefault);
+               else
+                       result_bps[pf] = info->do_gettimeofday(r, len, only_prefault);
        }
 
        switch (bench_format) {
@@ -265,6 +215,60 @@ static int bench_mem_common(int argc, const char **argv,
                die("unknown format: %d\n", bench_format);
                break;
        }
+}
+
+static int bench_mem_common(int argc, const char **argv,
+                    const char *prefix __maybe_unused,
+                    struct bench_mem_info *info)
+{
+       int i;
+       size_t len;
+       double totallen;
+
+       argc = parse_options(argc, argv, options,
+                            info->usage, 0);
+
+       if (no_prefault && only_prefault) {
+               fprintf(stderr, "Invalid options: -o and -n are mutually exclusive\n");
+               return 1;
+       }
+
+       if (use_cycle)
+               init_cycle();
+
+       len = (size_t)perf_atoll((char *)length_str);
+       totallen = (double)len * iterations;
+
+       if ((s64)len <= 0) {
+               fprintf(stderr, "Invalid length:%s\n", length_str);
+               return 1;
+       }
+
+       /* same to without specifying either of prefault and no-prefault */
+       if (only_prefault && no_prefault)
+               only_prefault = no_prefault = false;
+
+       if (!strncmp(routine, "all", 3)) {
+               for (i = 0; info->routines[i].name; i++)
+                       __bench_mem_routine(info, i, len, totallen);
+               return 0;
+       }
+
+       for (i = 0; info->routines[i].name; i++) {
+               if (!strcmp(info->routines[i].name, routine))
+                       break;
+       }
+       if (!info->routines[i].name) {
+               printf("Unknown routine:%s\n", routine);
+               printf("Available routines...\n");
+               for (i = 0; info->routines[i].name; i++) {
+                       printf("\t%s ... %s\n",
+                              info->routines[i].name, info->routines[i].desc);
+               }
+               return 1;
+       }
+
+       __bench_mem_routine(info, i, len, totallen);
 
        return 0;
 }
index a71dff97c1f54e87aea5034398534bacc77d6a56..f02d028771d970da5d611c31d1b406179b450d45 100644 (file)
@@ -1,12 +1,12 @@
 
-MEMSET_FN(__memset,
+MEMSET_FN(memset_orig,
        "x86-64-unrolled",
        "unrolled memset() in arch/x86/lib/memset_64.S")
 
-MEMSET_FN(memset_c,
+MEMSET_FN(__memset,
        "x86-64-stosq",
        "movsq-based memset() in arch/x86/lib/memset_64.S")
 
-MEMSET_FN(memset_c_e,
+MEMSET_FN(memset_erms,
        "x86-64-stosb",
        "movsb-based memset() in arch/x86/lib/memset_64.S")
index 9e5af89ed13af64bed89e46e7a8df1e1b6a52298..de278784c866a3804040408454f38c4a4f7ca44e 100644 (file)
@@ -1,8 +1,6 @@
 #define memset MEMSET /* don't hide glibc's memset() */
 #define altinstr_replacement text
 #define globl p2align 4; .globl
-#define Lmemset_c globl memset_c; memset_c
-#define Lmemset_c_e globl memset_c_e; memset_c_e
 #include "../../../arch/x86/lib/memset_64.S"
 
 /*
index 6789d788d4947d31f890ca56660d8ef968901b82..3a3a0f16456ae3369cd73faff1d3af9e4e8d4180 100644 (file)
@@ -4,5 +4,6 @@
 /* Just disable it so we can build arch/x86/lib/memcpy_64.S for perf bench: */
 
 #define altinstruction_entry #
+#define ALTERNATIVE_2 #
 
 #endif