because this shows that you did think about these issues wrt. your
device.
-The query is performed via a call to dma_set_mask():
+The query is performed via a call to dma_set_mask_and_coherent():
- int dma_set_mask(struct device *dev, u64 mask);
+ int dma_set_mask_and_coherent(struct device *dev, u64 mask);
-The query for consistent allocations is performed via a call to
-dma_set_coherent_mask():
+which will query the mask for both streaming and coherent APIs together.
+If you have some special requirements, then the following two separate
+queries can be used instead:
- int dma_set_coherent_mask(struct device *dev, u64 mask);
+ The query for streaming mappings is performed via a call to
+ dma_set_mask():
+
+ int dma_set_mask(struct device *dev, u64 mask);
+
+ The query for consistent allocations is performed via a call
+ to dma_set_coherent_mask():
+
+ int dma_set_coherent_mask(struct device *dev, u64 mask);
Here, dev is a pointer to the device struct of your device, and mask
is a bit mask describing which bits of an address your device
The standard 32-bit addressing device would do something like this:
- if (dma_set_mask(dev, DMA_BIT_MASK(32))) {
+ if (dma_set_mask_and_coherent(dev, DMA_BIT_MASK(32))) {
printk(KERN_WARNING
"mydev: No suitable DMA available.\n");
goto ignore_this_device;
int using_dac, consistent_using_dac;
- if (!dma_set_mask(dev, DMA_BIT_MASK(64))) {
+ if (!dma_set_mask_and_coherent(dev, DMA_BIT_MASK(64))) {
using_dac = 1;
consistent_using_dac = 1;
- dma_set_coherent_mask(dev, DMA_BIT_MASK(64));
- } else if (!dma_set_mask(dev, DMA_BIT_MASK(32))) {
+ } else if (!dma_set_mask_and_coherent(dev, DMA_BIT_MASK(32))) {
using_dac = 0;
consistent_using_dac = 0;
- dma_set_coherent_mask(dev, DMA_BIT_MASK(32));
} else {
printk(KERN_WARNING
"mydev: No suitable DMA available.\n");
goto ignore_this_device;
}
-dma_set_coherent_mask() will always be able to set the same or a
-smaller mask as dma_set_mask(). However for the rare case that a
+The coherent coherent mask will always be able to set the same or a
+smaller mask as the streaming mask. However for the rare case that a
device driver only uses consistent allocations, one would have to
check the return value from dma_set_coherent_mask().
goto ignore_this_device;
}
-When dma_set_mask() is successful, and returns zero, the kernel saves
-away this mask you have provided. The kernel will use this
-information later when you make DMA mappings.
+When dma_set_mask() or dma_set_mask_and_coherent() is successful, and
+returns zero, the kernel saves away this mask you have provided. The
+kernel will use this information later when you make DMA mappings.
There is a case which we are aware of at this time, which is worth
mentioning in this documentation. If your device supports multiple
internal API for use by the platform than an external API for use by
driver writers.
+int
+dma_set_mask_and_coherent(struct device *dev, u64 mask)
+
+Checks to see if the mask is possible and updates the device
+streaming and coherent DMA mask parameters if it is.
+
+Returns: 0 if successful and a negative error if not.
+
int
dma_set_mask(struct device *dev, u64 mask)
select ARCH_HAS_ATOMIC64_DEC_IF_POSITIVE
select ARCH_HAS_TICK_BROADCAST if GENERIC_CLOCKEVENTS_BROADCAST
select ARCH_HAVE_CUSTOM_GPIO_H
+ select ARCH_USE_CMPXCHG_LOCKREF
select ARCH_WANT_IPC_PARSE_VERSION
select BUILDTIME_EXTABLE_SORT if MMU
select CLONE_BACKWARDS
select HAVE_MOD_ARCH_SPECIFIC if ARM_UNWIND
select HAVE_OPROFILE if (HAVE_PERF_EVENTS)
select HAVE_PERF_EVENTS
+ select HAVE_PERF_REGS
+ select HAVE_PERF_USER_STACK_DUMP
select HAVE_REGS_AND_STACK_ACCESS_API
select HAVE_SYSCALL_TRACEPOINTS
select HAVE_UID16
select GENERIC_CLOCKEVENTS
select HAVE_IDE
select ISA
- select NEED_MACH_GPIO_H
select NEED_MACH_MEMORY_H
select SPARSE_IRQ
help
for (multi-)cluster based systems, such as big.LITTLE based
systems.
+config BIG_LITTLE
+ bool "big.LITTLE support (Experimental)"
+ depends on CPU_V7 && SMP
+ select MCPM
+ help
+ This option enables support selections for the big.LITTLE
+ system architecture.
+
+config BL_SWITCHER
+ bool "big.LITTLE switcher support"
+ depends on BIG_LITTLE && MCPM && HOTPLUG_CPU
+ select CPU_PM
+ select ARM_CPU_SUSPEND
+ help
+ The big.LITTLE "switcher" provides the core functionality to
+ transparently handle transition between a cluster of A15's
+ and a cluster of A7's in a big.LITTLE system.
+
+config BL_SWITCHER_DUMMY_IF
+ tristate "Simple big.LITTLE switcher user interface"
+ depends on BL_SWITCHER && DEBUG_KERNEL
+ help
+ This is a simple and dummy char dev interface to control
+ the big.LITTLE switcher core code. It is meant for
+ debugging purposes only.
+
choice
prompt "Memory split"
default VMSPLIT_3G
options; the platform specific options are deprecated
and will be soon removed.
+ config DEBUG_LL_UART_EFM32
+ bool "Kernel low-level debugging via efm32 UART"
+ depends on ARCH_EFM32
+ help
+ Say Y here if you want the debug print routines to direct
+ their output to an UART or USART port on efm32 based
+ machines. Use the following addresses for DEBUG_UART_PHYS:
+
+ 0x4000c000 | USART0
+ 0x4000c400 | USART1
+ 0x4000c800 | USART2
+ 0x4000e000 | UART0
+ 0x4000e400 | UART1
+
config DEBUG_LL_UART_PL01X
bool "Kernel low-level debugging via ARM Ltd PL01x Primecell UART"
help
default "debug/8250.S" if DEBUG_LL_UART_8250 || DEBUG_UART_8250
default "debug/pl01x.S" if DEBUG_LL_UART_PL01X || DEBUG_UART_PL01X
default "debug/exynos.S" if DEBUG_EXYNOS_UART
+ default "debug/efm32.S" if DEBUG_LL_UART_EFM32
default "debug/icedcc.S" if DEBUG_ICEDCC
default "debug/imx.S" if DEBUG_IMX1_UART || \
DEBUG_IMX25_UART || \
default 0x20064000 if DEBUG_RK29_UART1 || DEBUG_RK3X_UART2
default 0x20068000 if DEBUG_RK29_UART2 || DEBUG_RK3X_UART3
default 0x20201000 if DEBUG_BCM2835
+ default 0x4000e400 if DEBUG_LL_UART_EFM32
default 0x40090000 if ARCH_LPC32XX
default 0x40100000 if DEBUG_PXA_UART1
default 0x42000000 if ARCH_GEMINI
default 0xfff36000 if DEBUG_HIGHBANK_UART
default 0xfffff700 if ARCH_IOP33X
depends on DEBUG_LL_UART_8250 || DEBUG_LL_UART_PL01X || \
+ DEBUG_LL_UART_EFM32 || \
DEBUG_UART_8250 || DEBUG_UART_PL01X
config DEBUG_UART_VIRT
AFLAGS_mcpm_head.o := -march=armv7-a
AFLAGS_vlock.o := -march=armv7-a
obj-$(CONFIG_TI_PRIV_EDMA) += edma.o
+obj-$(CONFIG_BL_SWITCHER) += bL_switcher.o
+obj-$(CONFIG_BL_SWITCHER_DUMMY_IF) += bL_switcher_dummy_if.o
--- /dev/null
+/*
+ * arch/arm/common/bL_switcher.c -- big.LITTLE cluster switcher core driver
+ *
+ * Created by: Nicolas Pitre, March 2012
+ * Copyright: (C) 2012-2013 Linaro Limited
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/atomic.h>
+#include <linux/init.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/sched.h>
+#include <linux/interrupt.h>
+#include <linux/cpu_pm.h>
+#include <linux/cpu.h>
+#include <linux/cpumask.h>
+#include <linux/kthread.h>
+#include <linux/wait.h>
+#include <linux/time.h>
+#include <linux/clockchips.h>
+#include <linux/hrtimer.h>
+#include <linux/tick.h>
+#include <linux/notifier.h>
+#include <linux/mm.h>
+#include <linux/mutex.h>
+#include <linux/smp.h>
+#include <linux/spinlock.h>
+#include <linux/string.h>
+#include <linux/sysfs.h>
+#include <linux/irqchip/arm-gic.h>
+#include <linux/moduleparam.h>
+
+#include <asm/smp_plat.h>
+#include <asm/cputype.h>
+#include <asm/suspend.h>
+#include <asm/mcpm.h>
+#include <asm/bL_switcher.h>
+
+#define CREATE_TRACE_POINTS
+#include <trace/events/power_cpu_migrate.h>
+
+
+/*
+ * Use our own MPIDR accessors as the generic ones in asm/cputype.h have
+ * __attribute_const__ and we don't want the compiler to assume any
+ * constness here as the value _does_ change along some code paths.
+ */
+
+static int read_mpidr(void)
+{
+ unsigned int id;
+ asm volatile ("mrc p15, 0, %0, c0, c0, 5" : "=r" (id));
+ return id & MPIDR_HWID_BITMASK;
+}
+
+/*
+ * Get a global nanosecond time stamp for tracing.
+ */
+static s64 get_ns(void)
+{
+ struct timespec ts;
+ getnstimeofday(&ts);
+ return timespec_to_ns(&ts);
+}
+
+/*
+ * bL switcher core code.
+ */
+
+static void bL_do_switch(void *_arg)
+{
+ unsigned ib_mpidr, ib_cpu, ib_cluster;
+ long volatile handshake, **handshake_ptr = _arg;
+
+ pr_debug("%s\n", __func__);
+
+ ib_mpidr = cpu_logical_map(smp_processor_id());
+ ib_cpu = MPIDR_AFFINITY_LEVEL(ib_mpidr, 0);
+ ib_cluster = MPIDR_AFFINITY_LEVEL(ib_mpidr, 1);
+
+ /* Advertise our handshake location */
+ if (handshake_ptr) {
+ handshake = 0;
+ *handshake_ptr = &handshake;
+ } else
+ handshake = -1;
+
+ /*
+ * Our state has been saved at this point. Let's release our
+ * inbound CPU.
+ */
+ mcpm_set_entry_vector(ib_cpu, ib_cluster, cpu_resume);
+ sev();
+
+ /*
+ * From this point, we must assume that our counterpart CPU might
+ * have taken over in its parallel world already, as if execution
+ * just returned from cpu_suspend(). It is therefore important to
+ * be very careful not to make any change the other guy is not
+ * expecting. This is why we need stack isolation.
+ *
+ * Fancy under cover tasks could be performed here. For now
+ * we have none.
+ */
+
+ /*
+ * Let's wait until our inbound is alive.
+ */
+ while (!handshake) {
+ wfe();
+ smp_mb();
+ }
+
+ /* Let's put ourself down. */
+ mcpm_cpu_power_down();
+
+ /* should never get here */
+ BUG();
+}
+
+/*
+ * Stack isolation. To ensure 'current' remains valid, we just use another
+ * piece of our thread's stack space which should be fairly lightly used.
+ * The selected area starts just above the thread_info structure located
+ * at the very bottom of the stack, aligned to a cache line, and indexed
+ * with the cluster number.
+ */
+#define STACK_SIZE 512
+extern void call_with_stack(void (*fn)(void *), void *arg, void *sp);
+static int bL_switchpoint(unsigned long _arg)
+{
+ unsigned int mpidr = read_mpidr();
+ unsigned int clusterid = MPIDR_AFFINITY_LEVEL(mpidr, 1);
+ void *stack = current_thread_info() + 1;
+ stack = PTR_ALIGN(stack, L1_CACHE_BYTES);
+ stack += clusterid * STACK_SIZE + STACK_SIZE;
+ call_with_stack(bL_do_switch, (void *)_arg, stack);
+ BUG();
+}
+
+/*
+ * Generic switcher interface
+ */
+
+static unsigned int bL_gic_id[MAX_CPUS_PER_CLUSTER][MAX_NR_CLUSTERS];
+static int bL_switcher_cpu_pairing[NR_CPUS];
+
+/*
+ * bL_switch_to - Switch to a specific cluster for the current CPU
+ * @new_cluster_id: the ID of the cluster to switch to.
+ *
+ * This function must be called on the CPU to be switched.
+ * Returns 0 on success, else a negative status code.
+ */
+static int bL_switch_to(unsigned int new_cluster_id)
+{
+ unsigned int mpidr, this_cpu, that_cpu;
+ unsigned int ob_mpidr, ob_cpu, ob_cluster, ib_mpidr, ib_cpu, ib_cluster;
+ struct completion inbound_alive;
+ struct tick_device *tdev;
+ enum clock_event_mode tdev_mode;
+ long volatile *handshake_ptr;
+ int ipi_nr, ret;
+
+ this_cpu = smp_processor_id();
+ ob_mpidr = read_mpidr();
+ ob_cpu = MPIDR_AFFINITY_LEVEL(ob_mpidr, 0);
+ ob_cluster = MPIDR_AFFINITY_LEVEL(ob_mpidr, 1);
+ BUG_ON(cpu_logical_map(this_cpu) != ob_mpidr);
+
+ if (new_cluster_id == ob_cluster)
+ return 0;
+
+ that_cpu = bL_switcher_cpu_pairing[this_cpu];
+ ib_mpidr = cpu_logical_map(that_cpu);
+ ib_cpu = MPIDR_AFFINITY_LEVEL(ib_mpidr, 0);
+ ib_cluster = MPIDR_AFFINITY_LEVEL(ib_mpidr, 1);
+
+ pr_debug("before switch: CPU %d MPIDR %#x -> %#x\n",
+ this_cpu, ob_mpidr, ib_mpidr);
+
+ this_cpu = smp_processor_id();
+
+ /* Close the gate for our entry vectors */
+ mcpm_set_entry_vector(ob_cpu, ob_cluster, NULL);
+ mcpm_set_entry_vector(ib_cpu, ib_cluster, NULL);
+
+ /* Install our "inbound alive" notifier. */
+ init_completion(&inbound_alive);
+ ipi_nr = register_ipi_completion(&inbound_alive, this_cpu);
+ ipi_nr |= ((1 << 16) << bL_gic_id[ob_cpu][ob_cluster]);
+ mcpm_set_early_poke(ib_cpu, ib_cluster, gic_get_sgir_physaddr(), ipi_nr);
+
+ /*
+ * Let's wake up the inbound CPU now in case it requires some delay
+ * to come online, but leave it gated in our entry vector code.
+ */
+ ret = mcpm_cpu_power_up(ib_cpu, ib_cluster);
+ if (ret) {
+ pr_err("%s: mcpm_cpu_power_up() returned %d\n", __func__, ret);
+ return ret;
+ }
+
+ /*
+ * Raise a SGI on the inbound CPU to make sure it doesn't stall
+ * in a possible WFI, such as in bL_power_down().
+ */
+ gic_send_sgi(bL_gic_id[ib_cpu][ib_cluster], 0);
+
+ /*
+ * Wait for the inbound to come up. This allows for other
+ * tasks to be scheduled in the mean time.
+ */
+ wait_for_completion(&inbound_alive);
+ mcpm_set_early_poke(ib_cpu, ib_cluster, 0, 0);
+
+ /*
+ * From this point we are entering the switch critical zone
+ * and can't take any interrupts anymore.
+ */
+ local_irq_disable();
+ local_fiq_disable();
+ trace_cpu_migrate_begin(get_ns(), ob_mpidr);
+
+ /* redirect GIC's SGIs to our counterpart */
+ gic_migrate_target(bL_gic_id[ib_cpu][ib_cluster]);
+
+ tdev = tick_get_device(this_cpu);
+ if (tdev && !cpumask_equal(tdev->evtdev->cpumask, cpumask_of(this_cpu)))
+ tdev = NULL;
+ if (tdev) {
+ tdev_mode = tdev->evtdev->mode;
+ clockevents_set_mode(tdev->evtdev, CLOCK_EVT_MODE_SHUTDOWN);
+ }
+
+ ret = cpu_pm_enter();
+
+ /* we can not tolerate errors at this point */
+ if (ret)
+ panic("%s: cpu_pm_enter() returned %d\n", __func__, ret);
+
+ /* Swap the physical CPUs in the logical map for this logical CPU. */
+ cpu_logical_map(this_cpu) = ib_mpidr;
+ cpu_logical_map(that_cpu) = ob_mpidr;
+
+ /* Let's do the actual CPU switch. */
+ ret = cpu_suspend((unsigned long)&handshake_ptr, bL_switchpoint);
+ if (ret > 0)
+ panic("%s: cpu_suspend() returned %d\n", __func__, ret);
+
+ /* We are executing on the inbound CPU at this point */
+ mpidr = read_mpidr();
+ pr_debug("after switch: CPU %d MPIDR %#x\n", this_cpu, mpidr);
+ BUG_ON(mpidr != ib_mpidr);
+
+ mcpm_cpu_powered_up();
+
+ ret = cpu_pm_exit();
+
+ if (tdev) {
+ clockevents_set_mode(tdev->evtdev, tdev_mode);
+ clockevents_program_event(tdev->evtdev,
+ tdev->evtdev->next_event, 1);
+ }
+
+ trace_cpu_migrate_finish(get_ns(), ib_mpidr);
+ local_fiq_enable();
+ local_irq_enable();
+
+ *handshake_ptr = 1;
+ dsb_sev();
+
+ if (ret)
+ pr_err("%s exiting with error %d\n", __func__, ret);
+ return ret;
+}
+
+struct bL_thread {
+ spinlock_t lock;
+ struct task_struct *task;
+ wait_queue_head_t wq;
+ int wanted_cluster;
+ struct completion started;
+ bL_switch_completion_handler completer;
+ void *completer_cookie;
+};
+
+static struct bL_thread bL_threads[NR_CPUS];
+
+static int bL_switcher_thread(void *arg)
+{
+ struct bL_thread *t = arg;
+ struct sched_param param = { .sched_priority = 1 };
+ int cluster;
+ bL_switch_completion_handler completer;
+ void *completer_cookie;
+
+ sched_setscheduler_nocheck(current, SCHED_FIFO, ¶m);
+ complete(&t->started);
+
+ do {
+ if (signal_pending(current))
+ flush_signals(current);
+ wait_event_interruptible(t->wq,
+ t->wanted_cluster != -1 ||
+ kthread_should_stop());
+
+ spin_lock(&t->lock);
+ cluster = t->wanted_cluster;
+ completer = t->completer;
+ completer_cookie = t->completer_cookie;
+ t->wanted_cluster = -1;
+ t->completer = NULL;
+ spin_unlock(&t->lock);
+
+ if (cluster != -1) {
+ bL_switch_to(cluster);
+
+ if (completer)
+ completer(completer_cookie);
+ }
+ } while (!kthread_should_stop());
+
+ return 0;
+}
+
+static struct task_struct *bL_switcher_thread_create(int cpu, void *arg)
+{
+ struct task_struct *task;
+
+ task = kthread_create_on_node(bL_switcher_thread, arg,
+ cpu_to_node(cpu), "kswitcher_%d", cpu);
+ if (!IS_ERR(task)) {
+ kthread_bind(task, cpu);
+ wake_up_process(task);
+ } else
+ pr_err("%s failed for CPU %d\n", __func__, cpu);
+ return task;
+}
+
+/*
+ * bL_switch_request_cb - Switch to a specific cluster for the given CPU,
+ * with completion notification via a callback
+ *
+ * @cpu: the CPU to switch
+ * @new_cluster_id: the ID of the cluster to switch to.
+ * @completer: switch completion callback. if non-NULL,
+ * @completer(@completer_cookie) will be called on completion of
+ * the switch, in non-atomic context.
+ * @completer_cookie: opaque context argument for @completer.
+ *
+ * This function causes a cluster switch on the given CPU by waking up
+ * the appropriate switcher thread. This function may or may not return
+ * before the switch has occurred.
+ *
+ * If a @completer callback function is supplied, it will be called when
+ * the switch is complete. This can be used to determine asynchronously
+ * when the switch is complete, regardless of when bL_switch_request()
+ * returns. When @completer is supplied, no new switch request is permitted
+ * for the affected CPU until after the switch is complete, and @completer
+ * has returned.
+ */
+int bL_switch_request_cb(unsigned int cpu, unsigned int new_cluster_id,
+ bL_switch_completion_handler completer,
+ void *completer_cookie)
+{
+ struct bL_thread *t;
+
+ if (cpu >= ARRAY_SIZE(bL_threads)) {
+ pr_err("%s: cpu %d out of bounds\n", __func__, cpu);
+ return -EINVAL;
+ }
+
+ t = &bL_threads[cpu];
+
+ if (IS_ERR(t->task))
+ return PTR_ERR(t->task);
+ if (!t->task)
+ return -ESRCH;
+
+ spin_lock(&t->lock);
+ if (t->completer) {
+ spin_unlock(&t->lock);
+ return -EBUSY;
+ }
+ t->completer = completer;
+ t->completer_cookie = completer_cookie;
+ t->wanted_cluster = new_cluster_id;
+ spin_unlock(&t->lock);
+ wake_up(&t->wq);
+ return 0;
+}
+EXPORT_SYMBOL_GPL(bL_switch_request_cb);
+
+/*
+ * Activation and configuration code.
+ */
+
+static DEFINE_MUTEX(bL_switcher_activation_lock);
+static BLOCKING_NOTIFIER_HEAD(bL_activation_notifier);
+static unsigned int bL_switcher_active;
+static unsigned int bL_switcher_cpu_original_cluster[NR_CPUS];
+static cpumask_t bL_switcher_removed_logical_cpus;
+
+int bL_switcher_register_notifier(struct notifier_block *nb)
+{
+ return blocking_notifier_chain_register(&bL_activation_notifier, nb);
+}
+EXPORT_SYMBOL_GPL(bL_switcher_register_notifier);
+
+int bL_switcher_unregister_notifier(struct notifier_block *nb)
+{
+ return blocking_notifier_chain_unregister(&bL_activation_notifier, nb);
+}
+EXPORT_SYMBOL_GPL(bL_switcher_unregister_notifier);
+
+static int bL_activation_notify(unsigned long val)
+{
+ int ret;
+
+ ret = blocking_notifier_call_chain(&bL_activation_notifier, val, NULL);
+ if (ret & NOTIFY_STOP_MASK)
+ pr_err("%s: notifier chain failed with status 0x%x\n",
+ __func__, ret);
+ return notifier_to_errno(ret);
+}
+
+static void bL_switcher_restore_cpus(void)
+{
+ int i;
+
+ for_each_cpu(i, &bL_switcher_removed_logical_cpus)
+ cpu_up(i);
+}
+
+static int bL_switcher_halve_cpus(void)
+{
+ int i, j, cluster_0, gic_id, ret;
+ unsigned int cpu, cluster, mask;
+ cpumask_t available_cpus;
+
+ /* First pass to validate what we have */
+ mask = 0;
+ for_each_online_cpu(i) {
+ cpu = MPIDR_AFFINITY_LEVEL(cpu_logical_map(i), 0);
+ cluster = MPIDR_AFFINITY_LEVEL(cpu_logical_map(i), 1);
+ if (cluster >= 2) {
+ pr_err("%s: only dual cluster systems are supported\n", __func__);
+ return -EINVAL;
+ }
+ if (WARN_ON(cpu >= MAX_CPUS_PER_CLUSTER))
+ return -EINVAL;
+ mask |= (1 << cluster);
+ }
+ if (mask != 3) {
+ pr_err("%s: no CPU pairing possible\n", __func__);
+ return -EINVAL;
+ }
+
+ /*
+ * Now let's do the pairing. We match each CPU with another CPU
+ * from a different cluster. To get a uniform scheduling behavior
+ * without fiddling with CPU topology and compute capacity data,
+ * we'll use logical CPUs initially belonging to the same cluster.
+ */
+ memset(bL_switcher_cpu_pairing, -1, sizeof(bL_switcher_cpu_pairing));
+ cpumask_copy(&available_cpus, cpu_online_mask);
+ cluster_0 = -1;
+ for_each_cpu(i, &available_cpus) {
+ int match = -1;
+ cluster = MPIDR_AFFINITY_LEVEL(cpu_logical_map(i), 1);
+ if (cluster_0 == -1)
+ cluster_0 = cluster;
+ if (cluster != cluster_0)
+ continue;
+ cpumask_clear_cpu(i, &available_cpus);
+ for_each_cpu(j, &available_cpus) {
+ cluster = MPIDR_AFFINITY_LEVEL(cpu_logical_map(j), 1);
+ /*
+ * Let's remember the last match to create "odd"
+ * pairings on purpose in order for other code not
+ * to assume any relation between physical and
+ * logical CPU numbers.
+ */
+ if (cluster != cluster_0)
+ match = j;
+ }
+ if (match != -1) {
+ bL_switcher_cpu_pairing[i] = match;
+ cpumask_clear_cpu(match, &available_cpus);
+ pr_info("CPU%d paired with CPU%d\n", i, match);
+ }
+ }
+
+ /*
+ * Now we disable the unwanted CPUs i.e. everything that has no
+ * pairing information (that includes the pairing counterparts).
+ */
+ cpumask_clear(&bL_switcher_removed_logical_cpus);
+ for_each_online_cpu(i) {
+ cpu = MPIDR_AFFINITY_LEVEL(cpu_logical_map(i), 0);
+ cluster = MPIDR_AFFINITY_LEVEL(cpu_logical_map(i), 1);
+
+ /* Let's take note of the GIC ID for this CPU */
+ gic_id = gic_get_cpu_id(i);
+ if (gic_id < 0) {
+ pr_err("%s: bad GIC ID for CPU %d\n", __func__, i);
+ bL_switcher_restore_cpus();
+ return -EINVAL;
+ }
+ bL_gic_id[cpu][cluster] = gic_id;
+ pr_info("GIC ID for CPU %u cluster %u is %u\n",
+ cpu, cluster, gic_id);
+
+ if (bL_switcher_cpu_pairing[i] != -1) {
+ bL_switcher_cpu_original_cluster[i] = cluster;
+ continue;
+ }
+
+ ret = cpu_down(i);
+ if (ret) {
+ bL_switcher_restore_cpus();
+ return ret;
+ }
+ cpumask_set_cpu(i, &bL_switcher_removed_logical_cpus);
+ }
+
+ return 0;
+}
+
+/* Determine the logical CPU a given physical CPU is grouped on. */
+int bL_switcher_get_logical_index(u32 mpidr)
+{
+ int cpu;
+
+ if (!bL_switcher_active)
+ return -EUNATCH;
+
+ mpidr &= MPIDR_HWID_BITMASK;
+ for_each_online_cpu(cpu) {
+ int pairing = bL_switcher_cpu_pairing[cpu];
+ if (pairing == -1)
+ continue;
+ if ((mpidr == cpu_logical_map(cpu)) ||
+ (mpidr == cpu_logical_map(pairing)))
+ return cpu;
+ }
+ return -EINVAL;
+}
+
+static void bL_switcher_trace_trigger_cpu(void *__always_unused info)
+{
+ trace_cpu_migrate_current(get_ns(), read_mpidr());
+}
+
+int bL_switcher_trace_trigger(void)
+{
+ int ret;
+
+ preempt_disable();
+
+ bL_switcher_trace_trigger_cpu(NULL);
+ ret = smp_call_function(bL_switcher_trace_trigger_cpu, NULL, true);
+
+ preempt_enable();
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(bL_switcher_trace_trigger);
+
+static int bL_switcher_enable(void)
+{
+ int cpu, ret;
+
+ mutex_lock(&bL_switcher_activation_lock);
+ cpu_hotplug_driver_lock();
+ if (bL_switcher_active) {
+ cpu_hotplug_driver_unlock();
+ mutex_unlock(&bL_switcher_activation_lock);
+ return 0;
+ }
+
+ pr_info("big.LITTLE switcher initializing\n");
+
+ ret = bL_activation_notify(BL_NOTIFY_PRE_ENABLE);
+ if (ret)
+ goto error;
+
+ ret = bL_switcher_halve_cpus();
+ if (ret)
+ goto error;
+
+ bL_switcher_trace_trigger();
+
+ for_each_online_cpu(cpu) {
+ struct bL_thread *t = &bL_threads[cpu];
+ spin_lock_init(&t->lock);
+ init_waitqueue_head(&t->wq);
+ init_completion(&t->started);
+ t->wanted_cluster = -1;
+ t->task = bL_switcher_thread_create(cpu, t);
+ }
+
+ bL_switcher_active = 1;
+ bL_activation_notify(BL_NOTIFY_POST_ENABLE);
+ pr_info("big.LITTLE switcher initialized\n");
+ goto out;
+
+error:
+ pr_warn("big.LITTLE switcher initialization failed\n");
+ bL_activation_notify(BL_NOTIFY_POST_DISABLE);
+
+out:
+ cpu_hotplug_driver_unlock();
+ mutex_unlock(&bL_switcher_activation_lock);
+ return ret;
+}
+
+#ifdef CONFIG_SYSFS
+
+static void bL_switcher_disable(void)
+{
+ unsigned int cpu, cluster;
+ struct bL_thread *t;
+ struct task_struct *task;
+
+ mutex_lock(&bL_switcher_activation_lock);
+ cpu_hotplug_driver_lock();
+
+ if (!bL_switcher_active)
+ goto out;
+
+ if (bL_activation_notify(BL_NOTIFY_PRE_DISABLE) != 0) {
+ bL_activation_notify(BL_NOTIFY_POST_ENABLE);
+ goto out;
+ }
+
+ bL_switcher_active = 0;
+
+ /*
+ * To deactivate the switcher, we must shut down the switcher
+ * threads to prevent any other requests from being accepted.
+ * Then, if the final cluster for given logical CPU is not the
+ * same as the original one, we'll recreate a switcher thread
+ * just for the purpose of switching the CPU back without any
+ * possibility for interference from external requests.
+ */
+ for_each_online_cpu(cpu) {
+ t = &bL_threads[cpu];
+ task = t->task;
+ t->task = NULL;
+ if (!task || IS_ERR(task))
+ continue;
+ kthread_stop(task);
+ /* no more switch may happen on this CPU at this point */
+ cluster = MPIDR_AFFINITY_LEVEL(cpu_logical_map(cpu), 1);
+ if (cluster == bL_switcher_cpu_original_cluster[cpu])
+ continue;
+ init_completion(&t->started);
+ t->wanted_cluster = bL_switcher_cpu_original_cluster[cpu];
+ task = bL_switcher_thread_create(cpu, t);
+ if (!IS_ERR(task)) {
+ wait_for_completion(&t->started);
+ kthread_stop(task);
+ cluster = MPIDR_AFFINITY_LEVEL(cpu_logical_map(cpu), 1);
+ if (cluster == bL_switcher_cpu_original_cluster[cpu])
+ continue;
+ }
+ /* If execution gets here, we're in trouble. */
+ pr_crit("%s: unable to restore original cluster for CPU %d\n",
+ __func__, cpu);
+ pr_crit("%s: CPU %d can't be restored\n",
+ __func__, bL_switcher_cpu_pairing[cpu]);
+ cpumask_clear_cpu(bL_switcher_cpu_pairing[cpu],
+ &bL_switcher_removed_logical_cpus);
+ }
+
+ bL_switcher_restore_cpus();
+ bL_switcher_trace_trigger();
+
+ bL_activation_notify(BL_NOTIFY_POST_DISABLE);
+
+out:
+ cpu_hotplug_driver_unlock();
+ mutex_unlock(&bL_switcher_activation_lock);
+}
+
+static ssize_t bL_switcher_active_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ return sprintf(buf, "%u\n", bL_switcher_active);
+}
+
+static ssize_t bL_switcher_active_store(struct kobject *kobj,
+ struct kobj_attribute *attr, const char *buf, size_t count)
+{
+ int ret;
+
+ switch (buf[0]) {
+ case '0':
+ bL_switcher_disable();
+ ret = 0;
+ break;
+ case '1':
+ ret = bL_switcher_enable();
+ break;
+ default:
+ ret = -EINVAL;
+ }
+
+ return (ret >= 0) ? count : ret;
+}
+
+static ssize_t bL_switcher_trace_trigger_store(struct kobject *kobj,
+ struct kobj_attribute *attr, const char *buf, size_t count)
+{
+ int ret = bL_switcher_trace_trigger();
+
+ return ret ? ret : count;
+}
+
+static struct kobj_attribute bL_switcher_active_attr =
+ __ATTR(active, 0644, bL_switcher_active_show, bL_switcher_active_store);
+
+static struct kobj_attribute bL_switcher_trace_trigger_attr =
+ __ATTR(trace_trigger, 0200, NULL, bL_switcher_trace_trigger_store);
+
+static struct attribute *bL_switcher_attrs[] = {
+ &bL_switcher_active_attr.attr,
+ &bL_switcher_trace_trigger_attr.attr,
+ NULL,
+};
+
+static struct attribute_group bL_switcher_attr_group = {
+ .attrs = bL_switcher_attrs,
+};
+
+static struct kobject *bL_switcher_kobj;
+
+static int __init bL_switcher_sysfs_init(void)
+{
+ int ret;
+
+ bL_switcher_kobj = kobject_create_and_add("bL_switcher", kernel_kobj);
+ if (!bL_switcher_kobj)
+ return -ENOMEM;
+ ret = sysfs_create_group(bL_switcher_kobj, &bL_switcher_attr_group);
+ if (ret)
+ kobject_put(bL_switcher_kobj);
+ return ret;
+}
+
+#endif /* CONFIG_SYSFS */
+
+bool bL_switcher_get_enabled(void)
+{
+ mutex_lock(&bL_switcher_activation_lock);
+
+ return bL_switcher_active;
+}
+EXPORT_SYMBOL_GPL(bL_switcher_get_enabled);
+
+void bL_switcher_put_enabled(void)
+{
+ mutex_unlock(&bL_switcher_activation_lock);
+}
+EXPORT_SYMBOL_GPL(bL_switcher_put_enabled);
+
+/*
+ * Veto any CPU hotplug operation on those CPUs we've removed
+ * while the switcher is active.
+ * We're just not ready to deal with that given the trickery involved.
+ */
+static int bL_switcher_hotplug_callback(struct notifier_block *nfb,
+ unsigned long action, void *hcpu)
+{
+ if (bL_switcher_active) {
+ int pairing = bL_switcher_cpu_pairing[(unsigned long)hcpu];
+ switch (action & 0xf) {
+ case CPU_UP_PREPARE:
+ case CPU_DOWN_PREPARE:
+ if (pairing == -1)
+ return NOTIFY_BAD;
+ }
+ }
+ return NOTIFY_DONE;
+}
+
+static bool no_bL_switcher;
+core_param(no_bL_switcher, no_bL_switcher, bool, 0644);
+
+static int __init bL_switcher_init(void)
+{
+ int ret;
+
+ if (MAX_NR_CLUSTERS != 2) {
+ pr_err("%s: only dual cluster systems are supported\n", __func__);
+ return -EINVAL;
+ }
+
+ cpu_notifier(bL_switcher_hotplug_callback, 0);
+
+ if (!no_bL_switcher) {
+ ret = bL_switcher_enable();
+ if (ret)
+ return ret;
+ }
+
+#ifdef CONFIG_SYSFS
+ ret = bL_switcher_sysfs_init();
+ if (ret)
+ pr_err("%s: unable to create sysfs entry\n", __func__);
+#endif
+
+ return 0;
+}
+
+late_initcall(bL_switcher_init);
--- /dev/null
+/*
+ * arch/arm/common/bL_switcher_dummy_if.c -- b.L switcher dummy interface
+ *
+ * Created by: Nicolas Pitre, November 2012
+ * Copyright: (C) 2012-2013 Linaro Limited
+ *
+ * Dummy interface to user space for debugging purpose only.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/fs.h>
+#include <linux/miscdevice.h>
+#include <asm/uaccess.h>
+#include <asm/bL_switcher.h>
+
+static ssize_t bL_switcher_write(struct file *file, const char __user *buf,
+ size_t len, loff_t *pos)
+{
+ unsigned char val[3];
+ unsigned int cpu, cluster;
+ int ret;
+
+ pr_debug("%s\n", __func__);
+
+ if (len < 3)
+ return -EINVAL;
+
+ if (copy_from_user(val, buf, 3))
+ return -EFAULT;
+
+ /* format: <cpu#>,<cluster#> */
+ if (val[0] < '0' || val[0] > '9' ||
+ val[1] != ',' ||
+ val[2] < '0' || val[2] > '1')
+ return -EINVAL;
+
+ cpu = val[0] - '0';
+ cluster = val[2] - '0';
+ ret = bL_switch_request(cpu, cluster);
+
+ return ret ? : len;
+}
+
+static const struct file_operations bL_switcher_fops = {
+ .write = bL_switcher_write,
+ .owner = THIS_MODULE,
+};
+
+static struct miscdevice bL_switcher_device = {
+ MISC_DYNAMIC_MINOR,
+ "b.L_switcher",
+ &bL_switcher_fops
+};
+
+static int __init bL_switcher_dummy_if_init(void)
+{
+ return misc_register(&bL_switcher_device);
+}
+
+static void __exit bL_switcher_dummy_if_exit(void)
+{
+ misc_deregister(&bL_switcher_device);
+}
+
+module_init(bL_switcher_dummy_if_init);
+module_exit(bL_switcher_dummy_if_exit);
sync_cache_w(&mcpm_entry_vectors[cluster][cpu]);
}
+extern unsigned long mcpm_entry_early_pokes[MAX_NR_CLUSTERS][MAX_CPUS_PER_CLUSTER][2];
+
+void mcpm_set_early_poke(unsigned cpu, unsigned cluster,
+ unsigned long poke_phys_addr, unsigned long poke_val)
+{
+ unsigned long *poke = &mcpm_entry_early_pokes[cluster][cpu][0];
+ poke[0] = poke_phys_addr;
+ poke[1] = poke_val;
+ __cpuc_flush_dcache_area((void *)poke, 8);
+ outer_clean_range(__pa(poke), __pa(poke + 2));
+}
+
static const struct mcpm_platform_ops *platform_ops;
int __init mcpm_platform_register(const struct mcpm_platform_ops *ops)
* position independent way.
*/
adr r5, 3f
- ldmia r5, {r6, r7, r8, r11}
+ ldmia r5, {r0, r6, r7, r8, r11}
+ add r0, r5, r0 @ r0 = mcpm_entry_early_pokes
add r6, r5, r6 @ r6 = mcpm_entry_vectors
ldr r7, [r5, r7] @ r7 = mcpm_power_up_setup_phys
add r8, r5, r8 @ r8 = mcpm_sync
add r11, r5, r11 @ r11 = first_man_locks
+ @ Perform an early poke, if any
+ add r0, r0, r4, lsl #3
+ ldmia r0, {r0, r1}
+ teq r0, #0
+ strne r1, [r0]
+
mov r0, #MCPM_SYNC_CLUSTER_SIZE
mla r8, r0, r10, r8 @ r8 = sync cluster base
.align 2
-3: .word mcpm_entry_vectors - .
+3: .word mcpm_entry_early_pokes - .
+ .word mcpm_entry_vectors - 3b
.word mcpm_power_up_setup_phys - 3b
.word mcpm_sync - 3b
.word first_man_locks - 3b
ENTRY(mcpm_entry_vectors)
.space 4 * MAX_NR_CLUSTERS * MAX_CPUS_PER_CLUSTER
+ .type mcpm_entry_early_pokes, #object
+ENTRY(mcpm_entry_early_pokes)
+ .space 8 * MAX_NR_CLUSTERS * MAX_CPUS_PER_CLUSTER
+
.type mcpm_power_up_setup_phys, #object
ENTRY(mcpm_power_up_setup_phys)
.space 4 @ set by mcpm_sync_init()
static struct irqaction sp804_timer_irq = {
.name = "timer",
- .flags = IRQF_DISABLED | IRQF_TIMER | IRQF_IRQPOLL,
+ .flags = IRQF_TIMER | IRQF_IRQPOLL,
.handler = sp804_timer_interrupt,
.dev_id = &sp804_clockevent,
};
-CONFIG_EXPERIMENTAL=y
CONFIG_SYSVIPC=y
+CONFIG_NO_HZ_IDLE=y
+CONFIG_HIGH_RES_TIMERS=y
CONFIG_LOG_BUF_SHIFT=14
CONFIG_BLK_DEV_INITRD=y
CONFIG_MODULES=y
CONFIG_SA1100_H3600=y
CONFIG_PCCARD=y
CONFIG_PCMCIA_SA1100=y
+CONFIG_PREEMPT=y
CONFIG_ZBOOT_ROM_TEXT=0x0
CONFIG_ZBOOT_ROM_BSS=0x0
# CONFIG_CPU_FREQ_STAT is not set
CONFIG_FPE_NWFPE=y
-CONFIG_PM=y
CONFIG_NET=y
CONFIG_UNIX=y
CONFIG_INET=y
CONFIG_IRLAN=m
CONFIG_IRNET=m
CONFIG_IRCOMM=m
-CONFIG_SA1100_FIR=m
# CONFIG_WIRELESS is not set
CONFIG_UEVENT_HELPER_PATH="/sbin/hotplug"
CONFIG_MTD=y
-CONFIG_MTD_PARTITIONS=y
CONFIG_MTD_REDBOOT_PARTS=y
-CONFIG_MTD_CHAR=y
CONFIG_MTD_BLOCK=y
CONFIG_MTD_CFI=y
CONFIG_MTD_CFI_ADV_OPTIONS=y
CONFIG_BLK_DEV_LOOP=m
CONFIG_BLK_DEV_RAM=y
CONFIG_BLK_DEV_RAM_SIZE=8192
-# CONFIG_MISC_DEVICES is not set
CONFIG_IDE=y
CONFIG_BLK_DEV_IDECS=y
CONFIG_NETDEVICES=y
-# CONFIG_NETDEV_1000 is not set
-# CONFIG_NETDEV_10000 is not set
-# CONFIG_WLAN is not set
-CONFIG_NET_PCMCIA=y
CONFIG_PCMCIA_PCNET=y
CONFIG_PPP=m
-CONFIG_PPP_ASYNC=m
-CONFIG_PPP_DEFLATE=m
CONFIG_PPP_BSDCOMP=m
+CONFIG_PPP_DEFLATE=m
+CONFIG_PPP_ASYNC=m
+# CONFIG_WLAN is not set
# CONFIG_KEYBOARD_ATKBD is not set
CONFIG_KEYBOARD_GPIO=y
# CONFIG_INPUT_MOUSE is not set
# CONFIG_HWMON is not set
CONFIG_FB=y
CONFIG_FB_SA1100=y
-# CONFIG_VGA_CONSOLE is not set
-# CONFIG_HID_SUPPORT is not set
# CONFIG_USB_SUPPORT is not set
CONFIG_EXT2_FS=y
CONFIG_MSDOS_FS=m
CONFIG_CRAMFS=m
CONFIG_NFS_FS=y
CONFIG_NFSD=m
-CONFIG_SMB_FS=m
CONFIG_NLS=y
-# CONFIG_RCU_CPU_STALL_DETECTOR is not set
--- /dev/null
+aesbs-core.S
#
obj-$(CONFIG_CRYPTO_AES_ARM) += aes-arm.o
+obj-$(CONFIG_CRYPTO_AES_ARM_BS) += aes-arm-bs.o
obj-$(CONFIG_CRYPTO_SHA1_ARM) += sha1-arm.o
-aes-arm-y := aes-armv4.o aes_glue.o
-sha1-arm-y := sha1-armv4-large.o sha1_glue.o
+aes-arm-y := aes-armv4.o aes_glue.o
+aes-arm-bs-y := aesbs-core.o aesbs-glue.o
+sha1-arm-y := sha1-armv4-large.o sha1_glue.o
+
+quiet_cmd_perl = PERL $@
+ cmd_perl = $(PERL) $(<) > $(@)
+
+$(src)/aesbs-core.S_shipped: $(src)/bsaes-armv7.pl
+ $(call cmd,perl)
+
+.PRECIOUS: $(obj)/aesbs-core.S
#include <linux/crypto.h>
#include <crypto/aes.h>
-#define AES_MAXNR 14
+#include "aes_glue.h"
-typedef struct {
- unsigned int rd_key[4 *(AES_MAXNR + 1)];
- int rounds;
-} AES_KEY;
-
-struct AES_CTX {
- AES_KEY enc_key;
- AES_KEY dec_key;
-};
-
-asmlinkage void AES_encrypt(const u8 *in, u8 *out, AES_KEY *ctx);
-asmlinkage void AES_decrypt(const u8 *in, u8 *out, AES_KEY *ctx);
-asmlinkage int private_AES_set_decrypt_key(const unsigned char *userKey, const int bits, AES_KEY *key);
-asmlinkage int private_AES_set_encrypt_key(const unsigned char *userKey, const int bits, AES_KEY *key);
+EXPORT_SYMBOL(AES_encrypt);
+EXPORT_SYMBOL(AES_decrypt);
+EXPORT_SYMBOL(private_AES_set_encrypt_key);
+EXPORT_SYMBOL(private_AES_set_decrypt_key);
static void aes_encrypt(struct crypto_tfm *tfm, u8 *dst, const u8 *src)
{
.cipher = {
.cia_min_keysize = AES_MIN_KEY_SIZE,
.cia_max_keysize = AES_MAX_KEY_SIZE,
- .cia_setkey = aes_set_key,
+ .cia_setkey = aes_set_key,
.cia_encrypt = aes_encrypt,
.cia_decrypt = aes_decrypt
}
--- /dev/null
+
+#define AES_MAXNR 14
+
+struct AES_KEY {
+ unsigned int rd_key[4 * (AES_MAXNR + 1)];
+ int rounds;
+};
+
+struct AES_CTX {
+ struct AES_KEY enc_key;
+ struct AES_KEY dec_key;
+};
+
+asmlinkage void AES_encrypt(const u8 *in, u8 *out, struct AES_KEY *ctx);
+asmlinkage void AES_decrypt(const u8 *in, u8 *out, struct AES_KEY *ctx);
+asmlinkage int private_AES_set_decrypt_key(const unsigned char *userKey,
+ const int bits, struct AES_KEY *key);
+asmlinkage int private_AES_set_encrypt_key(const unsigned char *userKey,
+ const int bits, struct AES_KEY *key);
--- /dev/null
+
+@ ====================================================================
+@ Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
+@ project. The module is, however, dual licensed under OpenSSL and
+@ CRYPTOGAMS licenses depending on where you obtain it. For further
+@ details see http://www.openssl.org/~appro/cryptogams/.
+@
+@ Specific modes and adaptation for Linux kernel by Ard Biesheuvel
+@ <ard.biesheuvel@linaro.org>. Permission to use under GPL terms is
+@ granted.
+@ ====================================================================
+
+@ Bit-sliced AES for ARM NEON
+@
+@ February 2012.
+@
+@ This implementation is direct adaptation of bsaes-x86_64 module for
+@ ARM NEON. Except that this module is endian-neutral [in sense that
+@ it can be compiled for either endianness] by courtesy of vld1.8's
+@ neutrality. Initial version doesn't implement interface to OpenSSL,
+@ only low-level primitives and unsupported entry points, just enough
+@ to collect performance results, which for Cortex-A8 core are:
+@
+@ encrypt 19.5 cycles per byte processed with 128-bit key
+@ decrypt 22.1 cycles per byte processed with 128-bit key
+@ key conv. 440 cycles per 128-bit key/0.18 of 8x block
+@
+@ Snapdragon S4 encrypts byte in 17.6 cycles and decrypts in 19.7,
+@ which is [much] worse than anticipated (for further details see
+@ http://www.openssl.org/~appro/Snapdragon-S4.html).
+@
+@ Cortex-A15 manages in 14.2/16.1 cycles [when integer-only code
+@ manages in 20.0 cycles].
+@
+@ When comparing to x86_64 results keep in mind that NEON unit is
+@ [mostly] single-issue and thus can't [fully] benefit from
+@ instruction-level parallelism. And when comparing to aes-armv4
+@ results keep in mind key schedule conversion overhead (see
+@ bsaes-x86_64.pl for further details)...
+@
+@ <appro@openssl.org>
+
+@ April-August 2013
+@
+@ Add CBC, CTR and XTS subroutines, adapt for kernel use.
+@
+@ <ard.biesheuvel@linaro.org>
+
+#ifndef __KERNEL__
+# include "arm_arch.h"
+
+# define VFP_ABI_PUSH vstmdb sp!,{d8-d15}
+# define VFP_ABI_POP vldmia sp!,{d8-d15}
+# define VFP_ABI_FRAME 0x40
+#else
+# define VFP_ABI_PUSH
+# define VFP_ABI_POP
+# define VFP_ABI_FRAME 0
+# define BSAES_ASM_EXTENDED_KEY
+# define XTS_CHAIN_TWEAK
+# define __ARM_ARCH__ __LINUX_ARM_ARCH__
+#endif
+
+#ifdef __thumb__
+# define adrl adr
+#endif
+
+#if __ARM_ARCH__>=7
+.text
+.syntax unified @ ARMv7-capable assembler is expected to handle this
+#ifdef __thumb2__
+.thumb
+#else
+.code 32
+#endif
+
+.fpu neon
+
+.type _bsaes_decrypt8,%function
+.align 4
+_bsaes_decrypt8:
+ adr r6,_bsaes_decrypt8
+ vldmia r4!, {q9} @ round 0 key
+ add r6,r6,#.LM0ISR-_bsaes_decrypt8
+
+ vldmia r6!, {q8} @ .LM0ISR
+ veor q10, q0, q9 @ xor with round0 key
+ veor q11, q1, q9
+ vtbl.8 d0, {q10}, d16
+ vtbl.8 d1, {q10}, d17
+ veor q12, q2, q9
+ vtbl.8 d2, {q11}, d16
+ vtbl.8 d3, {q11}, d17
+ veor q13, q3, q9
+ vtbl.8 d4, {q12}, d16
+ vtbl.8 d5, {q12}, d17
+ veor q14, q4, q9
+ vtbl.8 d6, {q13}, d16
+ vtbl.8 d7, {q13}, d17
+ veor q15, q5, q9
+ vtbl.8 d8, {q14}, d16
+ vtbl.8 d9, {q14}, d17
+ veor q10, q6, q9
+ vtbl.8 d10, {q15}, d16
+ vtbl.8 d11, {q15}, d17
+ veor q11, q7, q9
+ vtbl.8 d12, {q10}, d16
+ vtbl.8 d13, {q10}, d17
+ vtbl.8 d14, {q11}, d16
+ vtbl.8 d15, {q11}, d17
+ vmov.i8 q8,#0x55 @ compose .LBS0
+ vmov.i8 q9,#0x33 @ compose .LBS1
+ vshr.u64 q10, q6, #1
+ vshr.u64 q11, q4, #1
+ veor q10, q10, q7
+ veor q11, q11, q5
+ vand q10, q10, q8
+ vand q11, q11, q8
+ veor q7, q7, q10
+ vshl.u64 q10, q10, #1
+ veor q5, q5, q11
+ vshl.u64 q11, q11, #1
+ veor q6, q6, q10
+ veor q4, q4, q11
+ vshr.u64 q10, q2, #1
+ vshr.u64 q11, q0, #1
+ veor q10, q10, q3
+ veor q11, q11, q1
+ vand q10, q10, q8
+ vand q11, q11, q8
+ veor q3, q3, q10
+ vshl.u64 q10, q10, #1
+ veor q1, q1, q11
+ vshl.u64 q11, q11, #1
+ veor q2, q2, q10
+ veor q0, q0, q11
+ vmov.i8 q8,#0x0f @ compose .LBS2
+ vshr.u64 q10, q5, #2
+ vshr.u64 q11, q4, #2
+ veor q10, q10, q7
+ veor q11, q11, q6
+ vand q10, q10, q9
+ vand q11, q11, q9
+ veor q7, q7, q10
+ vshl.u64 q10, q10, #2
+ veor q6, q6, q11
+ vshl.u64 q11, q11, #2
+ veor q5, q5, q10
+ veor q4, q4, q11
+ vshr.u64 q10, q1, #2
+ vshr.u64 q11, q0, #2
+ veor q10, q10, q3
+ veor q11, q11, q2
+ vand q10, q10, q9
+ vand q11, q11, q9
+ veor q3, q3, q10
+ vshl.u64 q10, q10, #2
+ veor q2, q2, q11
+ vshl.u64 q11, q11, #2
+ veor q1, q1, q10
+ veor q0, q0, q11
+ vshr.u64 q10, q3, #4
+ vshr.u64 q11, q2, #4
+ veor q10, q10, q7
+ veor q11, q11, q6
+ vand q10, q10, q8
+ vand q11, q11, q8
+ veor q7, q7, q10
+ vshl.u64 q10, q10, #4
+ veor q6, q6, q11
+ vshl.u64 q11, q11, #4
+ veor q3, q3, q10
+ veor q2, q2, q11
+ vshr.u64 q10, q1, #4
+ vshr.u64 q11, q0, #4
+ veor q10, q10, q5
+ veor q11, q11, q4
+ vand q10, q10, q8
+ vand q11, q11, q8
+ veor q5, q5, q10
+ vshl.u64 q10, q10, #4
+ veor q4, q4, q11
+ vshl.u64 q11, q11, #4
+ veor q1, q1, q10
+ veor q0, q0, q11
+ sub r5,r5,#1
+ b .Ldec_sbox
+.align 4
+.Ldec_loop:
+ vldmia r4!, {q8-q11}
+ veor q8, q8, q0
+ veor q9, q9, q1
+ vtbl.8 d0, {q8}, d24
+ vtbl.8 d1, {q8}, d25
+ vldmia r4!, {q8}
+ veor q10, q10, q2
+ vtbl.8 d2, {q9}, d24
+ vtbl.8 d3, {q9}, d25
+ vldmia r4!, {q9}
+ veor q11, q11, q3
+ vtbl.8 d4, {q10}, d24
+ vtbl.8 d5, {q10}, d25
+ vldmia r4!, {q10}
+ vtbl.8 d6, {q11}, d24
+ vtbl.8 d7, {q11}, d25
+ vldmia r4!, {q11}
+ veor q8, q8, q4
+ veor q9, q9, q5
+ vtbl.8 d8, {q8}, d24
+ vtbl.8 d9, {q8}, d25
+ veor q10, q10, q6
+ vtbl.8 d10, {q9}, d24
+ vtbl.8 d11, {q9}, d25
+ veor q11, q11, q7
+ vtbl.8 d12, {q10}, d24
+ vtbl.8 d13, {q10}, d25
+ vtbl.8 d14, {q11}, d24
+ vtbl.8 d15, {q11}, d25
+.Ldec_sbox:
+ veor q1, q1, q4
+ veor q3, q3, q4
+
+ veor q4, q4, q7
+ veor q1, q1, q6
+ veor q2, q2, q7
+ veor q6, q6, q4
+
+ veor q0, q0, q1
+ veor q2, q2, q5
+ veor q7, q7, q6
+ veor q3, q3, q0
+ veor q5, q5, q0
+ veor q1, q1, q3
+ veor q11, q3, q0
+ veor q10, q7, q4
+ veor q9, q1, q6
+ veor q13, q4, q0
+ vmov q8, q10
+ veor q12, q5, q2
+
+ vorr q10, q10, q9
+ veor q15, q11, q8
+ vand q14, q11, q12
+ vorr q11, q11, q12
+ veor q12, q12, q9
+ vand q8, q8, q9
+ veor q9, q6, q2
+ vand q15, q15, q12
+ vand q13, q13, q9
+ veor q9, q3, q7
+ veor q12, q1, q5
+ veor q11, q11, q13
+ veor q10, q10, q13
+ vand q13, q9, q12
+ vorr q9, q9, q12
+ veor q11, q11, q15
+ veor q8, q8, q13
+ veor q10, q10, q14
+ veor q9, q9, q15
+ veor q8, q8, q14
+ vand q12, q4, q6
+ veor q9, q9, q14
+ vand q13, q0, q2
+ vand q14, q7, q1
+ vorr q15, q3, q5
+ veor q11, q11, q12
+ veor q9, q9, q14
+ veor q8, q8, q15
+ veor q10, q10, q13
+
+ @ Inv_GF16 0, 1, 2, 3, s0, s1, s2, s3
+
+ @ new smaller inversion
+
+ vand q14, q11, q9
+ vmov q12, q8
+
+ veor q13, q10, q14
+ veor q15, q8, q14
+ veor q14, q8, q14 @ q14=q15
+
+ vbsl q13, q9, q8
+ vbsl q15, q11, q10
+ veor q11, q11, q10
+
+ vbsl q12, q13, q14
+ vbsl q8, q14, q13
+
+ vand q14, q12, q15
+ veor q9, q9, q8
+
+ veor q14, q14, q11
+ veor q12, q5, q2
+ veor q8, q1, q6
+ veor q10, q15, q14
+ vand q10, q10, q5
+ veor q5, q5, q1
+ vand q11, q1, q15
+ vand q5, q5, q14
+ veor q1, q11, q10
+ veor q5, q5, q11
+ veor q15, q15, q13
+ veor q14, q14, q9
+ veor q11, q15, q14
+ veor q10, q13, q9
+ vand q11, q11, q12
+ vand q10, q10, q2
+ veor q12, q12, q8
+ veor q2, q2, q6
+ vand q8, q8, q15
+ vand q6, q6, q13
+ vand q12, q12, q14
+ vand q2, q2, q9
+ veor q8, q8, q12
+ veor q2, q2, q6
+ veor q12, q12, q11
+ veor q6, q6, q10
+ veor q5, q5, q12
+ veor q2, q2, q12
+ veor q1, q1, q8
+ veor q6, q6, q8
+
+ veor q12, q3, q0
+ veor q8, q7, q4
+ veor q11, q15, q14
+ veor q10, q13, q9
+ vand q11, q11, q12
+ vand q10, q10, q0
+ veor q12, q12, q8
+ veor q0, q0, q4
+ vand q8, q8, q15
+ vand q4, q4, q13
+ vand q12, q12, q14
+ vand q0, q0, q9
+ veor q8, q8, q12
+ veor q0, q0, q4
+ veor q12, q12, q11
+ veor q4, q4, q10
+ veor q15, q15, q13
+ veor q14, q14, q9
+ veor q10, q15, q14
+ vand q10, q10, q3
+ veor q3, q3, q7
+ vand q11, q7, q15
+ vand q3, q3, q14
+ veor q7, q11, q10
+ veor q3, q3, q11
+ veor q3, q3, q12
+ veor q0, q0, q12
+ veor q7, q7, q8
+ veor q4, q4, q8
+ veor q1, q1, q7
+ veor q6, q6, q5
+
+ veor q4, q4, q1
+ veor q2, q2, q7
+ veor q5, q5, q7
+ veor q4, q4, q2
+ veor q7, q7, q0
+ veor q4, q4, q5
+ veor q3, q3, q6
+ veor q6, q6, q1
+ veor q3, q3, q4
+
+ veor q4, q4, q0
+ veor q7, q7, q3
+ subs r5,r5,#1
+ bcc .Ldec_done
+ @ multiplication by 0x05-0x00-0x04-0x00
+ vext.8 q8, q0, q0, #8
+ vext.8 q14, q3, q3, #8
+ vext.8 q15, q5, q5, #8
+ veor q8, q8, q0
+ vext.8 q9, q1, q1, #8
+ veor q14, q14, q3
+ vext.8 q10, q6, q6, #8
+ veor q15, q15, q5
+ vext.8 q11, q4, q4, #8
+ veor q9, q9, q1
+ vext.8 q12, q2, q2, #8
+ veor q10, q10, q6
+ vext.8 q13, q7, q7, #8
+ veor q11, q11, q4
+ veor q12, q12, q2
+ veor q13, q13, q7
+
+ veor q0, q0, q14
+ veor q1, q1, q14
+ veor q6, q6, q8
+ veor q2, q2, q10
+ veor q4, q4, q9
+ veor q1, q1, q15
+ veor q6, q6, q15
+ veor q2, q2, q14
+ veor q7, q7, q11
+ veor q4, q4, q14
+ veor q3, q3, q12
+ veor q2, q2, q15
+ veor q7, q7, q15
+ veor q5, q5, q13
+ vext.8 q8, q0, q0, #12 @ x0 <<< 32
+ vext.8 q9, q1, q1, #12
+ veor q0, q0, q8 @ x0 ^ (x0 <<< 32)
+ vext.8 q10, q6, q6, #12
+ veor q1, q1, q9
+ vext.8 q11, q4, q4, #12
+ veor q6, q6, q10
+ vext.8 q12, q2, q2, #12
+ veor q4, q4, q11
+ vext.8 q13, q7, q7, #12
+ veor q2, q2, q12
+ vext.8 q14, q3, q3, #12
+ veor q7, q7, q13
+ vext.8 q15, q5, q5, #12
+ veor q3, q3, q14
+
+ veor q9, q9, q0
+ veor q5, q5, q15
+ vext.8 q0, q0, q0, #8 @ (x0 ^ (x0 <<< 32)) <<< 64)
+ veor q10, q10, q1
+ veor q8, q8, q5
+ veor q9, q9, q5
+ vext.8 q1, q1, q1, #8
+ veor q13, q13, q2
+ veor q0, q0, q8
+ veor q14, q14, q7
+ veor q1, q1, q9
+ vext.8 q8, q2, q2, #8
+ veor q12, q12, q4
+ vext.8 q9, q7, q7, #8
+ veor q15, q15, q3
+ vext.8 q2, q4, q4, #8
+ veor q11, q11, q6
+ vext.8 q7, q5, q5, #8
+ veor q12, q12, q5
+ vext.8 q4, q3, q3, #8
+ veor q11, q11, q5
+ vext.8 q3, q6, q6, #8
+ veor q5, q9, q13
+ veor q11, q11, q2
+ veor q7, q7, q15
+ veor q6, q4, q14
+ veor q4, q8, q12
+ veor q2, q3, q10
+ vmov q3, q11
+ @ vmov q5, q9
+ vldmia r6, {q12} @ .LISR
+ ite eq @ Thumb2 thing, sanity check in ARM
+ addeq r6,r6,#0x10
+ bne .Ldec_loop
+ vldmia r6, {q12} @ .LISRM0
+ b .Ldec_loop
+.align 4
+.Ldec_done:
+ vmov.i8 q8,#0x55 @ compose .LBS0
+ vmov.i8 q9,#0x33 @ compose .LBS1
+ vshr.u64 q10, q3, #1
+ vshr.u64 q11, q2, #1
+ veor q10, q10, q5
+ veor q11, q11, q7
+ vand q10, q10, q8
+ vand q11, q11, q8
+ veor q5, q5, q10
+ vshl.u64 q10, q10, #1
+ veor q7, q7, q11
+ vshl.u64 q11, q11, #1
+ veor q3, q3, q10
+ veor q2, q2, q11
+ vshr.u64 q10, q6, #1
+ vshr.u64 q11, q0, #1
+ veor q10, q10, q4
+ veor q11, q11, q1
+ vand q10, q10, q8
+ vand q11, q11, q8
+ veor q4, q4, q10
+ vshl.u64 q10, q10, #1
+ veor q1, q1, q11
+ vshl.u64 q11, q11, #1
+ veor q6, q6, q10
+ veor q0, q0, q11
+ vmov.i8 q8,#0x0f @ compose .LBS2
+ vshr.u64 q10, q7, #2
+ vshr.u64 q11, q2, #2
+ veor q10, q10, q5
+ veor q11, q11, q3
+ vand q10, q10, q9
+ vand q11, q11, q9
+ veor q5, q5, q10
+ vshl.u64 q10, q10, #2
+ veor q3, q3, q11
+ vshl.u64 q11, q11, #2
+ veor q7, q7, q10
+ veor q2, q2, q11
+ vshr.u64 q10, q1, #2
+ vshr.u64 q11, q0, #2
+ veor q10, q10, q4
+ veor q11, q11, q6
+ vand q10, q10, q9
+ vand q11, q11, q9
+ veor q4, q4, q10
+ vshl.u64 q10, q10, #2
+ veor q6, q6, q11
+ vshl.u64 q11, q11, #2
+ veor q1, q1, q10
+ veor q0, q0, q11
+ vshr.u64 q10, q4, #4
+ vshr.u64 q11, q6, #4
+ veor q10, q10, q5
+ veor q11, q11, q3
+ vand q10, q10, q8
+ vand q11, q11, q8
+ veor q5, q5, q10
+ vshl.u64 q10, q10, #4
+ veor q3, q3, q11
+ vshl.u64 q11, q11, #4
+ veor q4, q4, q10
+ veor q6, q6, q11
+ vshr.u64 q10, q1, #4
+ vshr.u64 q11, q0, #4
+ veor q10, q10, q7
+ veor q11, q11, q2
+ vand q10, q10, q8
+ vand q11, q11, q8
+ veor q7, q7, q10
+ vshl.u64 q10, q10, #4
+ veor q2, q2, q11
+ vshl.u64 q11, q11, #4
+ veor q1, q1, q10
+ veor q0, q0, q11
+ vldmia r4, {q8} @ last round key
+ veor q6, q6, q8
+ veor q4, q4, q8
+ veor q2, q2, q8
+ veor q7, q7, q8
+ veor q3, q3, q8
+ veor q5, q5, q8
+ veor q0, q0, q8
+ veor q1, q1, q8
+ bx lr
+.size _bsaes_decrypt8,.-_bsaes_decrypt8
+
+.type _bsaes_const,%object
+.align 6
+_bsaes_const:
+.LM0ISR: @ InvShiftRows constants
+ .quad 0x0a0e0206070b0f03, 0x0004080c0d010509
+.LISR:
+ .quad 0x0504070602010003, 0x0f0e0d0c080b0a09
+.LISRM0:
+ .quad 0x01040b0e0205080f, 0x0306090c00070a0d
+.LM0SR: @ ShiftRows constants
+ .quad 0x0a0e02060f03070b, 0x0004080c05090d01
+.LSR:
+ .quad 0x0504070600030201, 0x0f0e0d0c0a09080b
+.LSRM0:
+ .quad 0x0304090e00050a0f, 0x01060b0c0207080d
+.LM0:
+ .quad 0x02060a0e03070b0f, 0x0004080c0105090d
+.LREVM0SR:
+ .quad 0x090d01050c000408, 0x03070b0f060a0e02
+.asciz "Bit-sliced AES for NEON, CRYPTOGAMS by <appro@openssl.org>"
+.align 6
+.size _bsaes_const,.-_bsaes_const
+
+.type _bsaes_encrypt8,%function
+.align 4
+_bsaes_encrypt8:
+ adr r6,_bsaes_encrypt8
+ vldmia r4!, {q9} @ round 0 key
+ sub r6,r6,#_bsaes_encrypt8-.LM0SR
+
+ vldmia r6!, {q8} @ .LM0SR
+_bsaes_encrypt8_alt:
+ veor q10, q0, q9 @ xor with round0 key
+ veor q11, q1, q9
+ vtbl.8 d0, {q10}, d16
+ vtbl.8 d1, {q10}, d17
+ veor q12, q2, q9
+ vtbl.8 d2, {q11}, d16
+ vtbl.8 d3, {q11}, d17
+ veor q13, q3, q9
+ vtbl.8 d4, {q12}, d16
+ vtbl.8 d5, {q12}, d17
+ veor q14, q4, q9
+ vtbl.8 d6, {q13}, d16
+ vtbl.8 d7, {q13}, d17
+ veor q15, q5, q9
+ vtbl.8 d8, {q14}, d16
+ vtbl.8 d9, {q14}, d17
+ veor q10, q6, q9
+ vtbl.8 d10, {q15}, d16
+ vtbl.8 d11, {q15}, d17
+ veor q11, q7, q9
+ vtbl.8 d12, {q10}, d16
+ vtbl.8 d13, {q10}, d17
+ vtbl.8 d14, {q11}, d16
+ vtbl.8 d15, {q11}, d17
+_bsaes_encrypt8_bitslice:
+ vmov.i8 q8,#0x55 @ compose .LBS0
+ vmov.i8 q9,#0x33 @ compose .LBS1
+ vshr.u64 q10, q6, #1
+ vshr.u64 q11, q4, #1
+ veor q10, q10, q7
+ veor q11, q11, q5
+ vand q10, q10, q8
+ vand q11, q11, q8
+ veor q7, q7, q10
+ vshl.u64 q10, q10, #1
+ veor q5, q5, q11
+ vshl.u64 q11, q11, #1
+ veor q6, q6, q10
+ veor q4, q4, q11
+ vshr.u64 q10, q2, #1
+ vshr.u64 q11, q0, #1
+ veor q10, q10, q3
+ veor q11, q11, q1
+ vand q10, q10, q8
+ vand q11, q11, q8
+ veor q3, q3, q10
+ vshl.u64 q10, q10, #1
+ veor q1, q1, q11
+ vshl.u64 q11, q11, #1
+ veor q2, q2, q10
+ veor q0, q0, q11
+ vmov.i8 q8,#0x0f @ compose .LBS2
+ vshr.u64 q10, q5, #2
+ vshr.u64 q11, q4, #2
+ veor q10, q10, q7
+ veor q11, q11, q6
+ vand q10, q10, q9
+ vand q11, q11, q9
+ veor q7, q7, q10
+ vshl.u64 q10, q10, #2
+ veor q6, q6, q11
+ vshl.u64 q11, q11, #2
+ veor q5, q5, q10
+ veor q4, q4, q11
+ vshr.u64 q10, q1, #2
+ vshr.u64 q11, q0, #2
+ veor q10, q10, q3
+ veor q11, q11, q2
+ vand q10, q10, q9
+ vand q11, q11, q9
+ veor q3, q3, q10
+ vshl.u64 q10, q10, #2
+ veor q2, q2, q11
+ vshl.u64 q11, q11, #2
+ veor q1, q1, q10
+ veor q0, q0, q11
+ vshr.u64 q10, q3, #4
+ vshr.u64 q11, q2, #4
+ veor q10, q10, q7
+ veor q11, q11, q6
+ vand q10, q10, q8
+ vand q11, q11, q8
+ veor q7, q7, q10
+ vshl.u64 q10, q10, #4
+ veor q6, q6, q11
+ vshl.u64 q11, q11, #4
+ veor q3, q3, q10
+ veor q2, q2, q11
+ vshr.u64 q10, q1, #4
+ vshr.u64 q11, q0, #4
+ veor q10, q10, q5
+ veor q11, q11, q4
+ vand q10, q10, q8
+ vand q11, q11, q8
+ veor q5, q5, q10
+ vshl.u64 q10, q10, #4
+ veor q4, q4, q11
+ vshl.u64 q11, q11, #4
+ veor q1, q1, q10
+ veor q0, q0, q11
+ sub r5,r5,#1
+ b .Lenc_sbox
+.align 4
+.Lenc_loop:
+ vldmia r4!, {q8-q11}
+ veor q8, q8, q0
+ veor q9, q9, q1
+ vtbl.8 d0, {q8}, d24
+ vtbl.8 d1, {q8}, d25
+ vldmia r4!, {q8}
+ veor q10, q10, q2
+ vtbl.8 d2, {q9}, d24
+ vtbl.8 d3, {q9}, d25
+ vldmia r4!, {q9}
+ veor q11, q11, q3
+ vtbl.8 d4, {q10}, d24
+ vtbl.8 d5, {q10}, d25
+ vldmia r4!, {q10}
+ vtbl.8 d6, {q11}, d24
+ vtbl.8 d7, {q11}, d25
+ vldmia r4!, {q11}
+ veor q8, q8, q4
+ veor q9, q9, q5
+ vtbl.8 d8, {q8}, d24
+ vtbl.8 d9, {q8}, d25
+ veor q10, q10, q6
+ vtbl.8 d10, {q9}, d24
+ vtbl.8 d11, {q9}, d25
+ veor q11, q11, q7
+ vtbl.8 d12, {q10}, d24
+ vtbl.8 d13, {q10}, d25
+ vtbl.8 d14, {q11}, d24
+ vtbl.8 d15, {q11}, d25
+.Lenc_sbox:
+ veor q2, q2, q1
+ veor q5, q5, q6
+ veor q3, q3, q0
+ veor q6, q6, q2
+ veor q5, q5, q0
+
+ veor q6, q6, q3
+ veor q3, q3, q7
+ veor q7, q7, q5
+ veor q3, q3, q4
+ veor q4, q4, q5
+
+ veor q2, q2, q7
+ veor q3, q3, q1
+ veor q1, q1, q5
+ veor q11, q7, q4
+ veor q10, q1, q2
+ veor q9, q5, q3
+ veor q13, q2, q4
+ vmov q8, q10
+ veor q12, q6, q0
+
+ vorr q10, q10, q9
+ veor q15, q11, q8
+ vand q14, q11, q12
+ vorr q11, q11, q12
+ veor q12, q12, q9
+ vand q8, q8, q9
+ veor q9, q3, q0
+ vand q15, q15, q12
+ vand q13, q13, q9
+ veor q9, q7, q1
+ veor q12, q5, q6
+ veor q11, q11, q13
+ veor q10, q10, q13
+ vand q13, q9, q12
+ vorr q9, q9, q12
+ veor q11, q11, q15
+ veor q8, q8, q13
+ veor q10, q10, q14
+ veor q9, q9, q15
+ veor q8, q8, q14
+ vand q12, q2, q3
+ veor q9, q9, q14
+ vand q13, q4, q0
+ vand q14, q1, q5
+ vorr q15, q7, q6
+ veor q11, q11, q12
+ veor q9, q9, q14
+ veor q8, q8, q15
+ veor q10, q10, q13
+
+ @ Inv_GF16 0, 1, 2, 3, s0, s1, s2, s3
+
+ @ new smaller inversion
+
+ vand q14, q11, q9
+ vmov q12, q8
+
+ veor q13, q10, q14
+ veor q15, q8, q14
+ veor q14, q8, q14 @ q14=q15
+
+ vbsl q13, q9, q8
+ vbsl q15, q11, q10
+ veor q11, q11, q10
+
+ vbsl q12, q13, q14
+ vbsl q8, q14, q13
+
+ vand q14, q12, q15
+ veor q9, q9, q8
+
+ veor q14, q14, q11
+ veor q12, q6, q0
+ veor q8, q5, q3
+ veor q10, q15, q14
+ vand q10, q10, q6
+ veor q6, q6, q5
+ vand q11, q5, q15
+ vand q6, q6, q14
+ veor q5, q11, q10
+ veor q6, q6, q11
+ veor q15, q15, q13
+ veor q14, q14, q9
+ veor q11, q15, q14
+ veor q10, q13, q9
+ vand q11, q11, q12
+ vand q10, q10, q0
+ veor q12, q12, q8
+ veor q0, q0, q3
+ vand q8, q8, q15
+ vand q3, q3, q13
+ vand q12, q12, q14
+ vand q0, q0, q9
+ veor q8, q8, q12
+ veor q0, q0, q3
+ veor q12, q12, q11
+ veor q3, q3, q10
+ veor q6, q6, q12
+ veor q0, q0, q12
+ veor q5, q5, q8
+ veor q3, q3, q8
+
+ veor q12, q7, q4
+ veor q8, q1, q2
+ veor q11, q15, q14
+ veor q10, q13, q9
+ vand q11, q11, q12
+ vand q10, q10, q4
+ veor q12, q12, q8
+ veor q4, q4, q2
+ vand q8, q8, q15
+ vand q2, q2, q13
+ vand q12, q12, q14
+ vand q4, q4, q9
+ veor q8, q8, q12
+ veor q4, q4, q2
+ veor q12, q12, q11
+ veor q2, q2, q10
+ veor q15, q15, q13
+ veor q14, q14, q9
+ veor q10, q15, q14
+ vand q10, q10, q7
+ veor q7, q7, q1
+ vand q11, q1, q15
+ vand q7, q7, q14
+ veor q1, q11, q10
+ veor q7, q7, q11
+ veor q7, q7, q12
+ veor q4, q4, q12
+ veor q1, q1, q8
+ veor q2, q2, q8
+ veor q7, q7, q0
+ veor q1, q1, q6
+ veor q6, q6, q0
+ veor q4, q4, q7
+ veor q0, q0, q1
+
+ veor q1, q1, q5
+ veor q5, q5, q2
+ veor q2, q2, q3
+ veor q3, q3, q5
+ veor q4, q4, q5
+
+ veor q6, q6, q3
+ subs r5,r5,#1
+ bcc .Lenc_done
+ vext.8 q8, q0, q0, #12 @ x0 <<< 32
+ vext.8 q9, q1, q1, #12
+ veor q0, q0, q8 @ x0 ^ (x0 <<< 32)
+ vext.8 q10, q4, q4, #12
+ veor q1, q1, q9
+ vext.8 q11, q6, q6, #12
+ veor q4, q4, q10
+ vext.8 q12, q3, q3, #12
+ veor q6, q6, q11
+ vext.8 q13, q7, q7, #12
+ veor q3, q3, q12
+ vext.8 q14, q2, q2, #12
+ veor q7, q7, q13
+ vext.8 q15, q5, q5, #12
+ veor q2, q2, q14
+
+ veor q9, q9, q0
+ veor q5, q5, q15
+ vext.8 q0, q0, q0, #8 @ (x0 ^ (x0 <<< 32)) <<< 64)
+ veor q10, q10, q1
+ veor q8, q8, q5
+ veor q9, q9, q5
+ vext.8 q1, q1, q1, #8
+ veor q13, q13, q3
+ veor q0, q0, q8
+ veor q14, q14, q7
+ veor q1, q1, q9
+ vext.8 q8, q3, q3, #8
+ veor q12, q12, q6
+ vext.8 q9, q7, q7, #8
+ veor q15, q15, q2
+ vext.8 q3, q6, q6, #8
+ veor q11, q11, q4
+ vext.8 q7, q5, q5, #8
+ veor q12, q12, q5
+ vext.8 q6, q2, q2, #8
+ veor q11, q11, q5
+ vext.8 q2, q4, q4, #8
+ veor q5, q9, q13
+ veor q4, q8, q12
+ veor q3, q3, q11
+ veor q7, q7, q15
+ veor q6, q6, q14
+ @ vmov q4, q8
+ veor q2, q2, q10
+ @ vmov q5, q9
+ vldmia r6, {q12} @ .LSR
+ ite eq @ Thumb2 thing, samity check in ARM
+ addeq r6,r6,#0x10
+ bne .Lenc_loop
+ vldmia r6, {q12} @ .LSRM0
+ b .Lenc_loop
+.align 4
+.Lenc_done:
+ vmov.i8 q8,#0x55 @ compose .LBS0
+ vmov.i8 q9,#0x33 @ compose .LBS1
+ vshr.u64 q10, q2, #1
+ vshr.u64 q11, q3, #1
+ veor q10, q10, q5
+ veor q11, q11, q7
+ vand q10, q10, q8
+ vand q11, q11, q8
+ veor q5, q5, q10
+ vshl.u64 q10, q10, #1
+ veor q7, q7, q11
+ vshl.u64 q11, q11, #1
+ veor q2, q2, q10
+ veor q3, q3, q11
+ vshr.u64 q10, q4, #1
+ vshr.u64 q11, q0, #1
+ veor q10, q10, q6
+ veor q11, q11, q1
+ vand q10, q10, q8
+ vand q11, q11, q8
+ veor q6, q6, q10
+ vshl.u64 q10, q10, #1
+ veor q1, q1, q11
+ vshl.u64 q11, q11, #1
+ veor q4, q4, q10
+ veor q0, q0, q11
+ vmov.i8 q8,#0x0f @ compose .LBS2
+ vshr.u64 q10, q7, #2
+ vshr.u64 q11, q3, #2
+ veor q10, q10, q5
+ veor q11, q11, q2
+ vand q10, q10, q9
+ vand q11, q11, q9
+ veor q5, q5, q10
+ vshl.u64 q10, q10, #2
+ veor q2, q2, q11
+ vshl.u64 q11, q11, #2
+ veor q7, q7, q10
+ veor q3, q3, q11
+ vshr.u64 q10, q1, #2
+ vshr.u64 q11, q0, #2
+ veor q10, q10, q6
+ veor q11, q11, q4
+ vand q10, q10, q9
+ vand q11, q11, q9
+ veor q6, q6, q10
+ vshl.u64 q10, q10, #2
+ veor q4, q4, q11
+ vshl.u64 q11, q11, #2
+ veor q1, q1, q10
+ veor q0, q0, q11
+ vshr.u64 q10, q6, #4
+ vshr.u64 q11, q4, #4
+ veor q10, q10, q5
+ veor q11, q11, q2
+ vand q10, q10, q8
+ vand q11, q11, q8
+ veor q5, q5, q10
+ vshl.u64 q10, q10, #4
+ veor q2, q2, q11
+ vshl.u64 q11, q11, #4
+ veor q6, q6, q10
+ veor q4, q4, q11
+ vshr.u64 q10, q1, #4
+ vshr.u64 q11, q0, #4
+ veor q10, q10, q7
+ veor q11, q11, q3
+ vand q10, q10, q8
+ vand q11, q11, q8
+ veor q7, q7, q10
+ vshl.u64 q10, q10, #4
+ veor q3, q3, q11
+ vshl.u64 q11, q11, #4
+ veor q1, q1, q10
+ veor q0, q0, q11
+ vldmia r4, {q8} @ last round key
+ veor q4, q4, q8
+ veor q6, q6, q8
+ veor q3, q3, q8
+ veor q7, q7, q8
+ veor q2, q2, q8
+ veor q5, q5, q8
+ veor q0, q0, q8
+ veor q1, q1, q8
+ bx lr
+.size _bsaes_encrypt8,.-_bsaes_encrypt8
+.type _bsaes_key_convert,%function
+.align 4
+_bsaes_key_convert:
+ adr r6,_bsaes_key_convert
+ vld1.8 {q7}, [r4]! @ load round 0 key
+ sub r6,r6,#_bsaes_key_convert-.LM0
+ vld1.8 {q15}, [r4]! @ load round 1 key
+
+ vmov.i8 q8, #0x01 @ bit masks
+ vmov.i8 q9, #0x02
+ vmov.i8 q10, #0x04
+ vmov.i8 q11, #0x08
+ vmov.i8 q12, #0x10
+ vmov.i8 q13, #0x20
+ vldmia r6, {q14} @ .LM0
+
+#ifdef __ARMEL__
+ vrev32.8 q7, q7
+ vrev32.8 q15, q15
+#endif
+ sub r5,r5,#1
+ vstmia r12!, {q7} @ save round 0 key
+ b .Lkey_loop
+
+.align 4
+.Lkey_loop:
+ vtbl.8 d14,{q15},d28
+ vtbl.8 d15,{q15},d29
+ vmov.i8 q6, #0x40
+ vmov.i8 q15, #0x80
+
+ vtst.8 q0, q7, q8
+ vtst.8 q1, q7, q9
+ vtst.8 q2, q7, q10
+ vtst.8 q3, q7, q11
+ vtst.8 q4, q7, q12
+ vtst.8 q5, q7, q13
+ vtst.8 q6, q7, q6
+ vtst.8 q7, q7, q15
+ vld1.8 {q15}, [r4]! @ load next round key
+ vmvn q0, q0 @ "pnot"
+ vmvn q1, q1
+ vmvn q5, q5
+ vmvn q6, q6
+#ifdef __ARMEL__
+ vrev32.8 q15, q15
+#endif
+ subs r5,r5,#1
+ vstmia r12!,{q0-q7} @ write bit-sliced round key
+ bne .Lkey_loop
+
+ vmov.i8 q7,#0x63 @ compose .L63
+ @ don't save last round key
+ bx lr
+.size _bsaes_key_convert,.-_bsaes_key_convert
+.extern AES_cbc_encrypt
+.extern AES_decrypt
+
+.global bsaes_cbc_encrypt
+.type bsaes_cbc_encrypt,%function
+.align 5
+bsaes_cbc_encrypt:
+#ifndef __KERNEL__
+ cmp r2, #128
+#ifndef __thumb__
+ blo AES_cbc_encrypt
+#else
+ bhs 1f
+ b AES_cbc_encrypt
+1:
+#endif
+#endif
+
+ @ it is up to the caller to make sure we are called with enc == 0
+
+ mov ip, sp
+ stmdb sp!, {r4-r10, lr}
+ VFP_ABI_PUSH
+ ldr r8, [ip] @ IV is 1st arg on the stack
+ mov r2, r2, lsr#4 @ len in 16 byte blocks
+ sub sp, #0x10 @ scratch space to carry over the IV
+ mov r9, sp @ save sp
+
+ ldr r10, [r3, #240] @ get # of rounds
+#ifndef BSAES_ASM_EXTENDED_KEY
+ @ allocate the key schedule on the stack
+ sub r12, sp, r10, lsl#7 @ 128 bytes per inner round key
+ add r12, #96 @ sifze of bit-slices key schedule
+
+ @ populate the key schedule
+ mov r4, r3 @ pass key
+ mov r5, r10 @ pass # of rounds
+ mov sp, r12 @ sp is sp
+ bl _bsaes_key_convert
+ vldmia sp, {q6}
+ vstmia r12, {q15} @ save last round key
+ veor q7, q7, q6 @ fix up round 0 key
+ vstmia sp, {q7}
+#else
+ ldr r12, [r3, #244]
+ eors r12, #1
+ beq 0f
+
+ @ populate the key schedule
+ str r12, [r3, #244]
+ mov r4, r3 @ pass key
+ mov r5, r10 @ pass # of rounds
+ add r12, r3, #248 @ pass key schedule
+ bl _bsaes_key_convert
+ add r4, r3, #248
+ vldmia r4, {q6}
+ vstmia r12, {q15} @ save last round key
+ veor q7, q7, q6 @ fix up round 0 key
+ vstmia r4, {q7}
+
+.align 2
+0:
+#endif
+
+ vld1.8 {q15}, [r8] @ load IV
+ b .Lcbc_dec_loop
+
+.align 4
+.Lcbc_dec_loop:
+ subs r2, r2, #0x8
+ bmi .Lcbc_dec_loop_finish
+
+ vld1.8 {q0-q1}, [r0]! @ load input
+ vld1.8 {q2-q3}, [r0]!
+#ifndef BSAES_ASM_EXTENDED_KEY
+ mov r4, sp @ pass the key
+#else
+ add r4, r3, #248
+#endif
+ vld1.8 {q4-q5}, [r0]!
+ mov r5, r10
+ vld1.8 {q6-q7}, [r0]
+ sub r0, r0, #0x60
+ vstmia r9, {q15} @ put aside IV
+
+ bl _bsaes_decrypt8
+
+ vldmia r9, {q14} @ reload IV
+ vld1.8 {q8-q9}, [r0]! @ reload input
+ veor q0, q0, q14 @ ^= IV
+ vld1.8 {q10-q11}, [r0]!
+ veor q1, q1, q8
+ veor q6, q6, q9
+ vld1.8 {q12-q13}, [r0]!
+ veor q4, q4, q10
+ veor q2, q2, q11
+ vld1.8 {q14-q15}, [r0]!
+ veor q7, q7, q12
+ vst1.8 {q0-q1}, [r1]! @ write output
+ veor q3, q3, q13
+ vst1.8 {q6}, [r1]!
+ veor q5, q5, q14
+ vst1.8 {q4}, [r1]!
+ vst1.8 {q2}, [r1]!
+ vst1.8 {q7}, [r1]!
+ vst1.8 {q3}, [r1]!
+ vst1.8 {q5}, [r1]!
+
+ b .Lcbc_dec_loop
+
+.Lcbc_dec_loop_finish:
+ adds r2, r2, #8
+ beq .Lcbc_dec_done
+
+ vld1.8 {q0}, [r0]! @ load input
+ cmp r2, #2
+ blo .Lcbc_dec_one
+ vld1.8 {q1}, [r0]!
+#ifndef BSAES_ASM_EXTENDED_KEY
+ mov r4, sp @ pass the key
+#else
+ add r4, r3, #248
+#endif
+ mov r5, r10
+ vstmia r9, {q15} @ put aside IV
+ beq .Lcbc_dec_two
+ vld1.8 {q2}, [r0]!
+ cmp r2, #4
+ blo .Lcbc_dec_three
+ vld1.8 {q3}, [r0]!
+ beq .Lcbc_dec_four
+ vld1.8 {q4}, [r0]!
+ cmp r2, #6
+ blo .Lcbc_dec_five
+ vld1.8 {q5}, [r0]!
+ beq .Lcbc_dec_six
+ vld1.8 {q6}, [r0]!
+ sub r0, r0, #0x70
+
+ bl _bsaes_decrypt8
+
+ vldmia r9, {q14} @ reload IV
+ vld1.8 {q8-q9}, [r0]! @ reload input
+ veor q0, q0, q14 @ ^= IV
+ vld1.8 {q10-q11}, [r0]!
+ veor q1, q1, q8
+ veor q6, q6, q9
+ vld1.8 {q12-q13}, [r0]!
+ veor q4, q4, q10
+ veor q2, q2, q11
+ vld1.8 {q15}, [r0]!
+ veor q7, q7, q12
+ vst1.8 {q0-q1}, [r1]! @ write output
+ veor q3, q3, q13
+ vst1.8 {q6}, [r1]!
+ vst1.8 {q4}, [r1]!
+ vst1.8 {q2}, [r1]!
+ vst1.8 {q7}, [r1]!
+ vst1.8 {q3}, [r1]!
+ b .Lcbc_dec_done
+.align 4
+.Lcbc_dec_six:
+ sub r0, r0, #0x60
+ bl _bsaes_decrypt8
+ vldmia r9,{q14} @ reload IV
+ vld1.8 {q8-q9}, [r0]! @ reload input
+ veor q0, q0, q14 @ ^= IV
+ vld1.8 {q10-q11}, [r0]!
+ veor q1, q1, q8
+ veor q6, q6, q9
+ vld1.8 {q12}, [r0]!
+ veor q4, q4, q10
+ veor q2, q2, q11
+ vld1.8 {q15}, [r0]!
+ veor q7, q7, q12
+ vst1.8 {q0-q1}, [r1]! @ write output
+ vst1.8 {q6}, [r1]!
+ vst1.8 {q4}, [r1]!
+ vst1.8 {q2}, [r1]!
+ vst1.8 {q7}, [r1]!
+ b .Lcbc_dec_done
+.align 4
+.Lcbc_dec_five:
+ sub r0, r0, #0x50
+ bl _bsaes_decrypt8
+ vldmia r9, {q14} @ reload IV
+ vld1.8 {q8-q9}, [r0]! @ reload input
+ veor q0, q0, q14 @ ^= IV
+ vld1.8 {q10-q11}, [r0]!
+ veor q1, q1, q8
+ veor q6, q6, q9
+ vld1.8 {q15}, [r0]!
+ veor q4, q4, q10
+ vst1.8 {q0-q1}, [r1]! @ write output
+ veor q2, q2, q11
+ vst1.8 {q6}, [r1]!
+ vst1.8 {q4}, [r1]!
+ vst1.8 {q2}, [r1]!
+ b .Lcbc_dec_done
+.align 4
+.Lcbc_dec_four:
+ sub r0, r0, #0x40
+ bl _bsaes_decrypt8
+ vldmia r9, {q14} @ reload IV
+ vld1.8 {q8-q9}, [r0]! @ reload input
+ veor q0, q0, q14 @ ^= IV
+ vld1.8 {q10}, [r0]!
+ veor q1, q1, q8
+ veor q6, q6, q9
+ vld1.8 {q15}, [r0]!
+ veor q4, q4, q10
+ vst1.8 {q0-q1}, [r1]! @ write output
+ vst1.8 {q6}, [r1]!
+ vst1.8 {q4}, [r1]!
+ b .Lcbc_dec_done
+.align 4
+.Lcbc_dec_three:
+ sub r0, r0, #0x30
+ bl _bsaes_decrypt8
+ vldmia r9, {q14} @ reload IV
+ vld1.8 {q8-q9}, [r0]! @ reload input
+ veor q0, q0, q14 @ ^= IV
+ vld1.8 {q15}, [r0]!
+ veor q1, q1, q8
+ veor q6, q6, q9
+ vst1.8 {q0-q1}, [r1]! @ write output
+ vst1.8 {q6}, [r1]!
+ b .Lcbc_dec_done
+.align 4
+.Lcbc_dec_two:
+ sub r0, r0, #0x20
+ bl _bsaes_decrypt8
+ vldmia r9, {q14} @ reload IV
+ vld1.8 {q8}, [r0]! @ reload input
+ veor q0, q0, q14 @ ^= IV
+ vld1.8 {q15}, [r0]! @ reload input
+ veor q1, q1, q8
+ vst1.8 {q0-q1}, [r1]! @ write output
+ b .Lcbc_dec_done
+.align 4
+.Lcbc_dec_one:
+ sub r0, r0, #0x10
+ mov r10, r1 @ save original out pointer
+ mov r1, r9 @ use the iv scratch space as out buffer
+ mov r2, r3
+ vmov q4,q15 @ just in case ensure that IV
+ vmov q5,q0 @ and input are preserved
+ bl AES_decrypt
+ vld1.8 {q0}, [r9,:64] @ load result
+ veor q0, q0, q4 @ ^= IV
+ vmov q15, q5 @ q5 holds input
+ vst1.8 {q0}, [r10] @ write output
+
+.Lcbc_dec_done:
+#ifndef BSAES_ASM_EXTENDED_KEY
+ vmov.i32 q0, #0
+ vmov.i32 q1, #0
+.Lcbc_dec_bzero: @ wipe key schedule [if any]
+ vstmia sp!, {q0-q1}
+ cmp sp, r9
+ bne .Lcbc_dec_bzero
+#endif
+
+ mov sp, r9
+ add sp, #0x10 @ add sp,r9,#0x10 is no good for thumb
+ vst1.8 {q15}, [r8] @ return IV
+ VFP_ABI_POP
+ ldmia sp!, {r4-r10, pc}
+.size bsaes_cbc_encrypt,.-bsaes_cbc_encrypt
+.extern AES_encrypt
+.global bsaes_ctr32_encrypt_blocks
+.type bsaes_ctr32_encrypt_blocks,%function
+.align 5
+bsaes_ctr32_encrypt_blocks:
+ cmp r2, #8 @ use plain AES for
+ blo .Lctr_enc_short @ small sizes
+
+ mov ip, sp
+ stmdb sp!, {r4-r10, lr}
+ VFP_ABI_PUSH
+ ldr r8, [ip] @ ctr is 1st arg on the stack
+ sub sp, sp, #0x10 @ scratch space to carry over the ctr
+ mov r9, sp @ save sp
+
+ ldr r10, [r3, #240] @ get # of rounds
+#ifndef BSAES_ASM_EXTENDED_KEY
+ @ allocate the key schedule on the stack
+ sub r12, sp, r10, lsl#7 @ 128 bytes per inner round key
+ add r12, #96 @ size of bit-sliced key schedule
+
+ @ populate the key schedule
+ mov r4, r3 @ pass key
+ mov r5, r10 @ pass # of rounds
+ mov sp, r12 @ sp is sp
+ bl _bsaes_key_convert
+ veor q7,q7,q15 @ fix up last round key
+ vstmia r12, {q7} @ save last round key
+
+ vld1.8 {q0}, [r8] @ load counter
+ add r8, r6, #.LREVM0SR-.LM0 @ borrow r8
+ vldmia sp, {q4} @ load round0 key
+#else
+ ldr r12, [r3, #244]
+ eors r12, #1
+ beq 0f
+
+ @ populate the key schedule
+ str r12, [r3, #244]
+ mov r4, r3 @ pass key
+ mov r5, r10 @ pass # of rounds
+ add r12, r3, #248 @ pass key schedule
+ bl _bsaes_key_convert
+ veor q7,q7,q15 @ fix up last round key
+ vstmia r12, {q7} @ save last round key
+
+.align 2
+0: add r12, r3, #248
+ vld1.8 {q0}, [r8] @ load counter
+ adrl r8, .LREVM0SR @ borrow r8
+ vldmia r12, {q4} @ load round0 key
+ sub sp, #0x10 @ place for adjusted round0 key
+#endif
+
+ vmov.i32 q8,#1 @ compose 1<<96
+ veor q9,q9,q9
+ vrev32.8 q0,q0
+ vext.8 q8,q9,q8,#4
+ vrev32.8 q4,q4
+ vadd.u32 q9,q8,q8 @ compose 2<<96
+ vstmia sp, {q4} @ save adjusted round0 key
+ b .Lctr_enc_loop
+
+.align 4
+.Lctr_enc_loop:
+ vadd.u32 q10, q8, q9 @ compose 3<<96
+ vadd.u32 q1, q0, q8 @ +1
+ vadd.u32 q2, q0, q9 @ +2
+ vadd.u32 q3, q0, q10 @ +3
+ vadd.u32 q4, q1, q10
+ vadd.u32 q5, q2, q10
+ vadd.u32 q6, q3, q10
+ vadd.u32 q7, q4, q10
+ vadd.u32 q10, q5, q10 @ next counter
+
+ @ Borrow prologue from _bsaes_encrypt8 to use the opportunity
+ @ to flip byte order in 32-bit counter
+
+ vldmia sp, {q9} @ load round0 key
+#ifndef BSAES_ASM_EXTENDED_KEY
+ add r4, sp, #0x10 @ pass next round key
+#else
+ add r4, r3, #264
+#endif
+ vldmia r8, {q8} @ .LREVM0SR
+ mov r5, r10 @ pass rounds
+ vstmia r9, {q10} @ save next counter
+ sub r6, r8, #.LREVM0SR-.LSR @ pass constants
+
+ bl _bsaes_encrypt8_alt
+
+ subs r2, r2, #8
+ blo .Lctr_enc_loop_done
+
+ vld1.8 {q8-q9}, [r0]! @ load input
+ vld1.8 {q10-q11}, [r0]!
+ veor q0, q8
+ veor q1, q9
+ vld1.8 {q12-q13}, [r0]!
+ veor q4, q10
+ veor q6, q11
+ vld1.8 {q14-q15}, [r0]!
+ veor q3, q12
+ vst1.8 {q0-q1}, [r1]! @ write output
+ veor q7, q13
+ veor q2, q14
+ vst1.8 {q4}, [r1]!
+ veor q5, q15
+ vst1.8 {q6}, [r1]!
+ vmov.i32 q8, #1 @ compose 1<<96
+ vst1.8 {q3}, [r1]!
+ veor q9, q9, q9
+ vst1.8 {q7}, [r1]!
+ vext.8 q8, q9, q8, #4
+ vst1.8 {q2}, [r1]!
+ vadd.u32 q9,q8,q8 @ compose 2<<96
+ vst1.8 {q5}, [r1]!
+ vldmia r9, {q0} @ load counter
+
+ bne .Lctr_enc_loop
+ b .Lctr_enc_done
+
+.align 4
+.Lctr_enc_loop_done:
+ add r2, r2, #8
+ vld1.8 {q8}, [r0]! @ load input
+ veor q0, q8
+ vst1.8 {q0}, [r1]! @ write output
+ cmp r2, #2
+ blo .Lctr_enc_done
+ vld1.8 {q9}, [r0]!
+ veor q1, q9
+ vst1.8 {q1}, [r1]!
+ beq .Lctr_enc_done
+ vld1.8 {q10}, [r0]!
+ veor q4, q10
+ vst1.8 {q4}, [r1]!
+ cmp r2, #4
+ blo .Lctr_enc_done
+ vld1.8 {q11}, [r0]!
+ veor q6, q11
+ vst1.8 {q6}, [r1]!
+ beq .Lctr_enc_done
+ vld1.8 {q12}, [r0]!
+ veor q3, q12
+ vst1.8 {q3}, [r1]!
+ cmp r2, #6
+ blo .Lctr_enc_done
+ vld1.8 {q13}, [r0]!
+ veor q7, q13
+ vst1.8 {q7}, [r1]!
+ beq .Lctr_enc_done
+ vld1.8 {q14}, [r0]
+ veor q2, q14
+ vst1.8 {q2}, [r1]!
+
+.Lctr_enc_done:
+ vmov.i32 q0, #0
+ vmov.i32 q1, #0
+#ifndef BSAES_ASM_EXTENDED_KEY
+.Lctr_enc_bzero: @ wipe key schedule [if any]
+ vstmia sp!, {q0-q1}
+ cmp sp, r9
+ bne .Lctr_enc_bzero
+#else
+ vstmia sp, {q0-q1}
+#endif
+
+ mov sp, r9
+ add sp, #0x10 @ add sp,r9,#0x10 is no good for thumb
+ VFP_ABI_POP
+ ldmia sp!, {r4-r10, pc} @ return
+
+.align 4
+.Lctr_enc_short:
+ ldr ip, [sp] @ ctr pointer is passed on stack
+ stmdb sp!, {r4-r8, lr}
+
+ mov r4, r0 @ copy arguments
+ mov r5, r1
+ mov r6, r2
+ mov r7, r3
+ ldr r8, [ip, #12] @ load counter LSW
+ vld1.8 {q1}, [ip] @ load whole counter value
+#ifdef __ARMEL__
+ rev r8, r8
+#endif
+ sub sp, sp, #0x10
+ vst1.8 {q1}, [sp,:64] @ copy counter value
+ sub sp, sp, #0x10
+
+.Lctr_enc_short_loop:
+ add r0, sp, #0x10 @ input counter value
+ mov r1, sp @ output on the stack
+ mov r2, r7 @ key
+
+ bl AES_encrypt
+
+ vld1.8 {q0}, [r4]! @ load input
+ vld1.8 {q1}, [sp,:64] @ load encrypted counter
+ add r8, r8, #1
+#ifdef __ARMEL__
+ rev r0, r8
+ str r0, [sp, #0x1c] @ next counter value
+#else
+ str r8, [sp, #0x1c] @ next counter value
+#endif
+ veor q0,q0,q1
+ vst1.8 {q0}, [r5]! @ store output
+ subs r6, r6, #1
+ bne .Lctr_enc_short_loop
+
+ vmov.i32 q0, #0
+ vmov.i32 q1, #0
+ vstmia sp!, {q0-q1}
+
+ ldmia sp!, {r4-r8, pc}
+.size bsaes_ctr32_encrypt_blocks,.-bsaes_ctr32_encrypt_blocks
+.globl bsaes_xts_encrypt
+.type bsaes_xts_encrypt,%function
+.align 4
+bsaes_xts_encrypt:
+ mov ip, sp
+ stmdb sp!, {r4-r10, lr} @ 0x20
+ VFP_ABI_PUSH
+ mov r6, sp @ future r3
+
+ mov r7, r0
+ mov r8, r1
+ mov r9, r2
+ mov r10, r3
+
+ sub r0, sp, #0x10 @ 0x10
+ bic r0, #0xf @ align at 16 bytes
+ mov sp, r0
+
+#ifdef XTS_CHAIN_TWEAK
+ ldr r0, [ip] @ pointer to input tweak
+#else
+ @ generate initial tweak
+ ldr r0, [ip, #4] @ iv[]
+ mov r1, sp
+ ldr r2, [ip, #0] @ key2
+ bl AES_encrypt
+ mov r0,sp @ pointer to initial tweak
+#endif
+
+ ldr r1, [r10, #240] @ get # of rounds
+ mov r3, r6
+#ifndef BSAES_ASM_EXTENDED_KEY
+ @ allocate the key schedule on the stack
+ sub r12, sp, r1, lsl#7 @ 128 bytes per inner round key
+ @ add r12, #96 @ size of bit-sliced key schedule
+ sub r12, #48 @ place for tweak[9]
+
+ @ populate the key schedule
+ mov r4, r10 @ pass key
+ mov r5, r1 @ pass # of rounds
+ mov sp, r12
+ add r12, #0x90 @ pass key schedule
+ bl _bsaes_key_convert
+ veor q7, q7, q15 @ fix up last round key
+ vstmia r12, {q7} @ save last round key
+#else
+ ldr r12, [r10, #244]
+ eors r12, #1
+ beq 0f
+
+ str r12, [r10, #244]
+ mov r4, r10 @ pass key
+ mov r5, r1 @ pass # of rounds
+ add r12, r10, #248 @ pass key schedule
+ bl _bsaes_key_convert
+ veor q7, q7, q15 @ fix up last round key
+ vstmia r12, {q7}
+
+.align 2
+0: sub sp, #0x90 @ place for tweak[9]
+#endif
+
+ vld1.8 {q8}, [r0] @ initial tweak
+ adr r2, .Lxts_magic
+
+ subs r9, #0x80
+ blo .Lxts_enc_short
+ b .Lxts_enc_loop
+
+.align 4
+.Lxts_enc_loop:
+ vldmia r2, {q5} @ load XTS magic
+ vshr.s64 q6, q8, #63
+ mov r0, sp
+ vand q6, q6, q5
+ vadd.u64 q9, q8, q8
+ vst1.64 {q8}, [r0,:128]!
+ vswp d13,d12
+ vshr.s64 q7, q9, #63
+ veor q9, q9, q6
+ vand q7, q7, q5
+ vadd.u64 q10, q9, q9
+ vst1.64 {q9}, [r0,:128]!
+ vswp d15,d14
+ vshr.s64 q6, q10, #63
+ veor q10, q10, q7
+ vand q6, q6, q5
+ vld1.8 {q0}, [r7]!
+ vadd.u64 q11, q10, q10
+ vst1.64 {q10}, [r0,:128]!
+ vswp d13,d12
+ vshr.s64 q7, q11, #63
+ veor q11, q11, q6
+ vand q7, q7, q5
+ vld1.8 {q1}, [r7]!
+ veor q0, q0, q8
+ vadd.u64 q12, q11, q11
+ vst1.64 {q11}, [r0,:128]!
+ vswp d15,d14
+ vshr.s64 q6, q12, #63
+ veor q12, q12, q7
+ vand q6, q6, q5
+ vld1.8 {q2}, [r7]!
+ veor q1, q1, q9
+ vadd.u64 q13, q12, q12
+ vst1.64 {q12}, [r0,:128]!
+ vswp d13,d12
+ vshr.s64 q7, q13, #63
+ veor q13, q13, q6
+ vand q7, q7, q5
+ vld1.8 {q3}, [r7]!
+ veor q2, q2, q10
+ vadd.u64 q14, q13, q13
+ vst1.64 {q13}, [r0,:128]!
+ vswp d15,d14
+ vshr.s64 q6, q14, #63
+ veor q14, q14, q7
+ vand q6, q6, q5
+ vld1.8 {q4}, [r7]!
+ veor q3, q3, q11
+ vadd.u64 q15, q14, q14
+ vst1.64 {q14}, [r0,:128]!
+ vswp d13,d12
+ vshr.s64 q7, q15, #63
+ veor q15, q15, q6
+ vand q7, q7, q5
+ vld1.8 {q5}, [r7]!
+ veor q4, q4, q12
+ vadd.u64 q8, q15, q15
+ vst1.64 {q15}, [r0,:128]!
+ vswp d15,d14
+ veor q8, q8, q7
+ vst1.64 {q8}, [r0,:128] @ next round tweak
+
+ vld1.8 {q6-q7}, [r7]!
+ veor q5, q5, q13
+#ifndef BSAES_ASM_EXTENDED_KEY
+ add r4, sp, #0x90 @ pass key schedule
+#else
+ add r4, r10, #248 @ pass key schedule
+#endif
+ veor q6, q6, q14
+ mov r5, r1 @ pass rounds
+ veor q7, q7, q15
+ mov r0, sp
+
+ bl _bsaes_encrypt8
+
+ vld1.64 {q8-q9}, [r0,:128]!
+ vld1.64 {q10-q11}, [r0,:128]!
+ veor q0, q0, q8
+ vld1.64 {q12-q13}, [r0,:128]!
+ veor q1, q1, q9
+ veor q8, q4, q10
+ vst1.8 {q0-q1}, [r8]!
+ veor q9, q6, q11
+ vld1.64 {q14-q15}, [r0,:128]!
+ veor q10, q3, q12
+ vst1.8 {q8-q9}, [r8]!
+ veor q11, q7, q13
+ veor q12, q2, q14
+ vst1.8 {q10-q11}, [r8]!
+ veor q13, q5, q15
+ vst1.8 {q12-q13}, [r8]!
+
+ vld1.64 {q8}, [r0,:128] @ next round tweak
+
+ subs r9, #0x80
+ bpl .Lxts_enc_loop
+
+.Lxts_enc_short:
+ adds r9, #0x70
+ bmi .Lxts_enc_done
+
+ vldmia r2, {q5} @ load XTS magic
+ vshr.s64 q7, q8, #63
+ mov r0, sp
+ vand q7, q7, q5
+ vadd.u64 q9, q8, q8
+ vst1.64 {q8}, [r0,:128]!
+ vswp d15,d14
+ vshr.s64 q6, q9, #63
+ veor q9, q9, q7
+ vand q6, q6, q5
+ vadd.u64 q10, q9, q9
+ vst1.64 {q9}, [r0,:128]!
+ vswp d13,d12
+ vshr.s64 q7, q10, #63
+ veor q10, q10, q6
+ vand q7, q7, q5
+ vld1.8 {q0}, [r7]!
+ subs r9, #0x10
+ bmi .Lxts_enc_1
+ vadd.u64 q11, q10, q10
+ vst1.64 {q10}, [r0,:128]!
+ vswp d15,d14
+ vshr.s64 q6, q11, #63
+ veor q11, q11, q7
+ vand q6, q6, q5
+ vld1.8 {q1}, [r7]!
+ subs r9, #0x10
+ bmi .Lxts_enc_2
+ veor q0, q0, q8
+ vadd.u64 q12, q11, q11
+ vst1.64 {q11}, [r0,:128]!
+ vswp d13,d12
+ vshr.s64 q7, q12, #63
+ veor q12, q12, q6
+ vand q7, q7, q5
+ vld1.8 {q2}, [r7]!
+ subs r9, #0x10
+ bmi .Lxts_enc_3
+ veor q1, q1, q9
+ vadd.u64 q13, q12, q12
+ vst1.64 {q12}, [r0,:128]!
+ vswp d15,d14
+ vshr.s64 q6, q13, #63
+ veor q13, q13, q7
+ vand q6, q6, q5
+ vld1.8 {q3}, [r7]!
+ subs r9, #0x10
+ bmi .Lxts_enc_4
+ veor q2, q2, q10
+ vadd.u64 q14, q13, q13
+ vst1.64 {q13}, [r0,:128]!
+ vswp d13,d12
+ vshr.s64 q7, q14, #63
+ veor q14, q14, q6
+ vand q7, q7, q5
+ vld1.8 {q4}, [r7]!
+ subs r9, #0x10
+ bmi .Lxts_enc_5
+ veor q3, q3, q11
+ vadd.u64 q15, q14, q14
+ vst1.64 {q14}, [r0,:128]!
+ vswp d15,d14
+ vshr.s64 q6, q15, #63
+ veor q15, q15, q7
+ vand q6, q6, q5
+ vld1.8 {q5}, [r7]!
+ subs r9, #0x10
+ bmi .Lxts_enc_6
+ veor q4, q4, q12
+ sub r9, #0x10
+ vst1.64 {q15}, [r0,:128] @ next round tweak
+
+ vld1.8 {q6}, [r7]!
+ veor q5, q5, q13
+#ifndef BSAES_ASM_EXTENDED_KEY
+ add r4, sp, #0x90 @ pass key schedule
+#else
+ add r4, r10, #248 @ pass key schedule
+#endif
+ veor q6, q6, q14
+ mov r5, r1 @ pass rounds
+ mov r0, sp
+
+ bl _bsaes_encrypt8
+
+ vld1.64 {q8-q9}, [r0,:128]!
+ vld1.64 {q10-q11}, [r0,:128]!
+ veor q0, q0, q8
+ vld1.64 {q12-q13}, [r0,:128]!
+ veor q1, q1, q9
+ veor q8, q4, q10
+ vst1.8 {q0-q1}, [r8]!
+ veor q9, q6, q11
+ vld1.64 {q14}, [r0,:128]!
+ veor q10, q3, q12
+ vst1.8 {q8-q9}, [r8]!
+ veor q11, q7, q13
+ veor q12, q2, q14
+ vst1.8 {q10-q11}, [r8]!
+ vst1.8 {q12}, [r8]!
+
+ vld1.64 {q8}, [r0,:128] @ next round tweak
+ b .Lxts_enc_done
+.align 4
+.Lxts_enc_6:
+ vst1.64 {q14}, [r0,:128] @ next round tweak
+
+ veor q4, q4, q12
+#ifndef BSAES_ASM_EXTENDED_KEY
+ add r4, sp, #0x90 @ pass key schedule
+#else
+ add r4, r10, #248 @ pass key schedule
+#endif
+ veor q5, q5, q13
+ mov r5, r1 @ pass rounds
+ mov r0, sp
+
+ bl _bsaes_encrypt8
+
+ vld1.64 {q8-q9}, [r0,:128]!
+ vld1.64 {q10-q11}, [r0,:128]!
+ veor q0, q0, q8
+ vld1.64 {q12-q13}, [r0,:128]!
+ veor q1, q1, q9
+ veor q8, q4, q10
+ vst1.8 {q0-q1}, [r8]!
+ veor q9, q6, q11
+ veor q10, q3, q12
+ vst1.8 {q8-q9}, [r8]!
+ veor q11, q7, q13
+ vst1.8 {q10-q11}, [r8]!
+
+ vld1.64 {q8}, [r0,:128] @ next round tweak
+ b .Lxts_enc_done
+
+@ put this in range for both ARM and Thumb mode adr instructions
+.align 5
+.Lxts_magic:
+ .quad 1, 0x87
+
+.align 5
+.Lxts_enc_5:
+ vst1.64 {q13}, [r0,:128] @ next round tweak
+
+ veor q3, q3, q11
+#ifndef BSAES_ASM_EXTENDED_KEY
+ add r4, sp, #0x90 @ pass key schedule
+#else
+ add r4, r10, #248 @ pass key schedule
+#endif
+ veor q4, q4, q12
+ mov r5, r1 @ pass rounds
+ mov r0, sp
+
+ bl _bsaes_encrypt8
+
+ vld1.64 {q8-q9}, [r0,:128]!
+ vld1.64 {q10-q11}, [r0,:128]!
+ veor q0, q0, q8
+ vld1.64 {q12}, [r0,:128]!
+ veor q1, q1, q9
+ veor q8, q4, q10
+ vst1.8 {q0-q1}, [r8]!
+ veor q9, q6, q11
+ veor q10, q3, q12
+ vst1.8 {q8-q9}, [r8]!
+ vst1.8 {q10}, [r8]!
+
+ vld1.64 {q8}, [r0,:128] @ next round tweak
+ b .Lxts_enc_done
+.align 4
+.Lxts_enc_4:
+ vst1.64 {q12}, [r0,:128] @ next round tweak
+
+ veor q2, q2, q10
+#ifndef BSAES_ASM_EXTENDED_KEY
+ add r4, sp, #0x90 @ pass key schedule
+#else
+ add r4, r10, #248 @ pass key schedule
+#endif
+ veor q3, q3, q11
+ mov r5, r1 @ pass rounds
+ mov r0, sp
+
+ bl _bsaes_encrypt8
+
+ vld1.64 {q8-q9}, [r0,:128]!
+ vld1.64 {q10-q11}, [r0,:128]!
+ veor q0, q0, q8
+ veor q1, q1, q9
+ veor q8, q4, q10
+ vst1.8 {q0-q1}, [r8]!
+ veor q9, q6, q11
+ vst1.8 {q8-q9}, [r8]!
+
+ vld1.64 {q8}, [r0,:128] @ next round tweak
+ b .Lxts_enc_done
+.align 4
+.Lxts_enc_3:
+ vst1.64 {q11}, [r0,:128] @ next round tweak
+
+ veor q1, q1, q9
+#ifndef BSAES_ASM_EXTENDED_KEY
+ add r4, sp, #0x90 @ pass key schedule
+#else
+ add r4, r10, #248 @ pass key schedule
+#endif
+ veor q2, q2, q10
+ mov r5, r1 @ pass rounds
+ mov r0, sp
+
+ bl _bsaes_encrypt8
+
+ vld1.64 {q8-q9}, [r0,:128]!
+ vld1.64 {q10}, [r0,:128]!
+ veor q0, q0, q8
+ veor q1, q1, q9
+ veor q8, q4, q10
+ vst1.8 {q0-q1}, [r8]!
+ vst1.8 {q8}, [r8]!
+
+ vld1.64 {q8}, [r0,:128] @ next round tweak
+ b .Lxts_enc_done
+.align 4
+.Lxts_enc_2:
+ vst1.64 {q10}, [r0,:128] @ next round tweak
+
+ veor q0, q0, q8
+#ifndef BSAES_ASM_EXTENDED_KEY
+ add r4, sp, #0x90 @ pass key schedule
+#else
+ add r4, r10, #248 @ pass key schedule
+#endif
+ veor q1, q1, q9
+ mov r5, r1 @ pass rounds
+ mov r0, sp
+
+ bl _bsaes_encrypt8
+
+ vld1.64 {q8-q9}, [r0,:128]!
+ veor q0, q0, q8
+ veor q1, q1, q9
+ vst1.8 {q0-q1}, [r8]!
+
+ vld1.64 {q8}, [r0,:128] @ next round tweak
+ b .Lxts_enc_done
+.align 4
+.Lxts_enc_1:
+ mov r0, sp
+ veor q0, q8
+ mov r1, sp
+ vst1.8 {q0}, [sp,:128]
+ mov r2, r10
+ mov r4, r3 @ preserve fp
+
+ bl AES_encrypt
+
+ vld1.8 {q0}, [sp,:128]
+ veor q0, q0, q8
+ vst1.8 {q0}, [r8]!
+ mov r3, r4
+
+ vmov q8, q9 @ next round tweak
+
+.Lxts_enc_done:
+#ifndef XTS_CHAIN_TWEAK
+ adds r9, #0x10
+ beq .Lxts_enc_ret
+ sub r6, r8, #0x10
+
+.Lxts_enc_steal:
+ ldrb r0, [r7], #1
+ ldrb r1, [r8, #-0x10]
+ strb r0, [r8, #-0x10]
+ strb r1, [r8], #1
+
+ subs r9, #1
+ bhi .Lxts_enc_steal
+
+ vld1.8 {q0}, [r6]
+ mov r0, sp
+ veor q0, q0, q8
+ mov r1, sp
+ vst1.8 {q0}, [sp,:128]
+ mov r2, r10
+ mov r4, r3 @ preserve fp
+
+ bl AES_encrypt
+
+ vld1.8 {q0}, [sp,:128]
+ veor q0, q0, q8
+ vst1.8 {q0}, [r6]
+ mov r3, r4
+#endif
+
+.Lxts_enc_ret:
+ bic r0, r3, #0xf
+ vmov.i32 q0, #0
+ vmov.i32 q1, #0
+#ifdef XTS_CHAIN_TWEAK
+ ldr r1, [r3, #0x20+VFP_ABI_FRAME] @ chain tweak
+#endif
+.Lxts_enc_bzero: @ wipe key schedule [if any]
+ vstmia sp!, {q0-q1}
+ cmp sp, r0
+ bne .Lxts_enc_bzero
+
+ mov sp, r3
+#ifdef XTS_CHAIN_TWEAK
+ vst1.8 {q8}, [r1]
+#endif
+ VFP_ABI_POP
+ ldmia sp!, {r4-r10, pc} @ return
+
+.size bsaes_xts_encrypt,.-bsaes_xts_encrypt
+
+.globl bsaes_xts_decrypt
+.type bsaes_xts_decrypt,%function
+.align 4
+bsaes_xts_decrypt:
+ mov ip, sp
+ stmdb sp!, {r4-r10, lr} @ 0x20
+ VFP_ABI_PUSH
+ mov r6, sp @ future r3
+
+ mov r7, r0
+ mov r8, r1
+ mov r9, r2
+ mov r10, r3
+
+ sub r0, sp, #0x10 @ 0x10
+ bic r0, #0xf @ align at 16 bytes
+ mov sp, r0
+
+#ifdef XTS_CHAIN_TWEAK
+ ldr r0, [ip] @ pointer to input tweak
+#else
+ @ generate initial tweak
+ ldr r0, [ip, #4] @ iv[]
+ mov r1, sp
+ ldr r2, [ip, #0] @ key2
+ bl AES_encrypt
+ mov r0, sp @ pointer to initial tweak
+#endif
+
+ ldr r1, [r10, #240] @ get # of rounds
+ mov r3, r6
+#ifndef BSAES_ASM_EXTENDED_KEY
+ @ allocate the key schedule on the stack
+ sub r12, sp, r1, lsl#7 @ 128 bytes per inner round key
+ @ add r12, #96 @ size of bit-sliced key schedule
+ sub r12, #48 @ place for tweak[9]
+
+ @ populate the key schedule
+ mov r4, r10 @ pass key
+ mov r5, r1 @ pass # of rounds
+ mov sp, r12
+ add r12, #0x90 @ pass key schedule
+ bl _bsaes_key_convert
+ add r4, sp, #0x90
+ vldmia r4, {q6}
+ vstmia r12, {q15} @ save last round key
+ veor q7, q7, q6 @ fix up round 0 key
+ vstmia r4, {q7}
+#else
+ ldr r12, [r10, #244]
+ eors r12, #1
+ beq 0f
+
+ str r12, [r10, #244]
+ mov r4, r10 @ pass key
+ mov r5, r1 @ pass # of rounds
+ add r12, r10, #248 @ pass key schedule
+ bl _bsaes_key_convert
+ add r4, r10, #248
+ vldmia r4, {q6}
+ vstmia r12, {q15} @ save last round key
+ veor q7, q7, q6 @ fix up round 0 key
+ vstmia r4, {q7}
+
+.align 2
+0: sub sp, #0x90 @ place for tweak[9]
+#endif
+ vld1.8 {q8}, [r0] @ initial tweak
+ adr r2, .Lxts_magic
+
+ tst r9, #0xf @ if not multiple of 16
+ it ne @ Thumb2 thing, sanity check in ARM
+ subne r9, #0x10 @ subtract another 16 bytes
+ subs r9, #0x80
+
+ blo .Lxts_dec_short
+ b .Lxts_dec_loop
+
+.align 4
+.Lxts_dec_loop:
+ vldmia r2, {q5} @ load XTS magic
+ vshr.s64 q6, q8, #63
+ mov r0, sp
+ vand q6, q6, q5
+ vadd.u64 q9, q8, q8
+ vst1.64 {q8}, [r0,:128]!
+ vswp d13,d12
+ vshr.s64 q7, q9, #63
+ veor q9, q9, q6
+ vand q7, q7, q5
+ vadd.u64 q10, q9, q9
+ vst1.64 {q9}, [r0,:128]!
+ vswp d15,d14
+ vshr.s64 q6, q10, #63
+ veor q10, q10, q7
+ vand q6, q6, q5
+ vld1.8 {q0}, [r7]!
+ vadd.u64 q11, q10, q10
+ vst1.64 {q10}, [r0,:128]!
+ vswp d13,d12
+ vshr.s64 q7, q11, #63
+ veor q11, q11, q6
+ vand q7, q7, q5
+ vld1.8 {q1}, [r7]!
+ veor q0, q0, q8
+ vadd.u64 q12, q11, q11
+ vst1.64 {q11}, [r0,:128]!
+ vswp d15,d14
+ vshr.s64 q6, q12, #63
+ veor q12, q12, q7
+ vand q6, q6, q5
+ vld1.8 {q2}, [r7]!
+ veor q1, q1, q9
+ vadd.u64 q13, q12, q12
+ vst1.64 {q12}, [r0,:128]!
+ vswp d13,d12
+ vshr.s64 q7, q13, #63
+ veor q13, q13, q6
+ vand q7, q7, q5
+ vld1.8 {q3}, [r7]!
+ veor q2, q2, q10
+ vadd.u64 q14, q13, q13
+ vst1.64 {q13}, [r0,:128]!
+ vswp d15,d14
+ vshr.s64 q6, q14, #63
+ veor q14, q14, q7
+ vand q6, q6, q5
+ vld1.8 {q4}, [r7]!
+ veor q3, q3, q11
+ vadd.u64 q15, q14, q14
+ vst1.64 {q14}, [r0,:128]!
+ vswp d13,d12
+ vshr.s64 q7, q15, #63
+ veor q15, q15, q6
+ vand q7, q7, q5
+ vld1.8 {q5}, [r7]!
+ veor q4, q4, q12
+ vadd.u64 q8, q15, q15
+ vst1.64 {q15}, [r0,:128]!
+ vswp d15,d14
+ veor q8, q8, q7
+ vst1.64 {q8}, [r0,:128] @ next round tweak
+
+ vld1.8 {q6-q7}, [r7]!
+ veor q5, q5, q13
+#ifndef BSAES_ASM_EXTENDED_KEY
+ add r4, sp, #0x90 @ pass key schedule
+#else
+ add r4, r10, #248 @ pass key schedule
+#endif
+ veor q6, q6, q14
+ mov r5, r1 @ pass rounds
+ veor q7, q7, q15
+ mov r0, sp
+
+ bl _bsaes_decrypt8
+
+ vld1.64 {q8-q9}, [r0,:128]!
+ vld1.64 {q10-q11}, [r0,:128]!
+ veor q0, q0, q8
+ vld1.64 {q12-q13}, [r0,:128]!
+ veor q1, q1, q9
+ veor q8, q6, q10
+ vst1.8 {q0-q1}, [r8]!
+ veor q9, q4, q11
+ vld1.64 {q14-q15}, [r0,:128]!
+ veor q10, q2, q12
+ vst1.8 {q8-q9}, [r8]!
+ veor q11, q7, q13
+ veor q12, q3, q14
+ vst1.8 {q10-q11}, [r8]!
+ veor q13, q5, q15
+ vst1.8 {q12-q13}, [r8]!
+
+ vld1.64 {q8}, [r0,:128] @ next round tweak
+
+ subs r9, #0x80
+ bpl .Lxts_dec_loop
+
+.Lxts_dec_short:
+ adds r9, #0x70
+ bmi .Lxts_dec_done
+
+ vldmia r2, {q5} @ load XTS magic
+ vshr.s64 q7, q8, #63
+ mov r0, sp
+ vand q7, q7, q5
+ vadd.u64 q9, q8, q8
+ vst1.64 {q8}, [r0,:128]!
+ vswp d15,d14
+ vshr.s64 q6, q9, #63
+ veor q9, q9, q7
+ vand q6, q6, q5
+ vadd.u64 q10, q9, q9
+ vst1.64 {q9}, [r0,:128]!
+ vswp d13,d12
+ vshr.s64 q7, q10, #63
+ veor q10, q10, q6
+ vand q7, q7, q5
+ vld1.8 {q0}, [r7]!
+ subs r9, #0x10
+ bmi .Lxts_dec_1
+ vadd.u64 q11, q10, q10
+ vst1.64 {q10}, [r0,:128]!
+ vswp d15,d14
+ vshr.s64 q6, q11, #63
+ veor q11, q11, q7
+ vand q6, q6, q5
+ vld1.8 {q1}, [r7]!
+ subs r9, #0x10
+ bmi .Lxts_dec_2
+ veor q0, q0, q8
+ vadd.u64 q12, q11, q11
+ vst1.64 {q11}, [r0,:128]!
+ vswp d13,d12
+ vshr.s64 q7, q12, #63
+ veor q12, q12, q6
+ vand q7, q7, q5
+ vld1.8 {q2}, [r7]!
+ subs r9, #0x10
+ bmi .Lxts_dec_3
+ veor q1, q1, q9
+ vadd.u64 q13, q12, q12
+ vst1.64 {q12}, [r0,:128]!
+ vswp d15,d14
+ vshr.s64 q6, q13, #63
+ veor q13, q13, q7
+ vand q6, q6, q5
+ vld1.8 {q3}, [r7]!
+ subs r9, #0x10
+ bmi .Lxts_dec_4
+ veor q2, q2, q10
+ vadd.u64 q14, q13, q13
+ vst1.64 {q13}, [r0,:128]!
+ vswp d13,d12
+ vshr.s64 q7, q14, #63
+ veor q14, q14, q6
+ vand q7, q7, q5
+ vld1.8 {q4}, [r7]!
+ subs r9, #0x10
+ bmi .Lxts_dec_5
+ veor q3, q3, q11
+ vadd.u64 q15, q14, q14
+ vst1.64 {q14}, [r0,:128]!
+ vswp d15,d14
+ vshr.s64 q6, q15, #63
+ veor q15, q15, q7
+ vand q6, q6, q5
+ vld1.8 {q5}, [r7]!
+ subs r9, #0x10
+ bmi .Lxts_dec_6
+ veor q4, q4, q12
+ sub r9, #0x10
+ vst1.64 {q15}, [r0,:128] @ next round tweak
+
+ vld1.8 {q6}, [r7]!
+ veor q5, q5, q13
+#ifndef BSAES_ASM_EXTENDED_KEY
+ add r4, sp, #0x90 @ pass key schedule
+#else
+ add r4, r10, #248 @ pass key schedule
+#endif
+ veor q6, q6, q14
+ mov r5, r1 @ pass rounds
+ mov r0, sp
+
+ bl _bsaes_decrypt8
+
+ vld1.64 {q8-q9}, [r0,:128]!
+ vld1.64 {q10-q11}, [r0,:128]!
+ veor q0, q0, q8
+ vld1.64 {q12-q13}, [r0,:128]!
+ veor q1, q1, q9
+ veor q8, q6, q10
+ vst1.8 {q0-q1}, [r8]!
+ veor q9, q4, q11
+ vld1.64 {q14}, [r0,:128]!
+ veor q10, q2, q12
+ vst1.8 {q8-q9}, [r8]!
+ veor q11, q7, q13
+ veor q12, q3, q14
+ vst1.8 {q10-q11}, [r8]!
+ vst1.8 {q12}, [r8]!
+
+ vld1.64 {q8}, [r0,:128] @ next round tweak
+ b .Lxts_dec_done
+.align 4
+.Lxts_dec_6:
+ vst1.64 {q14}, [r0,:128] @ next round tweak
+
+ veor q4, q4, q12
+#ifndef BSAES_ASM_EXTENDED_KEY
+ add r4, sp, #0x90 @ pass key schedule
+#else
+ add r4, r10, #248 @ pass key schedule
+#endif
+ veor q5, q5, q13
+ mov r5, r1 @ pass rounds
+ mov r0, sp
+
+ bl _bsaes_decrypt8
+
+ vld1.64 {q8-q9}, [r0,:128]!
+ vld1.64 {q10-q11}, [r0,:128]!
+ veor q0, q0, q8
+ vld1.64 {q12-q13}, [r0,:128]!
+ veor q1, q1, q9
+ veor q8, q6, q10
+ vst1.8 {q0-q1}, [r8]!
+ veor q9, q4, q11
+ veor q10, q2, q12
+ vst1.8 {q8-q9}, [r8]!
+ veor q11, q7, q13
+ vst1.8 {q10-q11}, [r8]!
+
+ vld1.64 {q8}, [r0,:128] @ next round tweak
+ b .Lxts_dec_done
+.align 4
+.Lxts_dec_5:
+ vst1.64 {q13}, [r0,:128] @ next round tweak
+
+ veor q3, q3, q11
+#ifndef BSAES_ASM_EXTENDED_KEY
+ add r4, sp, #0x90 @ pass key schedule
+#else
+ add r4, r10, #248 @ pass key schedule
+#endif
+ veor q4, q4, q12
+ mov r5, r1 @ pass rounds
+ mov r0, sp
+
+ bl _bsaes_decrypt8
+
+ vld1.64 {q8-q9}, [r0,:128]!
+ vld1.64 {q10-q11}, [r0,:128]!
+ veor q0, q0, q8
+ vld1.64 {q12}, [r0,:128]!
+ veor q1, q1, q9
+ veor q8, q6, q10
+ vst1.8 {q0-q1}, [r8]!
+ veor q9, q4, q11
+ veor q10, q2, q12
+ vst1.8 {q8-q9}, [r8]!
+ vst1.8 {q10}, [r8]!
+
+ vld1.64 {q8}, [r0,:128] @ next round tweak
+ b .Lxts_dec_done
+.align 4
+.Lxts_dec_4:
+ vst1.64 {q12}, [r0,:128] @ next round tweak
+
+ veor q2, q2, q10
+#ifndef BSAES_ASM_EXTENDED_KEY
+ add r4, sp, #0x90 @ pass key schedule
+#else
+ add r4, r10, #248 @ pass key schedule
+#endif
+ veor q3, q3, q11
+ mov r5, r1 @ pass rounds
+ mov r0, sp
+
+ bl _bsaes_decrypt8
+
+ vld1.64 {q8-q9}, [r0,:128]!
+ vld1.64 {q10-q11}, [r0,:128]!
+ veor q0, q0, q8
+ veor q1, q1, q9
+ veor q8, q6, q10
+ vst1.8 {q0-q1}, [r8]!
+ veor q9, q4, q11
+ vst1.8 {q8-q9}, [r8]!
+
+ vld1.64 {q8}, [r0,:128] @ next round tweak
+ b .Lxts_dec_done
+.align 4
+.Lxts_dec_3:
+ vst1.64 {q11}, [r0,:128] @ next round tweak
+
+ veor q1, q1, q9
+#ifndef BSAES_ASM_EXTENDED_KEY
+ add r4, sp, #0x90 @ pass key schedule
+#else
+ add r4, r10, #248 @ pass key schedule
+#endif
+ veor q2, q2, q10
+ mov r5, r1 @ pass rounds
+ mov r0, sp
+
+ bl _bsaes_decrypt8
+
+ vld1.64 {q8-q9}, [r0,:128]!
+ vld1.64 {q10}, [r0,:128]!
+ veor q0, q0, q8
+ veor q1, q1, q9
+ veor q8, q6, q10
+ vst1.8 {q0-q1}, [r8]!
+ vst1.8 {q8}, [r8]!
+
+ vld1.64 {q8}, [r0,:128] @ next round tweak
+ b .Lxts_dec_done
+.align 4
+.Lxts_dec_2:
+ vst1.64 {q10}, [r0,:128] @ next round tweak
+
+ veor q0, q0, q8
+#ifndef BSAES_ASM_EXTENDED_KEY
+ add r4, sp, #0x90 @ pass key schedule
+#else
+ add r4, r10, #248 @ pass key schedule
+#endif
+ veor q1, q1, q9
+ mov r5, r1 @ pass rounds
+ mov r0, sp
+
+ bl _bsaes_decrypt8
+
+ vld1.64 {q8-q9}, [r0,:128]!
+ veor q0, q0, q8
+ veor q1, q1, q9
+ vst1.8 {q0-q1}, [r8]!
+
+ vld1.64 {q8}, [r0,:128] @ next round tweak
+ b .Lxts_dec_done
+.align 4
+.Lxts_dec_1:
+ mov r0, sp
+ veor q0, q8
+ mov r1, sp
+ vst1.8 {q0}, [sp,:128]
+ mov r2, r10
+ mov r4, r3 @ preserve fp
+ mov r5, r2 @ preserve magic
+
+ bl AES_decrypt
+
+ vld1.8 {q0}, [sp,:128]
+ veor q0, q0, q8
+ vst1.8 {q0}, [r8]!
+ mov r3, r4
+ mov r2, r5
+
+ vmov q8, q9 @ next round tweak
+
+.Lxts_dec_done:
+#ifndef XTS_CHAIN_TWEAK
+ adds r9, #0x10
+ beq .Lxts_dec_ret
+
+ @ calculate one round of extra tweak for the stolen ciphertext
+ vldmia r2, {q5}
+ vshr.s64 q6, q8, #63
+ vand q6, q6, q5
+ vadd.u64 q9, q8, q8
+ vswp d13,d12
+ veor q9, q9, q6
+
+ @ perform the final decryption with the last tweak value
+ vld1.8 {q0}, [r7]!
+ mov r0, sp
+ veor q0, q0, q9
+ mov r1, sp
+ vst1.8 {q0}, [sp,:128]
+ mov r2, r10
+ mov r4, r3 @ preserve fp
+
+ bl AES_decrypt
+
+ vld1.8 {q0}, [sp,:128]
+ veor q0, q0, q9
+ vst1.8 {q0}, [r8]
+
+ mov r6, r8
+.Lxts_dec_steal:
+ ldrb r1, [r8]
+ ldrb r0, [r7], #1
+ strb r1, [r8, #0x10]
+ strb r0, [r8], #1
+
+ subs r9, #1
+ bhi .Lxts_dec_steal
+
+ vld1.8 {q0}, [r6]
+ mov r0, sp
+ veor q0, q8
+ mov r1, sp
+ vst1.8 {q0}, [sp,:128]
+ mov r2, r10
+
+ bl AES_decrypt
+
+ vld1.8 {q0}, [sp,:128]
+ veor q0, q0, q8
+ vst1.8 {q0}, [r6]
+ mov r3, r4
+#endif
+
+.Lxts_dec_ret:
+ bic r0, r3, #0xf
+ vmov.i32 q0, #0
+ vmov.i32 q1, #0
+#ifdef XTS_CHAIN_TWEAK
+ ldr r1, [r3, #0x20+VFP_ABI_FRAME] @ chain tweak
+#endif
+.Lxts_dec_bzero: @ wipe key schedule [if any]
+ vstmia sp!, {q0-q1}
+ cmp sp, r0
+ bne .Lxts_dec_bzero
+
+ mov sp, r3
+#ifdef XTS_CHAIN_TWEAK
+ vst1.8 {q8}, [r1]
+#endif
+ VFP_ABI_POP
+ ldmia sp!, {r4-r10, pc} @ return
+
+.size bsaes_xts_decrypt,.-bsaes_xts_decrypt
+#endif
--- /dev/null
+/*
+ * linux/arch/arm/crypto/aesbs-glue.c - glue code for NEON bit sliced AES
+ *
+ * Copyright (C) 2013 Linaro Ltd <ard.biesheuvel@linaro.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <asm/neon.h>
+#include <crypto/aes.h>
+#include <crypto/ablk_helper.h>
+#include <crypto/algapi.h>
+#include <linux/module.h>
+
+#include "aes_glue.h"
+
+#define BIT_SLICED_KEY_MAXSIZE (128 * (AES_MAXNR - 1) + 2 * AES_BLOCK_SIZE)
+
+struct BS_KEY {
+ struct AES_KEY rk;
+ int converted;
+ u8 __aligned(8) bs[BIT_SLICED_KEY_MAXSIZE];
+} __aligned(8);
+
+asmlinkage void bsaes_enc_key_convert(u8 out[], struct AES_KEY const *in);
+asmlinkage void bsaes_dec_key_convert(u8 out[], struct AES_KEY const *in);
+
+asmlinkage void bsaes_cbc_encrypt(u8 const in[], u8 out[], u32 bytes,
+ struct BS_KEY *key, u8 iv[]);
+
+asmlinkage void bsaes_ctr32_encrypt_blocks(u8 const in[], u8 out[], u32 blocks,
+ struct BS_KEY *key, u8 const iv[]);
+
+asmlinkage void bsaes_xts_encrypt(u8 const in[], u8 out[], u32 bytes,
+ struct BS_KEY *key, u8 tweak[]);
+
+asmlinkage void bsaes_xts_decrypt(u8 const in[], u8 out[], u32 bytes,
+ struct BS_KEY *key, u8 tweak[]);
+
+struct aesbs_cbc_ctx {
+ struct AES_KEY enc;
+ struct BS_KEY dec;
+};
+
+struct aesbs_ctr_ctx {
+ struct BS_KEY enc;
+};
+
+struct aesbs_xts_ctx {
+ struct BS_KEY enc;
+ struct BS_KEY dec;
+ struct AES_KEY twkey;
+};
+
+static int aesbs_cbc_set_key(struct crypto_tfm *tfm, const u8 *in_key,
+ unsigned int key_len)
+{
+ struct aesbs_cbc_ctx *ctx = crypto_tfm_ctx(tfm);
+ int bits = key_len * 8;
+
+ if (private_AES_set_encrypt_key(in_key, bits, &ctx->enc)) {
+ tfm->crt_flags |= CRYPTO_TFM_RES_BAD_KEY_LEN;
+ return -EINVAL;
+ }
+ ctx->dec.rk = ctx->enc;
+ private_AES_set_decrypt_key(in_key, bits, &ctx->dec.rk);
+ ctx->dec.converted = 0;
+ return 0;
+}
+
+static int aesbs_ctr_set_key(struct crypto_tfm *tfm, const u8 *in_key,
+ unsigned int key_len)
+{
+ struct aesbs_ctr_ctx *ctx = crypto_tfm_ctx(tfm);
+ int bits = key_len * 8;
+
+ if (private_AES_set_encrypt_key(in_key, bits, &ctx->enc.rk)) {
+ tfm->crt_flags |= CRYPTO_TFM_RES_BAD_KEY_LEN;
+ return -EINVAL;
+ }
+ ctx->enc.converted = 0;
+ return 0;
+}
+
+static int aesbs_xts_set_key(struct crypto_tfm *tfm, const u8 *in_key,
+ unsigned int key_len)
+{
+ struct aesbs_xts_ctx *ctx = crypto_tfm_ctx(tfm);
+ int bits = key_len * 4;
+
+ if (private_AES_set_encrypt_key(in_key, bits, &ctx->enc.rk)) {
+ tfm->crt_flags |= CRYPTO_TFM_RES_BAD_KEY_LEN;
+ return -EINVAL;
+ }
+ ctx->dec.rk = ctx->enc.rk;
+ private_AES_set_decrypt_key(in_key, bits, &ctx->dec.rk);
+ private_AES_set_encrypt_key(in_key + key_len / 2, bits, &ctx->twkey);
+ ctx->enc.converted = ctx->dec.converted = 0;
+ return 0;
+}
+
+static int aesbs_cbc_encrypt(struct blkcipher_desc *desc,
+ struct scatterlist *dst,
+ struct scatterlist *src, unsigned int nbytes)
+{
+ struct aesbs_cbc_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
+ struct blkcipher_walk walk;
+ int err;
+
+ blkcipher_walk_init(&walk, dst, src, nbytes);
+ err = blkcipher_walk_virt(desc, &walk);
+
+ while (walk.nbytes) {
+ u32 blocks = walk.nbytes / AES_BLOCK_SIZE;
+ u8 *src = walk.src.virt.addr;
+
+ if (walk.dst.virt.addr == walk.src.virt.addr) {
+ u8 *iv = walk.iv;
+
+ do {
+ crypto_xor(src, iv, AES_BLOCK_SIZE);
+ AES_encrypt(src, src, &ctx->enc);
+ iv = src;
+ src += AES_BLOCK_SIZE;
+ } while (--blocks);
+ memcpy(walk.iv, iv, AES_BLOCK_SIZE);
+ } else {
+ u8 *dst = walk.dst.virt.addr;
+
+ do {
+ crypto_xor(walk.iv, src, AES_BLOCK_SIZE);
+ AES_encrypt(walk.iv, dst, &ctx->enc);
+ memcpy(walk.iv, dst, AES_BLOCK_SIZE);
+ src += AES_BLOCK_SIZE;
+ dst += AES_BLOCK_SIZE;
+ } while (--blocks);
+ }
+ err = blkcipher_walk_done(desc, &walk, 0);
+ }
+ return err;
+}
+
+static int aesbs_cbc_decrypt(struct blkcipher_desc *desc,
+ struct scatterlist *dst,
+ struct scatterlist *src, unsigned int nbytes)
+{
+ struct aesbs_cbc_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
+ struct blkcipher_walk walk;
+ int err;
+
+ blkcipher_walk_init(&walk, dst, src, nbytes);
+ err = blkcipher_walk_virt_block(desc, &walk, 8 * AES_BLOCK_SIZE);
+
+ while ((walk.nbytes / AES_BLOCK_SIZE) >= 8) {
+ kernel_neon_begin();
+ bsaes_cbc_encrypt(walk.src.virt.addr, walk.dst.virt.addr,
+ walk.nbytes, &ctx->dec, walk.iv);
+ kernel_neon_end();
+ err = blkcipher_walk_done(desc, &walk, 0);
+ }
+ while (walk.nbytes) {
+ u32 blocks = walk.nbytes / AES_BLOCK_SIZE;
+ u8 *dst = walk.dst.virt.addr;
+ u8 *src = walk.src.virt.addr;
+ u8 bk[2][AES_BLOCK_SIZE];
+ u8 *iv = walk.iv;
+
+ do {
+ if (walk.dst.virt.addr == walk.src.virt.addr)
+ memcpy(bk[blocks & 1], src, AES_BLOCK_SIZE);
+
+ AES_decrypt(src, dst, &ctx->dec.rk);
+ crypto_xor(dst, iv, AES_BLOCK_SIZE);
+
+ if (walk.dst.virt.addr == walk.src.virt.addr)
+ iv = bk[blocks & 1];
+ else
+ iv = src;
+
+ dst += AES_BLOCK_SIZE;
+ src += AES_BLOCK_SIZE;
+ } while (--blocks);
+ err = blkcipher_walk_done(desc, &walk, 0);
+ }
+ return err;
+}
+
+static void inc_be128_ctr(__be32 ctr[], u32 addend)
+{
+ int i;
+
+ for (i = 3; i >= 0; i--, addend = 1) {
+ u32 n = be32_to_cpu(ctr[i]) + addend;
+
+ ctr[i] = cpu_to_be32(n);
+ if (n >= addend)
+ break;
+ }
+}
+
+static int aesbs_ctr_encrypt(struct blkcipher_desc *desc,
+ struct scatterlist *dst, struct scatterlist *src,
+ unsigned int nbytes)
+{
+ struct aesbs_ctr_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
+ struct blkcipher_walk walk;
+ u32 blocks;
+ int err;
+
+ blkcipher_walk_init(&walk, dst, src, nbytes);
+ err = blkcipher_walk_virt_block(desc, &walk, 8 * AES_BLOCK_SIZE);
+
+ while ((blocks = walk.nbytes / AES_BLOCK_SIZE)) {
+ u32 tail = walk.nbytes % AES_BLOCK_SIZE;
+ __be32 *ctr = (__be32 *)walk.iv;
+ u32 headroom = UINT_MAX - be32_to_cpu(ctr[3]);
+
+ /* avoid 32 bit counter overflow in the NEON code */
+ if (unlikely(headroom < blocks)) {
+ blocks = headroom + 1;
+ tail = walk.nbytes - blocks * AES_BLOCK_SIZE;
+ }
+ kernel_neon_begin();
+ bsaes_ctr32_encrypt_blocks(walk.src.virt.addr,
+ walk.dst.virt.addr, blocks,
+ &ctx->enc, walk.iv);
+ kernel_neon_end();
+ inc_be128_ctr(ctr, blocks);
+
+ nbytes -= blocks * AES_BLOCK_SIZE;
+ if (nbytes && nbytes == tail && nbytes <= AES_BLOCK_SIZE)
+ break;
+
+ err = blkcipher_walk_done(desc, &walk, tail);
+ }
+ if (walk.nbytes) {
+ u8 *tdst = walk.dst.virt.addr + blocks * AES_BLOCK_SIZE;
+ u8 *tsrc = walk.src.virt.addr + blocks * AES_BLOCK_SIZE;
+ u8 ks[AES_BLOCK_SIZE];
+
+ AES_encrypt(walk.iv, ks, &ctx->enc.rk);
+ if (tdst != tsrc)
+ memcpy(tdst, tsrc, nbytes);
+ crypto_xor(tdst, ks, nbytes);
+ err = blkcipher_walk_done(desc, &walk, 0);
+ }
+ return err;
+}
+
+static int aesbs_xts_encrypt(struct blkcipher_desc *desc,
+ struct scatterlist *dst,
+ struct scatterlist *src, unsigned int nbytes)
+{
+ struct aesbs_xts_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
+ struct blkcipher_walk walk;
+ int err;
+
+ blkcipher_walk_init(&walk, dst, src, nbytes);
+ err = blkcipher_walk_virt_block(desc, &walk, 8 * AES_BLOCK_SIZE);
+
+ /* generate the initial tweak */
+ AES_encrypt(walk.iv, walk.iv, &ctx->twkey);
+
+ while (walk.nbytes) {
+ kernel_neon_begin();
+ bsaes_xts_encrypt(walk.src.virt.addr, walk.dst.virt.addr,
+ walk.nbytes, &ctx->enc, walk.iv);
+ kernel_neon_end();
+ err = blkcipher_walk_done(desc, &walk, 0);
+ }
+ return err;
+}
+
+static int aesbs_xts_decrypt(struct blkcipher_desc *desc,
+ struct scatterlist *dst,
+ struct scatterlist *src, unsigned int nbytes)
+{
+ struct aesbs_xts_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
+ struct blkcipher_walk walk;
+ int err;
+
+ blkcipher_walk_init(&walk, dst, src, nbytes);
+ err = blkcipher_walk_virt_block(desc, &walk, 8 * AES_BLOCK_SIZE);
+
+ /* generate the initial tweak */
+ AES_encrypt(walk.iv, walk.iv, &ctx->twkey);
+
+ while (walk.nbytes) {
+ kernel_neon_begin();
+ bsaes_xts_decrypt(walk.src.virt.addr, walk.dst.virt.addr,
+ walk.nbytes, &ctx->dec, walk.iv);
+ kernel_neon_end();
+ err = blkcipher_walk_done(desc, &walk, 0);
+ }
+ return err;
+}
+
+static struct crypto_alg aesbs_algs[] = { {
+ .cra_name = "__cbc-aes-neonbs",
+ .cra_driver_name = "__driver-cbc-aes-neonbs",
+ .cra_priority = 0,
+ .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER,
+ .cra_blocksize = AES_BLOCK_SIZE,
+ .cra_ctxsize = sizeof(struct aesbs_cbc_ctx),
+ .cra_alignmask = 7,
+ .cra_type = &crypto_blkcipher_type,
+ .cra_module = THIS_MODULE,
+ .cra_blkcipher = {
+ .min_keysize = AES_MIN_KEY_SIZE,
+ .max_keysize = AES_MAX_KEY_SIZE,
+ .ivsize = AES_BLOCK_SIZE,
+ .setkey = aesbs_cbc_set_key,
+ .encrypt = aesbs_cbc_encrypt,
+ .decrypt = aesbs_cbc_decrypt,
+ },
+}, {
+ .cra_name = "__ctr-aes-neonbs",
+ .cra_driver_name = "__driver-ctr-aes-neonbs",
+ .cra_priority = 0,
+ .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER,
+ .cra_blocksize = 1,
+ .cra_ctxsize = sizeof(struct aesbs_ctr_ctx),
+ .cra_alignmask = 7,
+ .cra_type = &crypto_blkcipher_type,
+ .cra_module = THIS_MODULE,
+ .cra_blkcipher = {
+ .min_keysize = AES_MIN_KEY_SIZE,
+ .max_keysize = AES_MAX_KEY_SIZE,
+ .ivsize = AES_BLOCK_SIZE,
+ .setkey = aesbs_ctr_set_key,
+ .encrypt = aesbs_ctr_encrypt,
+ .decrypt = aesbs_ctr_encrypt,
+ },
+}, {
+ .cra_name = "__xts-aes-neonbs",
+ .cra_driver_name = "__driver-xts-aes-neonbs",
+ .cra_priority = 0,
+ .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER,
+ .cra_blocksize = AES_BLOCK_SIZE,
+ .cra_ctxsize = sizeof(struct aesbs_xts_ctx),
+ .cra_alignmask = 7,
+ .cra_type = &crypto_blkcipher_type,
+ .cra_module = THIS_MODULE,
+ .cra_blkcipher = {
+ .min_keysize = 2 * AES_MIN_KEY_SIZE,
+ .max_keysize = 2 * AES_MAX_KEY_SIZE,
+ .ivsize = AES_BLOCK_SIZE,
+ .setkey = aesbs_xts_set_key,
+ .encrypt = aesbs_xts_encrypt,
+ .decrypt = aesbs_xts_decrypt,
+ },
+}, {
+ .cra_name = "cbc(aes)",
+ .cra_driver_name = "cbc-aes-neonbs",
+ .cra_priority = 300,
+ .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER|CRYPTO_ALG_ASYNC,
+ .cra_blocksize = AES_BLOCK_SIZE,
+ .cra_ctxsize = sizeof(struct async_helper_ctx),
+ .cra_alignmask = 7,
+ .cra_type = &crypto_ablkcipher_type,
+ .cra_module = THIS_MODULE,
+ .cra_init = ablk_init,
+ .cra_exit = ablk_exit,
+ .cra_ablkcipher = {
+ .min_keysize = AES_MIN_KEY_SIZE,
+ .max_keysize = AES_MAX_KEY_SIZE,
+ .ivsize = AES_BLOCK_SIZE,
+ .setkey = ablk_set_key,
+ .encrypt = __ablk_encrypt,
+ .decrypt = ablk_decrypt,
+ }
+}, {
+ .cra_name = "ctr(aes)",
+ .cra_driver_name = "ctr-aes-neonbs",
+ .cra_priority = 300,
+ .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER|CRYPTO_ALG_ASYNC,
+ .cra_blocksize = 1,
+ .cra_ctxsize = sizeof(struct async_helper_ctx),
+ .cra_alignmask = 7,
+ .cra_type = &crypto_ablkcipher_type,
+ .cra_module = THIS_MODULE,
+ .cra_init = ablk_init,
+ .cra_exit = ablk_exit,
+ .cra_ablkcipher = {
+ .min_keysize = AES_MIN_KEY_SIZE,
+ .max_keysize = AES_MAX_KEY_SIZE,
+ .ivsize = AES_BLOCK_SIZE,
+ .setkey = ablk_set_key,
+ .encrypt = ablk_encrypt,
+ .decrypt = ablk_decrypt,
+ }
+}, {
+ .cra_name = "xts(aes)",
+ .cra_driver_name = "xts-aes-neonbs",
+ .cra_priority = 300,
+ .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER|CRYPTO_ALG_ASYNC,
+ .cra_blocksize = AES_BLOCK_SIZE,
+ .cra_ctxsize = sizeof(struct async_helper_ctx),
+ .cra_alignmask = 7,
+ .cra_type = &crypto_ablkcipher_type,
+ .cra_module = THIS_MODULE,
+ .cra_init = ablk_init,
+ .cra_exit = ablk_exit,
+ .cra_ablkcipher = {
+ .min_keysize = 2 * AES_MIN_KEY_SIZE,
+ .max_keysize = 2 * AES_MAX_KEY_SIZE,
+ .ivsize = AES_BLOCK_SIZE,
+ .setkey = ablk_set_key,
+ .encrypt = ablk_encrypt,
+ .decrypt = ablk_decrypt,
+ }
+} };
+
+static int __init aesbs_mod_init(void)
+{
+ if (!cpu_has_neon())
+ return -ENODEV;
+
+ return crypto_register_algs(aesbs_algs, ARRAY_SIZE(aesbs_algs));
+}
+
+static void __exit aesbs_mod_exit(void)
+{
+ crypto_unregister_algs(aesbs_algs, ARRAY_SIZE(aesbs_algs));
+}
+
+module_init(aesbs_mod_init);
+module_exit(aesbs_mod_exit);
+
+MODULE_DESCRIPTION("Bit sliced AES in CBC/CTR/XTS modes using NEON");
+MODULE_AUTHOR("Ard Biesheuvel <ard.biesheuvel@linaro.org>");
+MODULE_LICENSE("GPL");
--- /dev/null
+#!/usr/bin/env perl
+
+# ====================================================================
+# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
+# project. The module is, however, dual licensed under OpenSSL and
+# CRYPTOGAMS licenses depending on where you obtain it. For further
+# details see http://www.openssl.org/~appro/cryptogams/.
+#
+# Specific modes and adaptation for Linux kernel by Ard Biesheuvel
+# <ard.biesheuvel@linaro.org>. Permission to use under GPL terms is
+# granted.
+# ====================================================================
+
+# Bit-sliced AES for ARM NEON
+#
+# February 2012.
+#
+# This implementation is direct adaptation of bsaes-x86_64 module for
+# ARM NEON. Except that this module is endian-neutral [in sense that
+# it can be compiled for either endianness] by courtesy of vld1.8's
+# neutrality. Initial version doesn't implement interface to OpenSSL,
+# only low-level primitives and unsupported entry points, just enough
+# to collect performance results, which for Cortex-A8 core are:
+#
+# encrypt 19.5 cycles per byte processed with 128-bit key
+# decrypt 22.1 cycles per byte processed with 128-bit key
+# key conv. 440 cycles per 128-bit key/0.18 of 8x block
+#
+# Snapdragon S4 encrypts byte in 17.6 cycles and decrypts in 19.7,
+# which is [much] worse than anticipated (for further details see
+# http://www.openssl.org/~appro/Snapdragon-S4.html).
+#
+# Cortex-A15 manages in 14.2/16.1 cycles [when integer-only code
+# manages in 20.0 cycles].
+#
+# When comparing to x86_64 results keep in mind that NEON unit is
+# [mostly] single-issue and thus can't [fully] benefit from
+# instruction-level parallelism. And when comparing to aes-armv4
+# results keep in mind key schedule conversion overhead (see
+# bsaes-x86_64.pl for further details)...
+#
+# <appro@openssl.org>
+
+# April-August 2013
+#
+# Add CBC, CTR and XTS subroutines, adapt for kernel use.
+#
+# <ard.biesheuvel@linaro.org>
+
+while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
+open STDOUT,">$output";
+
+my ($inp,$out,$len,$key)=("r0","r1","r2","r3");
+my @XMM=map("q$_",(0..15));
+
+{
+my ($key,$rounds,$const)=("r4","r5","r6");
+
+sub Dlo() { shift=~m|q([1]?[0-9])|?"d".($1*2):""; }
+sub Dhi() { shift=~m|q([1]?[0-9])|?"d".($1*2+1):""; }
+
+sub Sbox {
+# input in lsb > [b0, b1, b2, b3, b4, b5, b6, b7] < msb
+# output in lsb > [b0, b1, b4, b6, b3, b7, b2, b5] < msb
+my @b=@_[0..7];
+my @t=@_[8..11];
+my @s=@_[12..15];
+ &InBasisChange (@b);
+ &Inv_GF256 (@b[6,5,0,3,7,1,4,2],@t,@s);
+ &OutBasisChange (@b[7,1,4,2,6,5,0,3]);
+}
+
+sub InBasisChange {
+# input in lsb > [b0, b1, b2, b3, b4, b5, b6, b7] < msb
+# output in lsb > [b6, b5, b0, b3, b7, b1, b4, b2] < msb
+my @b=@_[0..7];
+$code.=<<___;
+ veor @b[2], @b[2], @b[1]
+ veor @b[5], @b[5], @b[6]
+ veor @b[3], @b[3], @b[0]
+ veor @b[6], @b[6], @b[2]
+ veor @b[5], @b[5], @b[0]
+
+ veor @b[6], @b[6], @b[3]
+ veor @b[3], @b[3], @b[7]
+ veor @b[7], @b[7], @b[5]
+ veor @b[3], @b[3], @b[4]
+ veor @b[4], @b[4], @b[5]
+
+ veor @b[2], @b[2], @b[7]
+ veor @b[3], @b[3], @b[1]
+ veor @b[1], @b[1], @b[5]
+___
+}
+
+sub OutBasisChange {
+# input in lsb > [b0, b1, b2, b3, b4, b5, b6, b7] < msb
+# output in lsb > [b6, b1, b2, b4, b7, b0, b3, b5] < msb
+my @b=@_[0..7];
+$code.=<<___;
+ veor @b[0], @b[0], @b[6]
+ veor @b[1], @b[1], @b[4]
+ veor @b[4], @b[4], @b[6]
+ veor @b[2], @b[2], @b[0]
+ veor @b[6], @b[6], @b[1]
+
+ veor @b[1], @b[1], @b[5]
+ veor @b[5], @b[5], @b[3]
+ veor @b[3], @b[3], @b[7]
+ veor @b[7], @b[7], @b[5]
+ veor @b[2], @b[2], @b[5]
+
+ veor @b[4], @b[4], @b[7]
+___
+}
+
+sub InvSbox {
+# input in lsb > [b0, b1, b2, b3, b4, b5, b6, b7] < msb
+# output in lsb > [b0, b1, b6, b4, b2, b7, b3, b5] < msb
+my @b=@_[0..7];
+my @t=@_[8..11];
+my @s=@_[12..15];
+ &InvInBasisChange (@b);
+ &Inv_GF256 (@b[5,1,2,6,3,7,0,4],@t,@s);
+ &InvOutBasisChange (@b[3,7,0,4,5,1,2,6]);
+}
+
+sub InvInBasisChange { # OutBasisChange in reverse (with twist)
+my @b=@_[5,1,2,6,3,7,0,4];
+$code.=<<___
+ veor @b[1], @b[1], @b[7]
+ veor @b[4], @b[4], @b[7]
+
+ veor @b[7], @b[7], @b[5]
+ veor @b[1], @b[1], @b[3]
+ veor @b[2], @b[2], @b[5]
+ veor @b[3], @b[3], @b[7]
+
+ veor @b[6], @b[6], @b[1]
+ veor @b[2], @b[2], @b[0]
+ veor @b[5], @b[5], @b[3]
+ veor @b[4], @b[4], @b[6]
+ veor @b[0], @b[0], @b[6]
+ veor @b[1], @b[1], @b[4]
+___
+}
+
+sub InvOutBasisChange { # InBasisChange in reverse
+my @b=@_[2,5,7,3,6,1,0,4];
+$code.=<<___;
+ veor @b[1], @b[1], @b[5]
+ veor @b[2], @b[2], @b[7]
+
+ veor @b[3], @b[3], @b[1]
+ veor @b[4], @b[4], @b[5]
+ veor @b[7], @b[7], @b[5]
+ veor @b[3], @b[3], @b[4]
+ veor @b[5], @b[5], @b[0]
+ veor @b[3], @b[3], @b[7]
+ veor @b[6], @b[6], @b[2]
+ veor @b[2], @b[2], @b[1]
+ veor @b[6], @b[6], @b[3]
+
+ veor @b[3], @b[3], @b[0]
+ veor @b[5], @b[5], @b[6]
+___
+}
+
+sub Mul_GF4 {
+#;*************************************************************
+#;* Mul_GF4: Input x0-x1,y0-y1 Output x0-x1 Temp t0 (8) *
+#;*************************************************************
+my ($x0,$x1,$y0,$y1,$t0,$t1)=@_;
+$code.=<<___;
+ veor $t0, $y0, $y1
+ vand $t0, $t0, $x0
+ veor $x0, $x0, $x1
+ vand $t1, $x1, $y0
+ vand $x0, $x0, $y1
+ veor $x1, $t1, $t0
+ veor $x0, $x0, $t1
+___
+}
+
+sub Mul_GF4_N { # not used, see next subroutine
+# multiply and scale by N
+my ($x0,$x1,$y0,$y1,$t0)=@_;
+$code.=<<___;
+ veor $t0, $y0, $y1
+ vand $t0, $t0, $x0
+ veor $x0, $x0, $x1
+ vand $x1, $x1, $y0
+ vand $x0, $x0, $y1
+ veor $x1, $x1, $x0
+ veor $x0, $x0, $t0
+___
+}
+
+sub Mul_GF4_N_GF4 {
+# interleaved Mul_GF4_N and Mul_GF4
+my ($x0,$x1,$y0,$y1,$t0,
+ $x2,$x3,$y2,$y3,$t1)=@_;
+$code.=<<___;
+ veor $t0, $y0, $y1
+ veor $t1, $y2, $y3
+ vand $t0, $t0, $x0
+ vand $t1, $t1, $x2
+ veor $x0, $x0, $x1
+ veor $x2, $x2, $x3
+ vand $x1, $x1, $y0
+ vand $x3, $x3, $y2
+ vand $x0, $x0, $y1
+ vand $x2, $x2, $y3
+ veor $x1, $x1, $x0
+ veor $x2, $x2, $x3
+ veor $x0, $x0, $t0
+ veor $x3, $x3, $t1
+___
+}
+sub Mul_GF16_2 {
+my @x=@_[0..7];
+my @y=@_[8..11];
+my @t=@_[12..15];
+$code.=<<___;
+ veor @t[0], @x[0], @x[2]
+ veor @t[1], @x[1], @x[3]
+___
+ &Mul_GF4 (@x[0], @x[1], @y[0], @y[1], @t[2..3]);
+$code.=<<___;
+ veor @y[0], @y[0], @y[2]
+ veor @y[1], @y[1], @y[3]
+___
+ Mul_GF4_N_GF4 (@t[0], @t[1], @y[0], @y[1], @t[3],
+ @x[2], @x[3], @y[2], @y[3], @t[2]);
+$code.=<<___;
+ veor @x[0], @x[0], @t[0]
+ veor @x[2], @x[2], @t[0]
+ veor @x[1], @x[1], @t[1]
+ veor @x[3], @x[3], @t[1]
+
+ veor @t[0], @x[4], @x[6]
+ veor @t[1], @x[5], @x[7]
+___
+ &Mul_GF4_N_GF4 (@t[0], @t[1], @y[0], @y[1], @t[3],
+ @x[6], @x[7], @y[2], @y[3], @t[2]);
+$code.=<<___;
+ veor @y[0], @y[0], @y[2]
+ veor @y[1], @y[1], @y[3]
+___
+ &Mul_GF4 (@x[4], @x[5], @y[0], @y[1], @t[2..3]);
+$code.=<<___;
+ veor @x[4], @x[4], @t[0]
+ veor @x[6], @x[6], @t[0]
+ veor @x[5], @x[5], @t[1]
+ veor @x[7], @x[7], @t[1]
+___
+}
+sub Inv_GF256 {
+#;********************************************************************
+#;* Inv_GF256: Input x0-x7 Output x0-x7 Temp t0-t3,s0-s3 (144) *
+#;********************************************************************
+my @x=@_[0..7];
+my @t=@_[8..11];
+my @s=@_[12..15];
+# direct optimizations from hardware
+$code.=<<___;
+ veor @t[3], @x[4], @x[6]
+ veor @t[2], @x[5], @x[7]
+ veor @t[1], @x[1], @x[3]
+ veor @s[1], @x[7], @x[6]
+ vmov @t[0], @t[2]
+ veor @s[0], @x[0], @x[2]
+
+ vorr @t[2], @t[2], @t[1]
+ veor @s[3], @t[3], @t[0]
+ vand @s[2], @t[3], @s[0]
+ vorr @t[3], @t[3], @s[0]
+ veor @s[0], @s[0], @t[1]
+ vand @t[0], @t[0], @t[1]
+ veor @t[1], @x[3], @x[2]
+ vand @s[3], @s[3], @s[0]
+ vand @s[1], @s[1], @t[1]
+ veor @t[1], @x[4], @x[5]
+ veor @s[0], @x[1], @x[0]
+ veor @t[3], @t[3], @s[1]
+ veor @t[2], @t[2], @s[1]
+ vand @s[1], @t[1], @s[0]
+ vorr @t[1], @t[1], @s[0]
+ veor @t[3], @t[3], @s[3]
+ veor @t[0], @t[0], @s[1]
+ veor @t[2], @t[2], @s[2]
+ veor @t[1], @t[1], @s[3]
+ veor @t[0], @t[0], @s[2]
+ vand @s[0], @x[7], @x[3]
+ veor @t[1], @t[1], @s[2]
+ vand @s[1], @x[6], @x[2]
+ vand @s[2], @x[5], @x[1]
+ vorr @s[3], @x[4], @x[0]
+ veor @t[3], @t[3], @s[0]
+ veor @t[1], @t[1], @s[2]
+ veor @t[0], @t[0], @s[3]
+ veor @t[2], @t[2], @s[1]
+
+ @ Inv_GF16 \t0, \t1, \t2, \t3, \s0, \s1, \s2, \s3
+
+ @ new smaller inversion
+
+ vand @s[2], @t[3], @t[1]
+ vmov @s[0], @t[0]
+
+ veor @s[1], @t[2], @s[2]
+ veor @s[3], @t[0], @s[2]
+ veor @s[2], @t[0], @s[2] @ @s[2]=@s[3]
+
+ vbsl @s[1], @t[1], @t[0]
+ vbsl @s[3], @t[3], @t[2]
+ veor @t[3], @t[3], @t[2]
+
+ vbsl @s[0], @s[1], @s[2]
+ vbsl @t[0], @s[2], @s[1]
+
+ vand @s[2], @s[0], @s[3]
+ veor @t[1], @t[1], @t[0]
+
+ veor @s[2], @s[2], @t[3]
+___
+# output in s3, s2, s1, t1
+
+# Mul_GF16_2 \x0, \x1, \x2, \x3, \x4, \x5, \x6, \x7, \t2, \t3, \t0, \t1, \s0, \s1, \s2, \s3
+
+# Mul_GF16_2 \x0, \x1, \x2, \x3, \x4, \x5, \x6, \x7, \s3, \s2, \s1, \t1, \s0, \t0, \t2, \t3
+ &Mul_GF16_2(@x,@s[3,2,1],@t[1],@s[0],@t[0,2,3]);
+
+### output msb > [x3,x2,x1,x0,x7,x6,x5,x4] < lsb
+}
+
+# AES linear components
+
+sub ShiftRows {
+my @x=@_[0..7];
+my @t=@_[8..11];
+my $mask=pop;
+$code.=<<___;
+ vldmia $key!, {@t[0]-@t[3]}
+ veor @t[0], @t[0], @x[0]
+ veor @t[1], @t[1], @x[1]
+ vtbl.8 `&Dlo(@x[0])`, {@t[0]}, `&Dlo($mask)`
+ vtbl.8 `&Dhi(@x[0])`, {@t[0]}, `&Dhi($mask)`
+ vldmia $key!, {@t[0]}
+ veor @t[2], @t[2], @x[2]
+ vtbl.8 `&Dlo(@x[1])`, {@t[1]}, `&Dlo($mask)`
+ vtbl.8 `&Dhi(@x[1])`, {@t[1]}, `&Dhi($mask)`
+ vldmia $key!, {@t[1]}
+ veor @t[3], @t[3], @x[3]
+ vtbl.8 `&Dlo(@x[2])`, {@t[2]}, `&Dlo($mask)`
+ vtbl.8 `&Dhi(@x[2])`, {@t[2]}, `&Dhi($mask)`
+ vldmia $key!, {@t[2]}
+ vtbl.8 `&Dlo(@x[3])`, {@t[3]}, `&Dlo($mask)`
+ vtbl.8 `&Dhi(@x[3])`, {@t[3]}, `&Dhi($mask)`
+ vldmia $key!, {@t[3]}
+ veor @t[0], @t[0], @x[4]
+ veor @t[1], @t[1], @x[5]
+ vtbl.8 `&Dlo(@x[4])`, {@t[0]}, `&Dlo($mask)`
+ vtbl.8 `&Dhi(@x[4])`, {@t[0]}, `&Dhi($mask)`
+ veor @t[2], @t[2], @x[6]
+ vtbl.8 `&Dlo(@x[5])`, {@t[1]}, `&Dlo($mask)`
+ vtbl.8 `&Dhi(@x[5])`, {@t[1]}, `&Dhi($mask)`
+ veor @t[3], @t[3], @x[7]
+ vtbl.8 `&Dlo(@x[6])`, {@t[2]}, `&Dlo($mask)`
+ vtbl.8 `&Dhi(@x[6])`, {@t[2]}, `&Dhi($mask)`
+ vtbl.8 `&Dlo(@x[7])`, {@t[3]}, `&Dlo($mask)`
+ vtbl.8 `&Dhi(@x[7])`, {@t[3]}, `&Dhi($mask)`
+___
+}
+
+sub MixColumns {
+# modified to emit output in order suitable for feeding back to aesenc[last]
+my @x=@_[0..7];
+my @t=@_[8..15];
+my $inv=@_[16]; # optional
+$code.=<<___;
+ vext.8 @t[0], @x[0], @x[0], #12 @ x0 <<< 32
+ vext.8 @t[1], @x[1], @x[1], #12
+ veor @x[0], @x[0], @t[0] @ x0 ^ (x0 <<< 32)
+ vext.8 @t[2], @x[2], @x[2], #12
+ veor @x[1], @x[1], @t[1]
+ vext.8 @t[3], @x[3], @x[3], #12
+ veor @x[2], @x[2], @t[2]
+ vext.8 @t[4], @x[4], @x[4], #12
+ veor @x[3], @x[3], @t[3]
+ vext.8 @t[5], @x[5], @x[5], #12
+ veor @x[4], @x[4], @t[4]
+ vext.8 @t[6], @x[6], @x[6], #12
+ veor @x[5], @x[5], @t[5]
+ vext.8 @t[7], @x[7], @x[7], #12
+ veor @x[6], @x[6], @t[6]
+
+ veor @t[1], @t[1], @x[0]
+ veor @x[7], @x[7], @t[7]
+ vext.8 @x[0], @x[0], @x[0], #8 @ (x0 ^ (x0 <<< 32)) <<< 64)
+ veor @t[2], @t[2], @x[1]
+ veor @t[0], @t[0], @x[7]
+ veor @t[1], @t[1], @x[7]
+ vext.8 @x[1], @x[1], @x[1], #8
+ veor @t[5], @t[5], @x[4]
+ veor @x[0], @x[0], @t[0]
+ veor @t[6], @t[6], @x[5]
+ veor @x[1], @x[1], @t[1]
+ vext.8 @t[0], @x[4], @x[4], #8
+ veor @t[4], @t[4], @x[3]
+ vext.8 @t[1], @x[5], @x[5], #8
+ veor @t[7], @t[7], @x[6]
+ vext.8 @x[4], @x[3], @x[3], #8
+ veor @t[3], @t[3], @x[2]
+ vext.8 @x[5], @x[7], @x[7], #8
+ veor @t[4], @t[4], @x[7]
+ vext.8 @x[3], @x[6], @x[6], #8
+ veor @t[3], @t[3], @x[7]
+ vext.8 @x[6], @x[2], @x[2], #8
+ veor @x[7], @t[1], @t[5]
+___
+$code.=<<___ if (!$inv);
+ veor @x[2], @t[0], @t[4]
+ veor @x[4], @x[4], @t[3]
+ veor @x[5], @x[5], @t[7]
+ veor @x[3], @x[3], @t[6]
+ @ vmov @x[2], @t[0]
+ veor @x[6], @x[6], @t[2]
+ @ vmov @x[7], @t[1]
+___
+$code.=<<___ if ($inv);
+ veor @t[3], @t[3], @x[4]
+ veor @x[5], @x[5], @t[7]
+ veor @x[2], @x[3], @t[6]
+ veor @x[3], @t[0], @t[4]
+ veor @x[4], @x[6], @t[2]
+ vmov @x[6], @t[3]
+ @ vmov @x[7], @t[1]
+___
+}
+
+sub InvMixColumns_orig {
+my @x=@_[0..7];
+my @t=@_[8..15];
+
+$code.=<<___;
+ @ multiplication by 0x0e
+ vext.8 @t[7], @x[7], @x[7], #12
+ vmov @t[2], @x[2]
+ veor @x[2], @x[2], @x[5] @ 2 5
+ veor @x[7], @x[7], @x[5] @ 7 5
+ vext.8 @t[0], @x[0], @x[0], #12
+ vmov @t[5], @x[5]
+ veor @x[5], @x[5], @x[0] @ 5 0 [1]
+ veor @x[0], @x[0], @x[1] @ 0 1
+ vext.8 @t[1], @x[1], @x[1], #12
+ veor @x[1], @x[1], @x[2] @ 1 25
+ veor @x[0], @x[0], @x[6] @ 01 6 [2]
+ vext.8 @t[3], @x[3], @x[3], #12
+ veor @x[1], @x[1], @x[3] @ 125 3 [4]
+ veor @x[2], @x[2], @x[0] @ 25 016 [3]
+ veor @x[3], @x[3], @x[7] @ 3 75
+ veor @x[7], @x[7], @x[6] @ 75 6 [0]
+ vext.8 @t[6], @x[6], @x[6], #12
+ vmov @t[4], @x[4]
+ veor @x[6], @x[6], @x[4] @ 6 4
+ veor @x[4], @x[4], @x[3] @ 4 375 [6]
+ veor @x[3], @x[3], @x[7] @ 375 756=36
+ veor @x[6], @x[6], @t[5] @ 64 5 [7]
+ veor @x[3], @x[3], @t[2] @ 36 2
+ vext.8 @t[5], @t[5], @t[5], #12
+ veor @x[3], @x[3], @t[4] @ 362 4 [5]
+___
+ my @y = @x[7,5,0,2,1,3,4,6];
+$code.=<<___;
+ @ multiplication by 0x0b
+ veor @y[1], @y[1], @y[0]
+ veor @y[0], @y[0], @t[0]
+ vext.8 @t[2], @t[2], @t[2], #12
+ veor @y[1], @y[1], @t[1]
+ veor @y[0], @y[0], @t[5]
+ vext.8 @t[4], @t[4], @t[4], #12
+ veor @y[1], @y[1], @t[6]
+ veor @y[0], @y[0], @t[7]
+ veor @t[7], @t[7], @t[6] @ clobber t[7]
+
+ veor @y[3], @y[3], @t[0]
+ veor @y[1], @y[1], @y[0]
+ vext.8 @t[0], @t[0], @t[0], #12
+ veor @y[2], @y[2], @t[1]
+ veor @y[4], @y[4], @t[1]
+ vext.8 @t[1], @t[1], @t[1], #12
+ veor @y[2], @y[2], @t[2]
+ veor @y[3], @y[3], @t[2]
+ veor @y[5], @y[5], @t[2]
+ veor @y[2], @y[2], @t[7]
+ vext.8 @t[2], @t[2], @t[2], #12
+ veor @y[3], @y[3], @t[3]
+ veor @y[6], @y[6], @t[3]
+ veor @y[4], @y[4], @t[3]
+ veor @y[7], @y[7], @t[4]
+ vext.8 @t[3], @t[3], @t[3], #12
+ veor @y[5], @y[5], @t[4]
+ veor @y[7], @y[7], @t[7]
+ veor @t[7], @t[7], @t[5] @ clobber t[7] even more
+ veor @y[3], @y[3], @t[5]
+ veor @y[4], @y[4], @t[4]
+
+ veor @y[5], @y[5], @t[7]
+ vext.8 @t[4], @t[4], @t[4], #12
+ veor @y[6], @y[6], @t[7]
+ veor @y[4], @y[4], @t[7]
+
+ veor @t[7], @t[7], @t[5]
+ vext.8 @t[5], @t[5], @t[5], #12
+
+ @ multiplication by 0x0d
+ veor @y[4], @y[4], @y[7]
+ veor @t[7], @t[7], @t[6] @ restore t[7]
+ veor @y[7], @y[7], @t[4]
+ vext.8 @t[6], @t[6], @t[6], #12
+ veor @y[2], @y[2], @t[0]
+ veor @y[7], @y[7], @t[5]
+ vext.8 @t[7], @t[7], @t[7], #12
+ veor @y[2], @y[2], @t[2]
+
+ veor @y[3], @y[3], @y[1]
+ veor @y[1], @y[1], @t[1]
+ veor @y[0], @y[0], @t[0]
+ veor @y[3], @y[3], @t[0]
+ veor @y[1], @y[1], @t[5]
+ veor @y[0], @y[0], @t[5]
+ vext.8 @t[0], @t[0], @t[0], #12
+ veor @y[1], @y[1], @t[7]
+ veor @y[0], @y[0], @t[6]
+ veor @y[3], @y[3], @y[1]
+ veor @y[4], @y[4], @t[1]
+ vext.8 @t[1], @t[1], @t[1], #12
+
+ veor @y[7], @y[7], @t[7]
+ veor @y[4], @y[4], @t[2]
+ veor @y[5], @y[5], @t[2]
+ veor @y[2], @y[2], @t[6]
+ veor @t[6], @t[6], @t[3] @ clobber t[6]
+ vext.8 @t[2], @t[2], @t[2], #12
+ veor @y[4], @y[4], @y[7]
+ veor @y[3], @y[3], @t[6]
+
+ veor @y[6], @y[6], @t[6]
+ veor @y[5], @y[5], @t[5]
+ vext.8 @t[5], @t[5], @t[5], #12
+ veor @y[6], @y[6], @t[4]
+ vext.8 @t[4], @t[4], @t[4], #12
+ veor @y[5], @y[5], @t[6]
+ veor @y[6], @y[6], @t[7]
+ vext.8 @t[7], @t[7], @t[7], #12
+ veor @t[6], @t[6], @t[3] @ restore t[6]
+ vext.8 @t[3], @t[3], @t[3], #12
+
+ @ multiplication by 0x09
+ veor @y[4], @y[4], @y[1]
+ veor @t[1], @t[1], @y[1] @ t[1]=y[1]
+ veor @t[0], @t[0], @t[5] @ clobber t[0]
+ vext.8 @t[6], @t[6], @t[6], #12
+ veor @t[1], @t[1], @t[5]
+ veor @y[3], @y[3], @t[0]
+ veor @t[0], @t[0], @y[0] @ t[0]=y[0]
+ veor @t[1], @t[1], @t[6]
+ veor @t[6], @t[6], @t[7] @ clobber t[6]
+ veor @y[4], @y[4], @t[1]
+ veor @y[7], @y[7], @t[4]
+ veor @y[6], @y[6], @t[3]
+ veor @y[5], @y[5], @t[2]
+ veor @t[4], @t[4], @y[4] @ t[4]=y[4]
+ veor @t[3], @t[3], @y[3] @ t[3]=y[3]
+ veor @t[5], @t[5], @y[5] @ t[5]=y[5]
+ veor @t[2], @t[2], @y[2] @ t[2]=y[2]
+ veor @t[3], @t[3], @t[7]
+ veor @XMM[5], @t[5], @t[6]
+ veor @XMM[6], @t[6], @y[6] @ t[6]=y[6]
+ veor @XMM[2], @t[2], @t[6]
+ veor @XMM[7], @t[7], @y[7] @ t[7]=y[7]
+
+ vmov @XMM[0], @t[0]
+ vmov @XMM[1], @t[1]
+ @ vmov @XMM[2], @t[2]
+ vmov @XMM[3], @t[3]
+ vmov @XMM[4], @t[4]
+ @ vmov @XMM[5], @t[5]
+ @ vmov @XMM[6], @t[6]
+ @ vmov @XMM[7], @t[7]
+___
+}
+
+sub InvMixColumns {
+my @x=@_[0..7];
+my @t=@_[8..15];
+
+# Thanks to Jussi Kivilinna for providing pointer to
+#
+# | 0e 0b 0d 09 | | 02 03 01 01 | | 05 00 04 00 |
+# | 09 0e 0b 0d | = | 01 02 03 01 | x | 00 05 00 04 |
+# | 0d 09 0e 0b | | 01 01 02 03 | | 04 00 05 00 |
+# | 0b 0d 09 0e | | 03 01 01 02 | | 00 04 00 05 |
+
+$code.=<<___;
+ @ multiplication by 0x05-0x00-0x04-0x00
+ vext.8 @t[0], @x[0], @x[0], #8
+ vext.8 @t[6], @x[6], @x[6], #8
+ vext.8 @t[7], @x[7], @x[7], #8
+ veor @t[0], @t[0], @x[0]
+ vext.8 @t[1], @x[1], @x[1], #8
+ veor @t[6], @t[6], @x[6]
+ vext.8 @t[2], @x[2], @x[2], #8
+ veor @t[7], @t[7], @x[7]
+ vext.8 @t[3], @x[3], @x[3], #8
+ veor @t[1], @t[1], @x[1]
+ vext.8 @t[4], @x[4], @x[4], #8
+ veor @t[2], @t[2], @x[2]
+ vext.8 @t[5], @x[5], @x[5], #8
+ veor @t[3], @t[3], @x[3]
+ veor @t[4], @t[4], @x[4]
+ veor @t[5], @t[5], @x[5]
+
+ veor @x[0], @x[0], @t[6]
+ veor @x[1], @x[1], @t[6]
+ veor @x[2], @x[2], @t[0]
+ veor @x[4], @x[4], @t[2]
+ veor @x[3], @x[3], @t[1]
+ veor @x[1], @x[1], @t[7]
+ veor @x[2], @x[2], @t[7]
+ veor @x[4], @x[4], @t[6]
+ veor @x[5], @x[5], @t[3]
+ veor @x[3], @x[3], @t[6]
+ veor @x[6], @x[6], @t[4]
+ veor @x[4], @x[4], @t[7]
+ veor @x[5], @x[5], @t[7]
+ veor @x[7], @x[7], @t[5]
+___
+ &MixColumns (@x,@t,1); # flipped 2<->3 and 4<->6
+}
+
+sub swapmove {
+my ($a,$b,$n,$mask,$t)=@_;
+$code.=<<___;
+ vshr.u64 $t, $b, #$n
+ veor $t, $t, $a
+ vand $t, $t, $mask
+ veor $a, $a, $t
+ vshl.u64 $t, $t, #$n
+ veor $b, $b, $t
+___
+}
+sub swapmove2x {
+my ($a0,$b0,$a1,$b1,$n,$mask,$t0,$t1)=@_;
+$code.=<<___;
+ vshr.u64 $t0, $b0, #$n
+ vshr.u64 $t1, $b1, #$n
+ veor $t0, $t0, $a0
+ veor $t1, $t1, $a1
+ vand $t0, $t0, $mask
+ vand $t1, $t1, $mask
+ veor $a0, $a0, $t0
+ vshl.u64 $t0, $t0, #$n
+ veor $a1, $a1, $t1
+ vshl.u64 $t1, $t1, #$n
+ veor $b0, $b0, $t0
+ veor $b1, $b1, $t1
+___
+}
+
+sub bitslice {
+my @x=reverse(@_[0..7]);
+my ($t0,$t1,$t2,$t3)=@_[8..11];
+$code.=<<___;
+ vmov.i8 $t0,#0x55 @ compose .LBS0
+ vmov.i8 $t1,#0x33 @ compose .LBS1
+___
+ &swapmove2x(@x[0,1,2,3],1,$t0,$t2,$t3);
+ &swapmove2x(@x[4,5,6,7],1,$t0,$t2,$t3);
+$code.=<<___;
+ vmov.i8 $t0,#0x0f @ compose .LBS2
+___
+ &swapmove2x(@x[0,2,1,3],2,$t1,$t2,$t3);
+ &swapmove2x(@x[4,6,5,7],2,$t1,$t2,$t3);
+
+ &swapmove2x(@x[0,4,1,5],4,$t0,$t2,$t3);
+ &swapmove2x(@x[2,6,3,7],4,$t0,$t2,$t3);
+}
+
+$code.=<<___;
+#ifndef __KERNEL__
+# include "arm_arch.h"
+
+# define VFP_ABI_PUSH vstmdb sp!,{d8-d15}
+# define VFP_ABI_POP vldmia sp!,{d8-d15}
+# define VFP_ABI_FRAME 0x40
+#else
+# define VFP_ABI_PUSH
+# define VFP_ABI_POP
+# define VFP_ABI_FRAME 0
+# define BSAES_ASM_EXTENDED_KEY
+# define XTS_CHAIN_TWEAK
+# define __ARM_ARCH__ __LINUX_ARM_ARCH__
+#endif
+
+#ifdef __thumb__
+# define adrl adr
+#endif
+
+#if __ARM_ARCH__>=7
+.text
+.syntax unified @ ARMv7-capable assembler is expected to handle this
+#ifdef __thumb2__
+.thumb
+#else
+.code 32
+#endif
+
+.fpu neon
+
+.type _bsaes_decrypt8,%function
+.align 4
+_bsaes_decrypt8:
+ adr $const,_bsaes_decrypt8
+ vldmia $key!, {@XMM[9]} @ round 0 key
+ add $const,$const,#.LM0ISR-_bsaes_decrypt8
+
+ vldmia $const!, {@XMM[8]} @ .LM0ISR
+ veor @XMM[10], @XMM[0], @XMM[9] @ xor with round0 key
+ veor @XMM[11], @XMM[1], @XMM[9]
+ vtbl.8 `&Dlo(@XMM[0])`, {@XMM[10]}, `&Dlo(@XMM[8])`
+ vtbl.8 `&Dhi(@XMM[0])`, {@XMM[10]}, `&Dhi(@XMM[8])`
+ veor @XMM[12], @XMM[2], @XMM[9]
+ vtbl.8 `&Dlo(@XMM[1])`, {@XMM[11]}, `&Dlo(@XMM[8])`
+ vtbl.8 `&Dhi(@XMM[1])`, {@XMM[11]}, `&Dhi(@XMM[8])`
+ veor @XMM[13], @XMM[3], @XMM[9]
+ vtbl.8 `&Dlo(@XMM[2])`, {@XMM[12]}, `&Dlo(@XMM[8])`
+ vtbl.8 `&Dhi(@XMM[2])`, {@XMM[12]}, `&Dhi(@XMM[8])`
+ veor @XMM[14], @XMM[4], @XMM[9]
+ vtbl.8 `&Dlo(@XMM[3])`, {@XMM[13]}, `&Dlo(@XMM[8])`
+ vtbl.8 `&Dhi(@XMM[3])`, {@XMM[13]}, `&Dhi(@XMM[8])`
+ veor @XMM[15], @XMM[5], @XMM[9]
+ vtbl.8 `&Dlo(@XMM[4])`, {@XMM[14]}, `&Dlo(@XMM[8])`
+ vtbl.8 `&Dhi(@XMM[4])`, {@XMM[14]}, `&Dhi(@XMM[8])`
+ veor @XMM[10], @XMM[6], @XMM[9]
+ vtbl.8 `&Dlo(@XMM[5])`, {@XMM[15]}, `&Dlo(@XMM[8])`
+ vtbl.8 `&Dhi(@XMM[5])`, {@XMM[15]}, `&Dhi(@XMM[8])`
+ veor @XMM[11], @XMM[7], @XMM[9]
+ vtbl.8 `&Dlo(@XMM[6])`, {@XMM[10]}, `&Dlo(@XMM[8])`
+ vtbl.8 `&Dhi(@XMM[6])`, {@XMM[10]}, `&Dhi(@XMM[8])`
+ vtbl.8 `&Dlo(@XMM[7])`, {@XMM[11]}, `&Dlo(@XMM[8])`
+ vtbl.8 `&Dhi(@XMM[7])`, {@XMM[11]}, `&Dhi(@XMM[8])`
+___
+ &bitslice (@XMM[0..7, 8..11]);
+$code.=<<___;
+ sub $rounds,$rounds,#1
+ b .Ldec_sbox
+.align 4
+.Ldec_loop:
+___
+ &ShiftRows (@XMM[0..7, 8..12]);
+$code.=".Ldec_sbox:\n";
+ &InvSbox (@XMM[0..7, 8..15]);
+$code.=<<___;
+ subs $rounds,$rounds,#1
+ bcc .Ldec_done
+___
+ &InvMixColumns (@XMM[0,1,6,4,2,7,3,5, 8..15]);
+$code.=<<___;
+ vldmia $const, {@XMM[12]} @ .LISR
+ ite eq @ Thumb2 thing, sanity check in ARM
+ addeq $const,$const,#0x10
+ bne .Ldec_loop
+ vldmia $const, {@XMM[12]} @ .LISRM0
+ b .Ldec_loop
+.align 4
+.Ldec_done:
+___
+ &bitslice (@XMM[0,1,6,4,2,7,3,5, 8..11]);
+$code.=<<___;
+ vldmia $key, {@XMM[8]} @ last round key
+ veor @XMM[6], @XMM[6], @XMM[8]
+ veor @XMM[4], @XMM[4], @XMM[8]
+ veor @XMM[2], @XMM[2], @XMM[8]
+ veor @XMM[7], @XMM[7], @XMM[8]
+ veor @XMM[3], @XMM[3], @XMM[8]
+ veor @XMM[5], @XMM[5], @XMM[8]
+ veor @XMM[0], @XMM[0], @XMM[8]
+ veor @XMM[1], @XMM[1], @XMM[8]
+ bx lr
+.size _bsaes_decrypt8,.-_bsaes_decrypt8
+
+.type _bsaes_const,%object
+.align 6
+_bsaes_const:
+.LM0ISR: @ InvShiftRows constants
+ .quad 0x0a0e0206070b0f03, 0x0004080c0d010509
+.LISR:
+ .quad 0x0504070602010003, 0x0f0e0d0c080b0a09
+.LISRM0:
+ .quad 0x01040b0e0205080f, 0x0306090c00070a0d
+.LM0SR: @ ShiftRows constants
+ .quad 0x0a0e02060f03070b, 0x0004080c05090d01
+.LSR:
+ .quad 0x0504070600030201, 0x0f0e0d0c0a09080b
+.LSRM0:
+ .quad 0x0304090e00050a0f, 0x01060b0c0207080d
+.LM0:
+ .quad 0x02060a0e03070b0f, 0x0004080c0105090d
+.LREVM0SR:
+ .quad 0x090d01050c000408, 0x03070b0f060a0e02
+.asciz "Bit-sliced AES for NEON, CRYPTOGAMS by <appro\@openssl.org>"
+.align 6
+.size _bsaes_const,.-_bsaes_const
+
+.type _bsaes_encrypt8,%function
+.align 4
+_bsaes_encrypt8:
+ adr $const,_bsaes_encrypt8
+ vldmia $key!, {@XMM[9]} @ round 0 key
+ sub $const,$const,#_bsaes_encrypt8-.LM0SR
+
+ vldmia $const!, {@XMM[8]} @ .LM0SR
+_bsaes_encrypt8_alt:
+ veor @XMM[10], @XMM[0], @XMM[9] @ xor with round0 key
+ veor @XMM[11], @XMM[1], @XMM[9]
+ vtbl.8 `&Dlo(@XMM[0])`, {@XMM[10]}, `&Dlo(@XMM[8])`
+ vtbl.8 `&Dhi(@XMM[0])`, {@XMM[10]}, `&Dhi(@XMM[8])`
+ veor @XMM[12], @XMM[2], @XMM[9]
+ vtbl.8 `&Dlo(@XMM[1])`, {@XMM[11]}, `&Dlo(@XMM[8])`
+ vtbl.8 `&Dhi(@XMM[1])`, {@XMM[11]}, `&Dhi(@XMM[8])`
+ veor @XMM[13], @XMM[3], @XMM[9]
+ vtbl.8 `&Dlo(@XMM[2])`, {@XMM[12]}, `&Dlo(@XMM[8])`
+ vtbl.8 `&Dhi(@XMM[2])`, {@XMM[12]}, `&Dhi(@XMM[8])`
+ veor @XMM[14], @XMM[4], @XMM[9]
+ vtbl.8 `&Dlo(@XMM[3])`, {@XMM[13]}, `&Dlo(@XMM[8])`
+ vtbl.8 `&Dhi(@XMM[3])`, {@XMM[13]}, `&Dhi(@XMM[8])`
+ veor @XMM[15], @XMM[5], @XMM[9]
+ vtbl.8 `&Dlo(@XMM[4])`, {@XMM[14]}, `&Dlo(@XMM[8])`
+ vtbl.8 `&Dhi(@XMM[4])`, {@XMM[14]}, `&Dhi(@XMM[8])`
+ veor @XMM[10], @XMM[6], @XMM[9]
+ vtbl.8 `&Dlo(@XMM[5])`, {@XMM[15]}, `&Dlo(@XMM[8])`
+ vtbl.8 `&Dhi(@XMM[5])`, {@XMM[15]}, `&Dhi(@XMM[8])`
+ veor @XMM[11], @XMM[7], @XMM[9]
+ vtbl.8 `&Dlo(@XMM[6])`, {@XMM[10]}, `&Dlo(@XMM[8])`
+ vtbl.8 `&Dhi(@XMM[6])`, {@XMM[10]}, `&Dhi(@XMM[8])`
+ vtbl.8 `&Dlo(@XMM[7])`, {@XMM[11]}, `&Dlo(@XMM[8])`
+ vtbl.8 `&Dhi(@XMM[7])`, {@XMM[11]}, `&Dhi(@XMM[8])`
+_bsaes_encrypt8_bitslice:
+___
+ &bitslice (@XMM[0..7, 8..11]);
+$code.=<<___;
+ sub $rounds,$rounds,#1
+ b .Lenc_sbox
+.align 4
+.Lenc_loop:
+___
+ &ShiftRows (@XMM[0..7, 8..12]);
+$code.=".Lenc_sbox:\n";
+ &Sbox (@XMM[0..7, 8..15]);
+$code.=<<___;
+ subs $rounds,$rounds,#1
+ bcc .Lenc_done
+___
+ &MixColumns (@XMM[0,1,4,6,3,7,2,5, 8..15]);
+$code.=<<___;
+ vldmia $const, {@XMM[12]} @ .LSR
+ ite eq @ Thumb2 thing, samity check in ARM
+ addeq $const,$const,#0x10
+ bne .Lenc_loop
+ vldmia $const, {@XMM[12]} @ .LSRM0
+ b .Lenc_loop
+.align 4
+.Lenc_done:
+___
+ # output in lsb > [t0, t1, t4, t6, t3, t7, t2, t5] < msb
+ &bitslice (@XMM[0,1,4,6,3,7,2,5, 8..11]);
+$code.=<<___;
+ vldmia $key, {@XMM[8]} @ last round key
+ veor @XMM[4], @XMM[4], @XMM[8]
+ veor @XMM[6], @XMM[6], @XMM[8]
+ veor @XMM[3], @XMM[3], @XMM[8]
+ veor @XMM[7], @XMM[7], @XMM[8]
+ veor @XMM[2], @XMM[2], @XMM[8]
+ veor @XMM[5], @XMM[5], @XMM[8]
+ veor @XMM[0], @XMM[0], @XMM[8]
+ veor @XMM[1], @XMM[1], @XMM[8]
+ bx lr
+.size _bsaes_encrypt8,.-_bsaes_encrypt8
+___
+}
+{
+my ($out,$inp,$rounds,$const)=("r12","r4","r5","r6");
+
+sub bitslice_key {
+my @x=reverse(@_[0..7]);
+my ($bs0,$bs1,$bs2,$t2,$t3)=@_[8..12];
+
+ &swapmove (@x[0,1],1,$bs0,$t2,$t3);
+$code.=<<___;
+ @ &swapmove(@x[2,3],1,$t0,$t2,$t3);
+ vmov @x[2], @x[0]
+ vmov @x[3], @x[1]
+___
+ #&swapmove2x(@x[4,5,6,7],1,$t0,$t2,$t3);
+
+ &swapmove2x (@x[0,2,1,3],2,$bs1,$t2,$t3);
+$code.=<<___;
+ @ &swapmove2x(@x[4,6,5,7],2,$t1,$t2,$t3);
+ vmov @x[4], @x[0]
+ vmov @x[6], @x[2]
+ vmov @x[5], @x[1]
+ vmov @x[7], @x[3]
+___
+ &swapmove2x (@x[0,4,1,5],4,$bs2,$t2,$t3);
+ &swapmove2x (@x[2,6,3,7],4,$bs2,$t2,$t3);
+}
+
+$code.=<<___;
+.type _bsaes_key_convert,%function
+.align 4
+_bsaes_key_convert:
+ adr $const,_bsaes_key_convert
+ vld1.8 {@XMM[7]}, [$inp]! @ load round 0 key
+ sub $const,$const,#_bsaes_key_convert-.LM0
+ vld1.8 {@XMM[15]}, [$inp]! @ load round 1 key
+
+ vmov.i8 @XMM[8], #0x01 @ bit masks
+ vmov.i8 @XMM[9], #0x02
+ vmov.i8 @XMM[10], #0x04
+ vmov.i8 @XMM[11], #0x08
+ vmov.i8 @XMM[12], #0x10
+ vmov.i8 @XMM[13], #0x20
+ vldmia $const, {@XMM[14]} @ .LM0
+
+#ifdef __ARMEL__
+ vrev32.8 @XMM[7], @XMM[7]
+ vrev32.8 @XMM[15], @XMM[15]
+#endif
+ sub $rounds,$rounds,#1
+ vstmia $out!, {@XMM[7]} @ save round 0 key
+ b .Lkey_loop
+
+.align 4
+.Lkey_loop:
+ vtbl.8 `&Dlo(@XMM[7])`,{@XMM[15]},`&Dlo(@XMM[14])`
+ vtbl.8 `&Dhi(@XMM[7])`,{@XMM[15]},`&Dhi(@XMM[14])`
+ vmov.i8 @XMM[6], #0x40
+ vmov.i8 @XMM[15], #0x80
+
+ vtst.8 @XMM[0], @XMM[7], @XMM[8]
+ vtst.8 @XMM[1], @XMM[7], @XMM[9]
+ vtst.8 @XMM[2], @XMM[7], @XMM[10]
+ vtst.8 @XMM[3], @XMM[7], @XMM[11]
+ vtst.8 @XMM[4], @XMM[7], @XMM[12]
+ vtst.8 @XMM[5], @XMM[7], @XMM[13]
+ vtst.8 @XMM[6], @XMM[7], @XMM[6]
+ vtst.8 @XMM[7], @XMM[7], @XMM[15]
+ vld1.8 {@XMM[15]}, [$inp]! @ load next round key
+ vmvn @XMM[0], @XMM[0] @ "pnot"
+ vmvn @XMM[1], @XMM[1]
+ vmvn @XMM[5], @XMM[5]
+ vmvn @XMM[6], @XMM[6]
+#ifdef __ARMEL__
+ vrev32.8 @XMM[15], @XMM[15]
+#endif
+ subs $rounds,$rounds,#1
+ vstmia $out!,{@XMM[0]-@XMM[7]} @ write bit-sliced round key
+ bne .Lkey_loop
+
+ vmov.i8 @XMM[7],#0x63 @ compose .L63
+ @ don't save last round key
+ bx lr
+.size _bsaes_key_convert,.-_bsaes_key_convert
+___
+}
+
+if (0) { # following four functions are unsupported interface
+ # used for benchmarking...
+$code.=<<___;
+.globl bsaes_enc_key_convert
+.type bsaes_enc_key_convert,%function
+.align 4
+bsaes_enc_key_convert:
+ stmdb sp!,{r4-r6,lr}
+ vstmdb sp!,{d8-d15} @ ABI specification says so
+
+ ldr r5,[$inp,#240] @ pass rounds
+ mov r4,$inp @ pass key
+ mov r12,$out @ pass key schedule
+ bl _bsaes_key_convert
+ veor @XMM[7],@XMM[7],@XMM[15] @ fix up last round key
+ vstmia r12, {@XMM[7]} @ save last round key
+
+ vldmia sp!,{d8-d15}
+ ldmia sp!,{r4-r6,pc}
+.size bsaes_enc_key_convert,.-bsaes_enc_key_convert
+
+.globl bsaes_encrypt_128
+.type bsaes_encrypt_128,%function
+.align 4
+bsaes_encrypt_128:
+ stmdb sp!,{r4-r6,lr}
+ vstmdb sp!,{d8-d15} @ ABI specification says so
+.Lenc128_loop:
+ vld1.8 {@XMM[0]-@XMM[1]}, [$inp]! @ load input
+ vld1.8 {@XMM[2]-@XMM[3]}, [$inp]!
+ mov r4,$key @ pass the key
+ vld1.8 {@XMM[4]-@XMM[5]}, [$inp]!
+ mov r5,#10 @ pass rounds
+ vld1.8 {@XMM[6]-@XMM[7]}, [$inp]!
+
+ bl _bsaes_encrypt8
+
+ vst1.8 {@XMM[0]-@XMM[1]}, [$out]! @ write output
+ vst1.8 {@XMM[4]}, [$out]!
+ vst1.8 {@XMM[6]}, [$out]!
+ vst1.8 {@XMM[3]}, [$out]!
+ vst1.8 {@XMM[7]}, [$out]!
+ vst1.8 {@XMM[2]}, [$out]!
+ subs $len,$len,#0x80
+ vst1.8 {@XMM[5]}, [$out]!
+ bhi .Lenc128_loop
+
+ vldmia sp!,{d8-d15}
+ ldmia sp!,{r4-r6,pc}
+.size bsaes_encrypt_128,.-bsaes_encrypt_128
+
+.globl bsaes_dec_key_convert
+.type bsaes_dec_key_convert,%function
+.align 4
+bsaes_dec_key_convert:
+ stmdb sp!,{r4-r6,lr}
+ vstmdb sp!,{d8-d15} @ ABI specification says so
+
+ ldr r5,[$inp,#240] @ pass rounds
+ mov r4,$inp @ pass key
+ mov r12,$out @ pass key schedule
+ bl _bsaes_key_convert
+ vldmia $out, {@XMM[6]}
+ vstmia r12, {@XMM[15]} @ save last round key
+ veor @XMM[7], @XMM[7], @XMM[6] @ fix up round 0 key
+ vstmia $out, {@XMM[7]}
+
+ vldmia sp!,{d8-d15}
+ ldmia sp!,{r4-r6,pc}
+.size bsaes_dec_key_convert,.-bsaes_dec_key_convert
+
+.globl bsaes_decrypt_128
+.type bsaes_decrypt_128,%function
+.align 4
+bsaes_decrypt_128:
+ stmdb sp!,{r4-r6,lr}
+ vstmdb sp!,{d8-d15} @ ABI specification says so
+.Ldec128_loop:
+ vld1.8 {@XMM[0]-@XMM[1]}, [$inp]! @ load input
+ vld1.8 {@XMM[2]-@XMM[3]}, [$inp]!
+ mov r4,$key @ pass the key
+ vld1.8 {@XMM[4]-@XMM[5]}, [$inp]!
+ mov r5,#10 @ pass rounds
+ vld1.8 {@XMM[6]-@XMM[7]}, [$inp]!
+
+ bl _bsaes_decrypt8
+
+ vst1.8 {@XMM[0]-@XMM[1]}, [$out]! @ write output
+ vst1.8 {@XMM[6]}, [$out]!
+ vst1.8 {@XMM[4]}, [$out]!
+ vst1.8 {@XMM[2]}, [$out]!
+ vst1.8 {@XMM[7]}, [$out]!
+ vst1.8 {@XMM[3]}, [$out]!
+ subs $len,$len,#0x80
+ vst1.8 {@XMM[5]}, [$out]!
+ bhi .Ldec128_loop
+
+ vldmia sp!,{d8-d15}
+ ldmia sp!,{r4-r6,pc}
+.size bsaes_decrypt_128,.-bsaes_decrypt_128
+___
+}
+{
+my ($inp,$out,$len,$key, $ivp,$fp,$rounds)=map("r$_",(0..3,8..10));
+my ($keysched)=("sp");
+
+$code.=<<___;
+.extern AES_cbc_encrypt
+.extern AES_decrypt
+
+.global bsaes_cbc_encrypt
+.type bsaes_cbc_encrypt,%function
+.align 5
+bsaes_cbc_encrypt:
+#ifndef __KERNEL__
+ cmp $len, #128
+#ifndef __thumb__
+ blo AES_cbc_encrypt
+#else
+ bhs 1f
+ b AES_cbc_encrypt
+1:
+#endif
+#endif
+
+ @ it is up to the caller to make sure we are called with enc == 0
+
+ mov ip, sp
+ stmdb sp!, {r4-r10, lr}
+ VFP_ABI_PUSH
+ ldr $ivp, [ip] @ IV is 1st arg on the stack
+ mov $len, $len, lsr#4 @ len in 16 byte blocks
+ sub sp, #0x10 @ scratch space to carry over the IV
+ mov $fp, sp @ save sp
+
+ ldr $rounds, [$key, #240] @ get # of rounds
+#ifndef BSAES_ASM_EXTENDED_KEY
+ @ allocate the key schedule on the stack
+ sub r12, sp, $rounds, lsl#7 @ 128 bytes per inner round key
+ add r12, #`128-32` @ sifze of bit-slices key schedule
+
+ @ populate the key schedule
+ mov r4, $key @ pass key
+ mov r5, $rounds @ pass # of rounds
+ mov sp, r12 @ sp is $keysched
+ bl _bsaes_key_convert
+ vldmia $keysched, {@XMM[6]}
+ vstmia r12, {@XMM[15]} @ save last round key
+ veor @XMM[7], @XMM[7], @XMM[6] @ fix up round 0 key
+ vstmia $keysched, {@XMM[7]}
+#else
+ ldr r12, [$key, #244]
+ eors r12, #1
+ beq 0f
+
+ @ populate the key schedule
+ str r12, [$key, #244]
+ mov r4, $key @ pass key
+ mov r5, $rounds @ pass # of rounds
+ add r12, $key, #248 @ pass key schedule
+ bl _bsaes_key_convert
+ add r4, $key, #248
+ vldmia r4, {@XMM[6]}
+ vstmia r12, {@XMM[15]} @ save last round key
+ veor @XMM[7], @XMM[7], @XMM[6] @ fix up round 0 key
+ vstmia r4, {@XMM[7]}
+
+.align 2
+0:
+#endif
+
+ vld1.8 {@XMM[15]}, [$ivp] @ load IV
+ b .Lcbc_dec_loop
+
+.align 4
+.Lcbc_dec_loop:
+ subs $len, $len, #0x8
+ bmi .Lcbc_dec_loop_finish
+
+ vld1.8 {@XMM[0]-@XMM[1]}, [$inp]! @ load input
+ vld1.8 {@XMM[2]-@XMM[3]}, [$inp]!
+#ifndef BSAES_ASM_EXTENDED_KEY
+ mov r4, $keysched @ pass the key
+#else
+ add r4, $key, #248
+#endif
+ vld1.8 {@XMM[4]-@XMM[5]}, [$inp]!
+ mov r5, $rounds
+ vld1.8 {@XMM[6]-@XMM[7]}, [$inp]
+ sub $inp, $inp, #0x60
+ vstmia $fp, {@XMM[15]} @ put aside IV
+
+ bl _bsaes_decrypt8
+
+ vldmia $fp, {@XMM[14]} @ reload IV
+ vld1.8 {@XMM[8]-@XMM[9]}, [$inp]! @ reload input
+ veor @XMM[0], @XMM[0], @XMM[14] @ ^= IV
+ vld1.8 {@XMM[10]-@XMM[11]}, [$inp]!
+ veor @XMM[1], @XMM[1], @XMM[8]
+ veor @XMM[6], @XMM[6], @XMM[9]
+ vld1.8 {@XMM[12]-@XMM[13]}, [$inp]!
+ veor @XMM[4], @XMM[4], @XMM[10]
+ veor @XMM[2], @XMM[2], @XMM[11]
+ vld1.8 {@XMM[14]-@XMM[15]}, [$inp]!
+ veor @XMM[7], @XMM[7], @XMM[12]
+ vst1.8 {@XMM[0]-@XMM[1]}, [$out]! @ write output
+ veor @XMM[3], @XMM[3], @XMM[13]
+ vst1.8 {@XMM[6]}, [$out]!
+ veor @XMM[5], @XMM[5], @XMM[14]
+ vst1.8 {@XMM[4]}, [$out]!
+ vst1.8 {@XMM[2]}, [$out]!
+ vst1.8 {@XMM[7]}, [$out]!
+ vst1.8 {@XMM[3]}, [$out]!
+ vst1.8 {@XMM[5]}, [$out]!
+
+ b .Lcbc_dec_loop
+
+.Lcbc_dec_loop_finish:
+ adds $len, $len, #8
+ beq .Lcbc_dec_done
+
+ vld1.8 {@XMM[0]}, [$inp]! @ load input
+ cmp $len, #2
+ blo .Lcbc_dec_one
+ vld1.8 {@XMM[1]}, [$inp]!
+#ifndef BSAES_ASM_EXTENDED_KEY
+ mov r4, $keysched @ pass the key
+#else
+ add r4, $key, #248
+#endif
+ mov r5, $rounds
+ vstmia $fp, {@XMM[15]} @ put aside IV
+ beq .Lcbc_dec_two
+ vld1.8 {@XMM[2]}, [$inp]!
+ cmp $len, #4
+ blo .Lcbc_dec_three
+ vld1.8 {@XMM[3]}, [$inp]!
+ beq .Lcbc_dec_four
+ vld1.8 {@XMM[4]}, [$inp]!
+ cmp $len, #6
+ blo .Lcbc_dec_five
+ vld1.8 {@XMM[5]}, [$inp]!
+ beq .Lcbc_dec_six
+ vld1.8 {@XMM[6]}, [$inp]!
+ sub $inp, $inp, #0x70
+
+ bl _bsaes_decrypt8
+
+ vldmia $fp, {@XMM[14]} @ reload IV
+ vld1.8 {@XMM[8]-@XMM[9]}, [$inp]! @ reload input
+ veor @XMM[0], @XMM[0], @XMM[14] @ ^= IV
+ vld1.8 {@XMM[10]-@XMM[11]}, [$inp]!
+ veor @XMM[1], @XMM[1], @XMM[8]
+ veor @XMM[6], @XMM[6], @XMM[9]
+ vld1.8 {@XMM[12]-@XMM[13]}, [$inp]!
+ veor @XMM[4], @XMM[4], @XMM[10]
+ veor @XMM[2], @XMM[2], @XMM[11]
+ vld1.8 {@XMM[15]}, [$inp]!
+ veor @XMM[7], @XMM[7], @XMM[12]
+ vst1.8 {@XMM[0]-@XMM[1]}, [$out]! @ write output
+ veor @XMM[3], @XMM[3], @XMM[13]
+ vst1.8 {@XMM[6]}, [$out]!
+ vst1.8 {@XMM[4]}, [$out]!
+ vst1.8 {@XMM[2]}, [$out]!
+ vst1.8 {@XMM[7]}, [$out]!
+ vst1.8 {@XMM[3]}, [$out]!
+ b .Lcbc_dec_done
+.align 4
+.Lcbc_dec_six:
+ sub $inp, $inp, #0x60
+ bl _bsaes_decrypt8
+ vldmia $fp,{@XMM[14]} @ reload IV
+ vld1.8 {@XMM[8]-@XMM[9]}, [$inp]! @ reload input
+ veor @XMM[0], @XMM[0], @XMM[14] @ ^= IV
+ vld1.8 {@XMM[10]-@XMM[11]}, [$inp]!
+ veor @XMM[1], @XMM[1], @XMM[8]
+ veor @XMM[6], @XMM[6], @XMM[9]
+ vld1.8 {@XMM[12]}, [$inp]!
+ veor @XMM[4], @XMM[4], @XMM[10]
+ veor @XMM[2], @XMM[2], @XMM[11]
+ vld1.8 {@XMM[15]}, [$inp]!
+ veor @XMM[7], @XMM[7], @XMM[12]
+ vst1.8 {@XMM[0]-@XMM[1]}, [$out]! @ write output
+ vst1.8 {@XMM[6]}, [$out]!
+ vst1.8 {@XMM[4]}, [$out]!
+ vst1.8 {@XMM[2]}, [$out]!
+ vst1.8 {@XMM[7]}, [$out]!
+ b .Lcbc_dec_done
+.align 4
+.Lcbc_dec_five:
+ sub $inp, $inp, #0x50
+ bl _bsaes_decrypt8
+ vldmia $fp, {@XMM[14]} @ reload IV
+ vld1.8 {@XMM[8]-@XMM[9]}, [$inp]! @ reload input
+ veor @XMM[0], @XMM[0], @XMM[14] @ ^= IV
+ vld1.8 {@XMM[10]-@XMM[11]}, [$inp]!
+ veor @XMM[1], @XMM[1], @XMM[8]
+ veor @XMM[6], @XMM[6], @XMM[9]
+ vld1.8 {@XMM[15]}, [$inp]!
+ veor @XMM[4], @XMM[4], @XMM[10]
+ vst1.8 {@XMM[0]-@XMM[1]}, [$out]! @ write output
+ veor @XMM[2], @XMM[2], @XMM[11]
+ vst1.8 {@XMM[6]}, [$out]!
+ vst1.8 {@XMM[4]}, [$out]!
+ vst1.8 {@XMM[2]}, [$out]!
+ b .Lcbc_dec_done
+.align 4
+.Lcbc_dec_four:
+ sub $inp, $inp, #0x40
+ bl _bsaes_decrypt8
+ vldmia $fp, {@XMM[14]} @ reload IV
+ vld1.8 {@XMM[8]-@XMM[9]}, [$inp]! @ reload input
+ veor @XMM[0], @XMM[0], @XMM[14] @ ^= IV
+ vld1.8 {@XMM[10]}, [$inp]!
+ veor @XMM[1], @XMM[1], @XMM[8]
+ veor @XMM[6], @XMM[6], @XMM[9]
+ vld1.8 {@XMM[15]}, [$inp]!
+ veor @XMM[4], @XMM[4], @XMM[10]
+ vst1.8 {@XMM[0]-@XMM[1]}, [$out]! @ write output
+ vst1.8 {@XMM[6]}, [$out]!
+ vst1.8 {@XMM[4]}, [$out]!
+ b .Lcbc_dec_done
+.align 4
+.Lcbc_dec_three:
+ sub $inp, $inp, #0x30
+ bl _bsaes_decrypt8
+ vldmia $fp, {@XMM[14]} @ reload IV
+ vld1.8 {@XMM[8]-@XMM[9]}, [$inp]! @ reload input
+ veor @XMM[0], @XMM[0], @XMM[14] @ ^= IV
+ vld1.8 {@XMM[15]}, [$inp]!
+ veor @XMM[1], @XMM[1], @XMM[8]
+ veor @XMM[6], @XMM[6], @XMM[9]
+ vst1.8 {@XMM[0]-@XMM[1]}, [$out]! @ write output
+ vst1.8 {@XMM[6]}, [$out]!
+ b .Lcbc_dec_done
+.align 4
+.Lcbc_dec_two:
+ sub $inp, $inp, #0x20
+ bl _bsaes_decrypt8
+ vldmia $fp, {@XMM[14]} @ reload IV
+ vld1.8 {@XMM[8]}, [$inp]! @ reload input
+ veor @XMM[0], @XMM[0], @XMM[14] @ ^= IV
+ vld1.8 {@XMM[15]}, [$inp]! @ reload input
+ veor @XMM[1], @XMM[1], @XMM[8]
+ vst1.8 {@XMM[0]-@XMM[1]}, [$out]! @ write output
+ b .Lcbc_dec_done
+.align 4
+.Lcbc_dec_one:
+ sub $inp, $inp, #0x10
+ mov $rounds, $out @ save original out pointer
+ mov $out, $fp @ use the iv scratch space as out buffer
+ mov r2, $key
+ vmov @XMM[4],@XMM[15] @ just in case ensure that IV
+ vmov @XMM[5],@XMM[0] @ and input are preserved
+ bl AES_decrypt
+ vld1.8 {@XMM[0]}, [$fp,:64] @ load result
+ veor @XMM[0], @XMM[0], @XMM[4] @ ^= IV
+ vmov @XMM[15], @XMM[5] @ @XMM[5] holds input
+ vst1.8 {@XMM[0]}, [$rounds] @ write output
+
+.Lcbc_dec_done:
+#ifndef BSAES_ASM_EXTENDED_KEY
+ vmov.i32 q0, #0
+ vmov.i32 q1, #0
+.Lcbc_dec_bzero: @ wipe key schedule [if any]
+ vstmia $keysched!, {q0-q1}
+ cmp $keysched, $fp
+ bne .Lcbc_dec_bzero
+#endif
+
+ mov sp, $fp
+ add sp, #0x10 @ add sp,$fp,#0x10 is no good for thumb
+ vst1.8 {@XMM[15]}, [$ivp] @ return IV
+ VFP_ABI_POP
+ ldmia sp!, {r4-r10, pc}
+.size bsaes_cbc_encrypt,.-bsaes_cbc_encrypt
+___
+}
+{
+my ($inp,$out,$len,$key, $ctr,$fp,$rounds)=(map("r$_",(0..3,8..10)));
+my $const = "r6"; # shared with _bsaes_encrypt8_alt
+my $keysched = "sp";
+
+$code.=<<___;
+.extern AES_encrypt
+.global bsaes_ctr32_encrypt_blocks
+.type bsaes_ctr32_encrypt_blocks,%function
+.align 5
+bsaes_ctr32_encrypt_blocks:
+ cmp $len, #8 @ use plain AES for
+ blo .Lctr_enc_short @ small sizes
+
+ mov ip, sp
+ stmdb sp!, {r4-r10, lr}
+ VFP_ABI_PUSH
+ ldr $ctr, [ip] @ ctr is 1st arg on the stack
+ sub sp, sp, #0x10 @ scratch space to carry over the ctr
+ mov $fp, sp @ save sp
+
+ ldr $rounds, [$key, #240] @ get # of rounds
+#ifndef BSAES_ASM_EXTENDED_KEY
+ @ allocate the key schedule on the stack
+ sub r12, sp, $rounds, lsl#7 @ 128 bytes per inner round key
+ add r12, #`128-32` @ size of bit-sliced key schedule
+
+ @ populate the key schedule
+ mov r4, $key @ pass key
+ mov r5, $rounds @ pass # of rounds
+ mov sp, r12 @ sp is $keysched
+ bl _bsaes_key_convert
+ veor @XMM[7],@XMM[7],@XMM[15] @ fix up last round key
+ vstmia r12, {@XMM[7]} @ save last round key
+
+ vld1.8 {@XMM[0]}, [$ctr] @ load counter
+ add $ctr, $const, #.LREVM0SR-.LM0 @ borrow $ctr
+ vldmia $keysched, {@XMM[4]} @ load round0 key
+#else
+ ldr r12, [$key, #244]
+ eors r12, #1
+ beq 0f
+
+ @ populate the key schedule
+ str r12, [$key, #244]
+ mov r4, $key @ pass key
+ mov r5, $rounds @ pass # of rounds
+ add r12, $key, #248 @ pass key schedule
+ bl _bsaes_key_convert
+ veor @XMM[7],@XMM[7],@XMM[15] @ fix up last round key
+ vstmia r12, {@XMM[7]} @ save last round key
+
+.align 2
+0: add r12, $key, #248
+ vld1.8 {@XMM[0]}, [$ctr] @ load counter
+ adrl $ctr, .LREVM0SR @ borrow $ctr
+ vldmia r12, {@XMM[4]} @ load round0 key
+ sub sp, #0x10 @ place for adjusted round0 key
+#endif
+
+ vmov.i32 @XMM[8],#1 @ compose 1<<96
+ veor @XMM[9],@XMM[9],@XMM[9]
+ vrev32.8 @XMM[0],@XMM[0]
+ vext.8 @XMM[8],@XMM[9],@XMM[8],#4
+ vrev32.8 @XMM[4],@XMM[4]
+ vadd.u32 @XMM[9],@XMM[8],@XMM[8] @ compose 2<<96
+ vstmia $keysched, {@XMM[4]} @ save adjusted round0 key
+ b .Lctr_enc_loop
+
+.align 4
+.Lctr_enc_loop:
+ vadd.u32 @XMM[10], @XMM[8], @XMM[9] @ compose 3<<96
+ vadd.u32 @XMM[1], @XMM[0], @XMM[8] @ +1
+ vadd.u32 @XMM[2], @XMM[0], @XMM[9] @ +2
+ vadd.u32 @XMM[3], @XMM[0], @XMM[10] @ +3
+ vadd.u32 @XMM[4], @XMM[1], @XMM[10]
+ vadd.u32 @XMM[5], @XMM[2], @XMM[10]
+ vadd.u32 @XMM[6], @XMM[3], @XMM[10]
+ vadd.u32 @XMM[7], @XMM[4], @XMM[10]
+ vadd.u32 @XMM[10], @XMM[5], @XMM[10] @ next counter
+
+ @ Borrow prologue from _bsaes_encrypt8 to use the opportunity
+ @ to flip byte order in 32-bit counter
+
+ vldmia $keysched, {@XMM[9]} @ load round0 key
+#ifndef BSAES_ASM_EXTENDED_KEY
+ add r4, $keysched, #0x10 @ pass next round key
+#else
+ add r4, $key, #`248+16`
+#endif
+ vldmia $ctr, {@XMM[8]} @ .LREVM0SR
+ mov r5, $rounds @ pass rounds
+ vstmia $fp, {@XMM[10]} @ save next counter
+ sub $const, $ctr, #.LREVM0SR-.LSR @ pass constants
+
+ bl _bsaes_encrypt8_alt
+
+ subs $len, $len, #8
+ blo .Lctr_enc_loop_done
+
+ vld1.8 {@XMM[8]-@XMM[9]}, [$inp]! @ load input
+ vld1.8 {@XMM[10]-@XMM[11]}, [$inp]!
+ veor @XMM[0], @XMM[8]
+ veor @XMM[1], @XMM[9]
+ vld1.8 {@XMM[12]-@XMM[13]}, [$inp]!
+ veor @XMM[4], @XMM[10]
+ veor @XMM[6], @XMM[11]
+ vld1.8 {@XMM[14]-@XMM[15]}, [$inp]!
+ veor @XMM[3], @XMM[12]
+ vst1.8 {@XMM[0]-@XMM[1]}, [$out]! @ write output
+ veor @XMM[7], @XMM[13]
+ veor @XMM[2], @XMM[14]
+ vst1.8 {@XMM[4]}, [$out]!
+ veor @XMM[5], @XMM[15]
+ vst1.8 {@XMM[6]}, [$out]!
+ vmov.i32 @XMM[8], #1 @ compose 1<<96
+ vst1.8 {@XMM[3]}, [$out]!
+ veor @XMM[9], @XMM[9], @XMM[9]
+ vst1.8 {@XMM[7]}, [$out]!
+ vext.8 @XMM[8], @XMM[9], @XMM[8], #4
+ vst1.8 {@XMM[2]}, [$out]!
+ vadd.u32 @XMM[9],@XMM[8],@XMM[8] @ compose 2<<96
+ vst1.8 {@XMM[5]}, [$out]!
+ vldmia $fp, {@XMM[0]} @ load counter
+
+ bne .Lctr_enc_loop
+ b .Lctr_enc_done
+
+.align 4
+.Lctr_enc_loop_done:
+ add $len, $len, #8
+ vld1.8 {@XMM[8]}, [$inp]! @ load input
+ veor @XMM[0], @XMM[8]
+ vst1.8 {@XMM[0]}, [$out]! @ write output
+ cmp $len, #2
+ blo .Lctr_enc_done
+ vld1.8 {@XMM[9]}, [$inp]!
+ veor @XMM[1], @XMM[9]
+ vst1.8 {@XMM[1]}, [$out]!
+ beq .Lctr_enc_done
+ vld1.8 {@XMM[10]}, [$inp]!
+ veor @XMM[4], @XMM[10]
+ vst1.8 {@XMM[4]}, [$out]!
+ cmp $len, #4
+ blo .Lctr_enc_done
+ vld1.8 {@XMM[11]}, [$inp]!
+ veor @XMM[6], @XMM[11]
+ vst1.8 {@XMM[6]}, [$out]!
+ beq .Lctr_enc_done
+ vld1.8 {@XMM[12]}, [$inp]!
+ veor @XMM[3], @XMM[12]
+ vst1.8 {@XMM[3]}, [$out]!
+ cmp $len, #6
+ blo .Lctr_enc_done
+ vld1.8 {@XMM[13]}, [$inp]!
+ veor @XMM[7], @XMM[13]
+ vst1.8 {@XMM[7]}, [$out]!
+ beq .Lctr_enc_done
+ vld1.8 {@XMM[14]}, [$inp]
+ veor @XMM[2], @XMM[14]
+ vst1.8 {@XMM[2]}, [$out]!
+
+.Lctr_enc_done:
+ vmov.i32 q0, #0
+ vmov.i32 q1, #0
+#ifndef BSAES_ASM_EXTENDED_KEY
+.Lctr_enc_bzero: @ wipe key schedule [if any]
+ vstmia $keysched!, {q0-q1}
+ cmp $keysched, $fp
+ bne .Lctr_enc_bzero
+#else
+ vstmia $keysched, {q0-q1}
+#endif
+
+ mov sp, $fp
+ add sp, #0x10 @ add sp,$fp,#0x10 is no good for thumb
+ VFP_ABI_POP
+ ldmia sp!, {r4-r10, pc} @ return
+
+.align 4
+.Lctr_enc_short:
+ ldr ip, [sp] @ ctr pointer is passed on stack
+ stmdb sp!, {r4-r8, lr}
+
+ mov r4, $inp @ copy arguments
+ mov r5, $out
+ mov r6, $len
+ mov r7, $key
+ ldr r8, [ip, #12] @ load counter LSW
+ vld1.8 {@XMM[1]}, [ip] @ load whole counter value
+#ifdef __ARMEL__
+ rev r8, r8
+#endif
+ sub sp, sp, #0x10
+ vst1.8 {@XMM[1]}, [sp,:64] @ copy counter value
+ sub sp, sp, #0x10
+
+.Lctr_enc_short_loop:
+ add r0, sp, #0x10 @ input counter value
+ mov r1, sp @ output on the stack
+ mov r2, r7 @ key
+
+ bl AES_encrypt
+
+ vld1.8 {@XMM[0]}, [r4]! @ load input
+ vld1.8 {@XMM[1]}, [sp,:64] @ load encrypted counter
+ add r8, r8, #1
+#ifdef __ARMEL__
+ rev r0, r8
+ str r0, [sp, #0x1c] @ next counter value
+#else
+ str r8, [sp, #0x1c] @ next counter value
+#endif
+ veor @XMM[0],@XMM[0],@XMM[1]
+ vst1.8 {@XMM[0]}, [r5]! @ store output
+ subs r6, r6, #1
+ bne .Lctr_enc_short_loop
+
+ vmov.i32 q0, #0
+ vmov.i32 q1, #0
+ vstmia sp!, {q0-q1}
+
+ ldmia sp!, {r4-r8, pc}
+.size bsaes_ctr32_encrypt_blocks,.-bsaes_ctr32_encrypt_blocks
+___
+}
+{
+######################################################################
+# void bsaes_xts_[en|de]crypt(const char *inp,char *out,size_t len,
+# const AES_KEY *key1, const AES_KEY *key2,
+# const unsigned char iv[16]);
+#
+my ($inp,$out,$len,$key,$rounds,$magic,$fp)=(map("r$_",(7..10,1..3)));
+my $const="r6"; # returned by _bsaes_key_convert
+my $twmask=@XMM[5];
+my @T=@XMM[6..7];
+
+$code.=<<___;
+.globl bsaes_xts_encrypt
+.type bsaes_xts_encrypt,%function
+.align 4
+bsaes_xts_encrypt:
+ mov ip, sp
+ stmdb sp!, {r4-r10, lr} @ 0x20
+ VFP_ABI_PUSH
+ mov r6, sp @ future $fp
+
+ mov $inp, r0
+ mov $out, r1
+ mov $len, r2
+ mov $key, r3
+
+ sub r0, sp, #0x10 @ 0x10
+ bic r0, #0xf @ align at 16 bytes
+ mov sp, r0
+
+#ifdef XTS_CHAIN_TWEAK
+ ldr r0, [ip] @ pointer to input tweak
+#else
+ @ generate initial tweak
+ ldr r0, [ip, #4] @ iv[]
+ mov r1, sp
+ ldr r2, [ip, #0] @ key2
+ bl AES_encrypt
+ mov r0,sp @ pointer to initial tweak
+#endif
+
+ ldr $rounds, [$key, #240] @ get # of rounds
+ mov $fp, r6
+#ifndef BSAES_ASM_EXTENDED_KEY
+ @ allocate the key schedule on the stack
+ sub r12, sp, $rounds, lsl#7 @ 128 bytes per inner round key
+ @ add r12, #`128-32` @ size of bit-sliced key schedule
+ sub r12, #`32+16` @ place for tweak[9]
+
+ @ populate the key schedule
+ mov r4, $key @ pass key
+ mov r5, $rounds @ pass # of rounds
+ mov sp, r12
+ add r12, #0x90 @ pass key schedule
+ bl _bsaes_key_convert
+ veor @XMM[7], @XMM[7], @XMM[15] @ fix up last round key
+ vstmia r12, {@XMM[7]} @ save last round key
+#else
+ ldr r12, [$key, #244]
+ eors r12, #1
+ beq 0f
+
+ str r12, [$key, #244]
+ mov r4, $key @ pass key
+ mov r5, $rounds @ pass # of rounds
+ add r12, $key, #248 @ pass key schedule
+ bl _bsaes_key_convert
+ veor @XMM[7], @XMM[7], @XMM[15] @ fix up last round key
+ vstmia r12, {@XMM[7]}
+
+.align 2
+0: sub sp, #0x90 @ place for tweak[9]
+#endif
+
+ vld1.8 {@XMM[8]}, [r0] @ initial tweak
+ adr $magic, .Lxts_magic
+
+ subs $len, #0x80
+ blo .Lxts_enc_short
+ b .Lxts_enc_loop
+
+.align 4
+.Lxts_enc_loop:
+ vldmia $magic, {$twmask} @ load XTS magic
+ vshr.s64 @T[0], @XMM[8], #63
+ mov r0, sp
+ vand @T[0], @T[0], $twmask
+___
+for($i=9;$i<16;$i++) {
+$code.=<<___;
+ vadd.u64 @XMM[$i], @XMM[$i-1], @XMM[$i-1]
+ vst1.64 {@XMM[$i-1]}, [r0,:128]!
+ vswp `&Dhi("@T[0]")`,`&Dlo("@T[0]")`
+ vshr.s64 @T[1], @XMM[$i], #63
+ veor @XMM[$i], @XMM[$i], @T[0]
+ vand @T[1], @T[1], $twmask
+___
+ @T=reverse(@T);
+
+$code.=<<___ if ($i>=10);
+ vld1.8 {@XMM[$i-10]}, [$inp]!
+___
+$code.=<<___ if ($i>=11);
+ veor @XMM[$i-11], @XMM[$i-11], @XMM[$i-3]
+___
+}
+$code.=<<___;
+ vadd.u64 @XMM[8], @XMM[15], @XMM[15]
+ vst1.64 {@XMM[15]}, [r0,:128]!
+ vswp `&Dhi("@T[0]")`,`&Dlo("@T[0]")`
+ veor @XMM[8], @XMM[8], @T[0]
+ vst1.64 {@XMM[8]}, [r0,:128] @ next round tweak
+
+ vld1.8 {@XMM[6]-@XMM[7]}, [$inp]!
+ veor @XMM[5], @XMM[5], @XMM[13]
+#ifndef BSAES_ASM_EXTENDED_KEY
+ add r4, sp, #0x90 @ pass key schedule
+#else
+ add r4, $key, #248 @ pass key schedule
+#endif
+ veor @XMM[6], @XMM[6], @XMM[14]
+ mov r5, $rounds @ pass rounds
+ veor @XMM[7], @XMM[7], @XMM[15]
+ mov r0, sp
+
+ bl _bsaes_encrypt8
+
+ vld1.64 {@XMM[ 8]-@XMM[ 9]}, [r0,:128]!
+ vld1.64 {@XMM[10]-@XMM[11]}, [r0,:128]!
+ veor @XMM[0], @XMM[0], @XMM[ 8]
+ vld1.64 {@XMM[12]-@XMM[13]}, [r0,:128]!
+ veor @XMM[1], @XMM[1], @XMM[ 9]
+ veor @XMM[8], @XMM[4], @XMM[10]
+ vst1.8 {@XMM[0]-@XMM[1]}, [$out]!
+ veor @XMM[9], @XMM[6], @XMM[11]
+ vld1.64 {@XMM[14]-@XMM[15]}, [r0,:128]!
+ veor @XMM[10], @XMM[3], @XMM[12]
+ vst1.8 {@XMM[8]-@XMM[9]}, [$out]!
+ veor @XMM[11], @XMM[7], @XMM[13]
+ veor @XMM[12], @XMM[2], @XMM[14]
+ vst1.8 {@XMM[10]-@XMM[11]}, [$out]!
+ veor @XMM[13], @XMM[5], @XMM[15]
+ vst1.8 {@XMM[12]-@XMM[13]}, [$out]!
+
+ vld1.64 {@XMM[8]}, [r0,:128] @ next round tweak
+
+ subs $len, #0x80
+ bpl .Lxts_enc_loop
+
+.Lxts_enc_short:
+ adds $len, #0x70
+ bmi .Lxts_enc_done
+
+ vldmia $magic, {$twmask} @ load XTS magic
+ vshr.s64 @T[0], @XMM[8], #63
+ mov r0, sp
+ vand @T[0], @T[0], $twmask
+___
+for($i=9;$i<16;$i++) {
+$code.=<<___;
+ vadd.u64 @XMM[$i], @XMM[$i-1], @XMM[$i-1]
+ vst1.64 {@XMM[$i-1]}, [r0,:128]!
+ vswp `&Dhi("@T[0]")`,`&Dlo("@T[0]")`
+ vshr.s64 @T[1], @XMM[$i], #63
+ veor @XMM[$i], @XMM[$i], @T[0]
+ vand @T[1], @T[1], $twmask
+___
+ @T=reverse(@T);
+
+$code.=<<___ if ($i>=10);
+ vld1.8 {@XMM[$i-10]}, [$inp]!
+ subs $len, #0x10
+ bmi .Lxts_enc_`$i-9`
+___
+$code.=<<___ if ($i>=11);
+ veor @XMM[$i-11], @XMM[$i-11], @XMM[$i-3]
+___
+}
+$code.=<<___;
+ sub $len, #0x10
+ vst1.64 {@XMM[15]}, [r0,:128] @ next round tweak
+
+ vld1.8 {@XMM[6]}, [$inp]!
+ veor @XMM[5], @XMM[5], @XMM[13]
+#ifndef BSAES_ASM_EXTENDED_KEY
+ add r4, sp, #0x90 @ pass key schedule
+#else
+ add r4, $key, #248 @ pass key schedule
+#endif
+ veor @XMM[6], @XMM[6], @XMM[14]
+ mov r5, $rounds @ pass rounds
+ mov r0, sp
+
+ bl _bsaes_encrypt8
+
+ vld1.64 {@XMM[ 8]-@XMM[ 9]}, [r0,:128]!
+ vld1.64 {@XMM[10]-@XMM[11]}, [r0,:128]!
+ veor @XMM[0], @XMM[0], @XMM[ 8]
+ vld1.64 {@XMM[12]-@XMM[13]}, [r0,:128]!
+ veor @XMM[1], @XMM[1], @XMM[ 9]
+ veor @XMM[8], @XMM[4], @XMM[10]
+ vst1.8 {@XMM[0]-@XMM[1]}, [$out]!
+ veor @XMM[9], @XMM[6], @XMM[11]
+ vld1.64 {@XMM[14]}, [r0,:128]!
+ veor @XMM[10], @XMM[3], @XMM[12]
+ vst1.8 {@XMM[8]-@XMM[9]}, [$out]!
+ veor @XMM[11], @XMM[7], @XMM[13]
+ veor @XMM[12], @XMM[2], @XMM[14]
+ vst1.8 {@XMM[10]-@XMM[11]}, [$out]!
+ vst1.8 {@XMM[12]}, [$out]!
+
+ vld1.64 {@XMM[8]}, [r0,:128] @ next round tweak
+ b .Lxts_enc_done
+.align 4
+.Lxts_enc_6:
+ vst1.64 {@XMM[14]}, [r0,:128] @ next round tweak
+
+ veor @XMM[4], @XMM[4], @XMM[12]
+#ifndef BSAES_ASM_EXTENDED_KEY
+ add r4, sp, #0x90 @ pass key schedule
+#else
+ add r4, $key, #248 @ pass key schedule
+#endif
+ veor @XMM[5], @XMM[5], @XMM[13]
+ mov r5, $rounds @ pass rounds
+ mov r0, sp
+
+ bl _bsaes_encrypt8
+
+ vld1.64 {@XMM[ 8]-@XMM[ 9]}, [r0,:128]!
+ vld1.64 {@XMM[10]-@XMM[11]}, [r0,:128]!
+ veor @XMM[0], @XMM[0], @XMM[ 8]
+ vld1.64 {@XMM[12]-@XMM[13]}, [r0,:128]!
+ veor @XMM[1], @XMM[1], @XMM[ 9]
+ veor @XMM[8], @XMM[4], @XMM[10]
+ vst1.8 {@XMM[0]-@XMM[1]}, [$out]!
+ veor @XMM[9], @XMM[6], @XMM[11]
+ veor @XMM[10], @XMM[3], @XMM[12]
+ vst1.8 {@XMM[8]-@XMM[9]}, [$out]!
+ veor @XMM[11], @XMM[7], @XMM[13]
+ vst1.8 {@XMM[10]-@XMM[11]}, [$out]!
+
+ vld1.64 {@XMM[8]}, [r0,:128] @ next round tweak
+ b .Lxts_enc_done
+
+@ put this in range for both ARM and Thumb mode adr instructions
+.align 5
+.Lxts_magic:
+ .quad 1, 0x87
+
+.align 5
+.Lxts_enc_5:
+ vst1.64 {@XMM[13]}, [r0,:128] @ next round tweak
+
+ veor @XMM[3], @XMM[3], @XMM[11]
+#ifndef BSAES_ASM_EXTENDED_KEY
+ add r4, sp, #0x90 @ pass key schedule
+#else
+ add r4, $key, #248 @ pass key schedule
+#endif
+ veor @XMM[4], @XMM[4], @XMM[12]
+ mov r5, $rounds @ pass rounds
+ mov r0, sp
+
+ bl _bsaes_encrypt8
+
+ vld1.64 {@XMM[ 8]-@XMM[ 9]}, [r0,:128]!
+ vld1.64 {@XMM[10]-@XMM[11]}, [r0,:128]!
+ veor @XMM[0], @XMM[0], @XMM[ 8]
+ vld1.64 {@XMM[12]}, [r0,:128]!
+ veor @XMM[1], @XMM[1], @XMM[ 9]
+ veor @XMM[8], @XMM[4], @XMM[10]
+ vst1.8 {@XMM[0]-@XMM[1]}, [$out]!
+ veor @XMM[9], @XMM[6], @XMM[11]
+ veor @XMM[10], @XMM[3], @XMM[12]
+ vst1.8 {@XMM[8]-@XMM[9]}, [$out]!
+ vst1.8 {@XMM[10]}, [$out]!
+
+ vld1.64 {@XMM[8]}, [r0,:128] @ next round tweak
+ b .Lxts_enc_done
+.align 4
+.Lxts_enc_4:
+ vst1.64 {@XMM[12]}, [r0,:128] @ next round tweak
+
+ veor @XMM[2], @XMM[2], @XMM[10]
+#ifndef BSAES_ASM_EXTENDED_KEY
+ add r4, sp, #0x90 @ pass key schedule
+#else
+ add r4, $key, #248 @ pass key schedule
+#endif
+ veor @XMM[3], @XMM[3], @XMM[11]
+ mov r5, $rounds @ pass rounds
+ mov r0, sp
+
+ bl _bsaes_encrypt8
+
+ vld1.64 {@XMM[ 8]-@XMM[ 9]}, [r0,:128]!
+ vld1.64 {@XMM[10]-@XMM[11]}, [r0,:128]!
+ veor @XMM[0], @XMM[0], @XMM[ 8]
+ veor @XMM[1], @XMM[1], @XMM[ 9]
+ veor @XMM[8], @XMM[4], @XMM[10]
+ vst1.8 {@XMM[0]-@XMM[1]}, [$out]!
+ veor @XMM[9], @XMM[6], @XMM[11]
+ vst1.8 {@XMM[8]-@XMM[9]}, [$out]!
+
+ vld1.64 {@XMM[8]}, [r0,:128] @ next round tweak
+ b .Lxts_enc_done
+.align 4
+.Lxts_enc_3:
+ vst1.64 {@XMM[11]}, [r0,:128] @ next round tweak
+
+ veor @XMM[1], @XMM[1], @XMM[9]
+#ifndef BSAES_ASM_EXTENDED_KEY
+ add r4, sp, #0x90 @ pass key schedule
+#else
+ add r4, $key, #248 @ pass key schedule
+#endif
+ veor @XMM[2], @XMM[2], @XMM[10]
+ mov r5, $rounds @ pass rounds
+ mov r0, sp
+
+ bl _bsaes_encrypt8
+
+ vld1.64 {@XMM[8]-@XMM[9]}, [r0,:128]!
+ vld1.64 {@XMM[10]}, [r0,:128]!
+ veor @XMM[0], @XMM[0], @XMM[ 8]
+ veor @XMM[1], @XMM[1], @XMM[ 9]
+ veor @XMM[8], @XMM[4], @XMM[10]
+ vst1.8 {@XMM[0]-@XMM[1]}, [$out]!
+ vst1.8 {@XMM[8]}, [$out]!
+
+ vld1.64 {@XMM[8]}, [r0,:128] @ next round tweak
+ b .Lxts_enc_done
+.align 4
+.Lxts_enc_2:
+ vst1.64 {@XMM[10]}, [r0,:128] @ next round tweak
+
+ veor @XMM[0], @XMM[0], @XMM[8]
+#ifndef BSAES_ASM_EXTENDED_KEY
+ add r4, sp, #0x90 @ pass key schedule
+#else
+ add r4, $key, #248 @ pass key schedule
+#endif
+ veor @XMM[1], @XMM[1], @XMM[9]
+ mov r5, $rounds @ pass rounds
+ mov r0, sp
+
+ bl _bsaes_encrypt8
+
+ vld1.64 {@XMM[8]-@XMM[9]}, [r0,:128]!
+ veor @XMM[0], @XMM[0], @XMM[ 8]
+ veor @XMM[1], @XMM[1], @XMM[ 9]
+ vst1.8 {@XMM[0]-@XMM[1]}, [$out]!
+
+ vld1.64 {@XMM[8]}, [r0,:128] @ next round tweak
+ b .Lxts_enc_done
+.align 4
+.Lxts_enc_1:
+ mov r0, sp
+ veor @XMM[0], @XMM[8]
+ mov r1, sp
+ vst1.8 {@XMM[0]}, [sp,:128]
+ mov r2, $key
+ mov r4, $fp @ preserve fp
+
+ bl AES_encrypt
+
+ vld1.8 {@XMM[0]}, [sp,:128]
+ veor @XMM[0], @XMM[0], @XMM[8]
+ vst1.8 {@XMM[0]}, [$out]!
+ mov $fp, r4
+
+ vmov @XMM[8], @XMM[9] @ next round tweak
+
+.Lxts_enc_done:
+#ifndef XTS_CHAIN_TWEAK
+ adds $len, #0x10
+ beq .Lxts_enc_ret
+ sub r6, $out, #0x10
+
+.Lxts_enc_steal:
+ ldrb r0, [$inp], #1
+ ldrb r1, [$out, #-0x10]
+ strb r0, [$out, #-0x10]
+ strb r1, [$out], #1
+
+ subs $len, #1
+ bhi .Lxts_enc_steal
+
+ vld1.8 {@XMM[0]}, [r6]
+ mov r0, sp
+ veor @XMM[0], @XMM[0], @XMM[8]
+ mov r1, sp
+ vst1.8 {@XMM[0]}, [sp,:128]
+ mov r2, $key
+ mov r4, $fp @ preserve fp
+
+ bl AES_encrypt
+
+ vld1.8 {@XMM[0]}, [sp,:128]
+ veor @XMM[0], @XMM[0], @XMM[8]
+ vst1.8 {@XMM[0]}, [r6]
+ mov $fp, r4
+#endif
+
+.Lxts_enc_ret:
+ bic r0, $fp, #0xf
+ vmov.i32 q0, #0
+ vmov.i32 q1, #0
+#ifdef XTS_CHAIN_TWEAK
+ ldr r1, [$fp, #0x20+VFP_ABI_FRAME] @ chain tweak
+#endif
+.Lxts_enc_bzero: @ wipe key schedule [if any]
+ vstmia sp!, {q0-q1}
+ cmp sp, r0
+ bne .Lxts_enc_bzero
+
+ mov sp, $fp
+#ifdef XTS_CHAIN_TWEAK
+ vst1.8 {@XMM[8]}, [r1]
+#endif
+ VFP_ABI_POP
+ ldmia sp!, {r4-r10, pc} @ return
+
+.size bsaes_xts_encrypt,.-bsaes_xts_encrypt
+
+.globl bsaes_xts_decrypt
+.type bsaes_xts_decrypt,%function
+.align 4
+bsaes_xts_decrypt:
+ mov ip, sp
+ stmdb sp!, {r4-r10, lr} @ 0x20
+ VFP_ABI_PUSH
+ mov r6, sp @ future $fp
+
+ mov $inp, r0
+ mov $out, r1
+ mov $len, r2
+ mov $key, r3
+
+ sub r0, sp, #0x10 @ 0x10
+ bic r0, #0xf @ align at 16 bytes
+ mov sp, r0
+
+#ifdef XTS_CHAIN_TWEAK
+ ldr r0, [ip] @ pointer to input tweak
+#else
+ @ generate initial tweak
+ ldr r0, [ip, #4] @ iv[]
+ mov r1, sp
+ ldr r2, [ip, #0] @ key2
+ bl AES_encrypt
+ mov r0, sp @ pointer to initial tweak
+#endif
+
+ ldr $rounds, [$key, #240] @ get # of rounds
+ mov $fp, r6
+#ifndef BSAES_ASM_EXTENDED_KEY
+ @ allocate the key schedule on the stack
+ sub r12, sp, $rounds, lsl#7 @ 128 bytes per inner round key
+ @ add r12, #`128-32` @ size of bit-sliced key schedule
+ sub r12, #`32+16` @ place for tweak[9]
+
+ @ populate the key schedule
+ mov r4, $key @ pass key
+ mov r5, $rounds @ pass # of rounds
+ mov sp, r12
+ add r12, #0x90 @ pass key schedule
+ bl _bsaes_key_convert
+ add r4, sp, #0x90
+ vldmia r4, {@XMM[6]}
+ vstmia r12, {@XMM[15]} @ save last round key
+ veor @XMM[7], @XMM[7], @XMM[6] @ fix up round 0 key
+ vstmia r4, {@XMM[7]}
+#else
+ ldr r12, [$key, #244]
+ eors r12, #1
+ beq 0f
+
+ str r12, [$key, #244]
+ mov r4, $key @ pass key
+ mov r5, $rounds @ pass # of rounds
+ add r12, $key, #248 @ pass key schedule
+ bl _bsaes_key_convert
+ add r4, $key, #248
+ vldmia r4, {@XMM[6]}
+ vstmia r12, {@XMM[15]} @ save last round key
+ veor @XMM[7], @XMM[7], @XMM[6] @ fix up round 0 key
+ vstmia r4, {@XMM[7]}
+
+.align 2
+0: sub sp, #0x90 @ place for tweak[9]
+#endif
+ vld1.8 {@XMM[8]}, [r0] @ initial tweak
+ adr $magic, .Lxts_magic
+
+ tst $len, #0xf @ if not multiple of 16
+ it ne @ Thumb2 thing, sanity check in ARM
+ subne $len, #0x10 @ subtract another 16 bytes
+ subs $len, #0x80
+
+ blo .Lxts_dec_short
+ b .Lxts_dec_loop
+
+.align 4
+.Lxts_dec_loop:
+ vldmia $magic, {$twmask} @ load XTS magic
+ vshr.s64 @T[0], @XMM[8], #63
+ mov r0, sp
+ vand @T[0], @T[0], $twmask
+___
+for($i=9;$i<16;$i++) {
+$code.=<<___;
+ vadd.u64 @XMM[$i], @XMM[$i-1], @XMM[$i-1]
+ vst1.64 {@XMM[$i-1]}, [r0,:128]!
+ vswp `&Dhi("@T[0]")`,`&Dlo("@T[0]")`
+ vshr.s64 @T[1], @XMM[$i], #63
+ veor @XMM[$i], @XMM[$i], @T[0]
+ vand @T[1], @T[1], $twmask
+___
+ @T=reverse(@T);
+
+$code.=<<___ if ($i>=10);
+ vld1.8 {@XMM[$i-10]}, [$inp]!
+___
+$code.=<<___ if ($i>=11);
+ veor @XMM[$i-11], @XMM[$i-11], @XMM[$i-3]
+___
+}
+$code.=<<___;
+ vadd.u64 @XMM[8], @XMM[15], @XMM[15]
+ vst1.64 {@XMM[15]}, [r0,:128]!
+ vswp `&Dhi("@T[0]")`,`&Dlo("@T[0]")`
+ veor @XMM[8], @XMM[8], @T[0]
+ vst1.64 {@XMM[8]}, [r0,:128] @ next round tweak
+
+ vld1.8 {@XMM[6]-@XMM[7]}, [$inp]!
+ veor @XMM[5], @XMM[5], @XMM[13]
+#ifndef BSAES_ASM_EXTENDED_KEY
+ add r4, sp, #0x90 @ pass key schedule
+#else
+ add r4, $key, #248 @ pass key schedule
+#endif
+ veor @XMM[6], @XMM[6], @XMM[14]
+ mov r5, $rounds @ pass rounds
+ veor @XMM[7], @XMM[7], @XMM[15]
+ mov r0, sp
+
+ bl _bsaes_decrypt8
+
+ vld1.64 {@XMM[ 8]-@XMM[ 9]}, [r0,:128]!
+ vld1.64 {@XMM[10]-@XMM[11]}, [r0,:128]!
+ veor @XMM[0], @XMM[0], @XMM[ 8]
+ vld1.64 {@XMM[12]-@XMM[13]}, [r0,:128]!
+ veor @XMM[1], @XMM[1], @XMM[ 9]
+ veor @XMM[8], @XMM[6], @XMM[10]
+ vst1.8 {@XMM[0]-@XMM[1]}, [$out]!
+ veor @XMM[9], @XMM[4], @XMM[11]
+ vld1.64 {@XMM[14]-@XMM[15]}, [r0,:128]!
+ veor @XMM[10], @XMM[2], @XMM[12]
+ vst1.8 {@XMM[8]-@XMM[9]}, [$out]!
+ veor @XMM[11], @XMM[7], @XMM[13]
+ veor @XMM[12], @XMM[3], @XMM[14]
+ vst1.8 {@XMM[10]-@XMM[11]}, [$out]!
+ veor @XMM[13], @XMM[5], @XMM[15]
+ vst1.8 {@XMM[12]-@XMM[13]}, [$out]!
+
+ vld1.64 {@XMM[8]}, [r0,:128] @ next round tweak
+
+ subs $len, #0x80
+ bpl .Lxts_dec_loop
+
+.Lxts_dec_short:
+ adds $len, #0x70
+ bmi .Lxts_dec_done
+
+ vldmia $magic, {$twmask} @ load XTS magic
+ vshr.s64 @T[0], @XMM[8], #63
+ mov r0, sp
+ vand @T[0], @T[0], $twmask
+___
+for($i=9;$i<16;$i++) {
+$code.=<<___;
+ vadd.u64 @XMM[$i], @XMM[$i-1], @XMM[$i-1]
+ vst1.64 {@XMM[$i-1]}, [r0,:128]!
+ vswp `&Dhi("@T[0]")`,`&Dlo("@T[0]")`
+ vshr.s64 @T[1], @XMM[$i], #63
+ veor @XMM[$i], @XMM[$i], @T[0]
+ vand @T[1], @T[1], $twmask
+___
+ @T=reverse(@T);
+
+$code.=<<___ if ($i>=10);
+ vld1.8 {@XMM[$i-10]}, [$inp]!
+ subs $len, #0x10
+ bmi .Lxts_dec_`$i-9`
+___
+$code.=<<___ if ($i>=11);
+ veor @XMM[$i-11], @XMM[$i-11], @XMM[$i-3]
+___
+}
+$code.=<<___;
+ sub $len, #0x10
+ vst1.64 {@XMM[15]}, [r0,:128] @ next round tweak
+
+ vld1.8 {@XMM[6]}, [$inp]!
+ veor @XMM[5], @XMM[5], @XMM[13]
+#ifndef BSAES_ASM_EXTENDED_KEY
+ add r4, sp, #0x90 @ pass key schedule
+#else
+ add r4, $key, #248 @ pass key schedule
+#endif
+ veor @XMM[6], @XMM[6], @XMM[14]
+ mov r5, $rounds @ pass rounds
+ mov r0, sp
+
+ bl _bsaes_decrypt8
+
+ vld1.64 {@XMM[ 8]-@XMM[ 9]}, [r0,:128]!
+ vld1.64 {@XMM[10]-@XMM[11]}, [r0,:128]!
+ veor @XMM[0], @XMM[0], @XMM[ 8]
+ vld1.64 {@XMM[12]-@XMM[13]}, [r0,:128]!
+ veor @XMM[1], @XMM[1], @XMM[ 9]
+ veor @XMM[8], @XMM[6], @XMM[10]
+ vst1.8 {@XMM[0]-@XMM[1]}, [$out]!
+ veor @XMM[9], @XMM[4], @XMM[11]
+ vld1.64 {@XMM[14]}, [r0,:128]!
+ veor @XMM[10], @XMM[2], @XMM[12]
+ vst1.8 {@XMM[8]-@XMM[9]}, [$out]!
+ veor @XMM[11], @XMM[7], @XMM[13]
+ veor @XMM[12], @XMM[3], @XMM[14]
+ vst1.8 {@XMM[10]-@XMM[11]}, [$out]!
+ vst1.8 {@XMM[12]}, [$out]!
+
+ vld1.64 {@XMM[8]}, [r0,:128] @ next round tweak
+ b .Lxts_dec_done
+.align 4
+.Lxts_dec_6:
+ vst1.64 {@XMM[14]}, [r0,:128] @ next round tweak
+
+ veor @XMM[4], @XMM[4], @XMM[12]
+#ifndef BSAES_ASM_EXTENDED_KEY
+ add r4, sp, #0x90 @ pass key schedule
+#else
+ add r4, $key, #248 @ pass key schedule
+#endif
+ veor @XMM[5], @XMM[5], @XMM[13]
+ mov r5, $rounds @ pass rounds
+ mov r0, sp
+
+ bl _bsaes_decrypt8
+
+ vld1.64 {@XMM[ 8]-@XMM[ 9]}, [r0,:128]!
+ vld1.64 {@XMM[10]-@XMM[11]}, [r0,:128]!
+ veor @XMM[0], @XMM[0], @XMM[ 8]
+ vld1.64 {@XMM[12]-@XMM[13]}, [r0,:128]!
+ veor @XMM[1], @XMM[1], @XMM[ 9]
+ veor @XMM[8], @XMM[6], @XMM[10]
+ vst1.8 {@XMM[0]-@XMM[1]}, [$out]!
+ veor @XMM[9], @XMM[4], @XMM[11]
+ veor @XMM[10], @XMM[2], @XMM[12]
+ vst1.8 {@XMM[8]-@XMM[9]}, [$out]!
+ veor @XMM[11], @XMM[7], @XMM[13]
+ vst1.8 {@XMM[10]-@XMM[11]}, [$out]!
+
+ vld1.64 {@XMM[8]}, [r0,:128] @ next round tweak
+ b .Lxts_dec_done
+.align 4
+.Lxts_dec_5:
+ vst1.64 {@XMM[13]}, [r0,:128] @ next round tweak
+
+ veor @XMM[3], @XMM[3], @XMM[11]
+#ifndef BSAES_ASM_EXTENDED_KEY
+ add r4, sp, #0x90 @ pass key schedule
+#else
+ add r4, $key, #248 @ pass key schedule
+#endif
+ veor @XMM[4], @XMM[4], @XMM[12]
+ mov r5, $rounds @ pass rounds
+ mov r0, sp
+
+ bl _bsaes_decrypt8
+
+ vld1.64 {@XMM[ 8]-@XMM[ 9]}, [r0,:128]!
+ vld1.64 {@XMM[10]-@XMM[11]}, [r0,:128]!
+ veor @XMM[0], @XMM[0], @XMM[ 8]
+ vld1.64 {@XMM[12]}, [r0,:128]!
+ veor @XMM[1], @XMM[1], @XMM[ 9]
+ veor @XMM[8], @XMM[6], @XMM[10]
+ vst1.8 {@XMM[0]-@XMM[1]}, [$out]!
+ veor @XMM[9], @XMM[4], @XMM[11]
+ veor @XMM[10], @XMM[2], @XMM[12]
+ vst1.8 {@XMM[8]-@XMM[9]}, [$out]!
+ vst1.8 {@XMM[10]}, [$out]!
+
+ vld1.64 {@XMM[8]}, [r0,:128] @ next round tweak
+ b .Lxts_dec_done
+.align 4
+.Lxts_dec_4:
+ vst1.64 {@XMM[12]}, [r0,:128] @ next round tweak
+
+ veor @XMM[2], @XMM[2], @XMM[10]
+#ifndef BSAES_ASM_EXTENDED_KEY
+ add r4, sp, #0x90 @ pass key schedule
+#else
+ add r4, $key, #248 @ pass key schedule
+#endif
+ veor @XMM[3], @XMM[3], @XMM[11]
+ mov r5, $rounds @ pass rounds
+ mov r0, sp
+
+ bl _bsaes_decrypt8
+
+ vld1.64 {@XMM[ 8]-@XMM[ 9]}, [r0,:128]!
+ vld1.64 {@XMM[10]-@XMM[11]}, [r0,:128]!
+ veor @XMM[0], @XMM[0], @XMM[ 8]
+ veor @XMM[1], @XMM[1], @XMM[ 9]
+ veor @XMM[8], @XMM[6], @XMM[10]
+ vst1.8 {@XMM[0]-@XMM[1]}, [$out]!
+ veor @XMM[9], @XMM[4], @XMM[11]
+ vst1.8 {@XMM[8]-@XMM[9]}, [$out]!
+
+ vld1.64 {@XMM[8]}, [r0,:128] @ next round tweak
+ b .Lxts_dec_done
+.align 4
+.Lxts_dec_3:
+ vst1.64 {@XMM[11]}, [r0,:128] @ next round tweak
+
+ veor @XMM[1], @XMM[1], @XMM[9]
+#ifndef BSAES_ASM_EXTENDED_KEY
+ add r4, sp, #0x90 @ pass key schedule
+#else
+ add r4, $key, #248 @ pass key schedule
+#endif
+ veor @XMM[2], @XMM[2], @XMM[10]
+ mov r5, $rounds @ pass rounds
+ mov r0, sp
+
+ bl _bsaes_decrypt8
+
+ vld1.64 {@XMM[8]-@XMM[9]}, [r0,:128]!
+ vld1.64 {@XMM[10]}, [r0,:128]!
+ veor @XMM[0], @XMM[0], @XMM[ 8]
+ veor @XMM[1], @XMM[1], @XMM[ 9]
+ veor @XMM[8], @XMM[6], @XMM[10]
+ vst1.8 {@XMM[0]-@XMM[1]}, [$out]!
+ vst1.8 {@XMM[8]}, [$out]!
+
+ vld1.64 {@XMM[8]}, [r0,:128] @ next round tweak
+ b .Lxts_dec_done
+.align 4
+.Lxts_dec_2:
+ vst1.64 {@XMM[10]}, [r0,:128] @ next round tweak
+
+ veor @XMM[0], @XMM[0], @XMM[8]
+#ifndef BSAES_ASM_EXTENDED_KEY
+ add r4, sp, #0x90 @ pass key schedule
+#else
+ add r4, $key, #248 @ pass key schedule
+#endif
+ veor @XMM[1], @XMM[1], @XMM[9]
+ mov r5, $rounds @ pass rounds
+ mov r0, sp
+
+ bl _bsaes_decrypt8
+
+ vld1.64 {@XMM[8]-@XMM[9]}, [r0,:128]!
+ veor @XMM[0], @XMM[0], @XMM[ 8]
+ veor @XMM[1], @XMM[1], @XMM[ 9]
+ vst1.8 {@XMM[0]-@XMM[1]}, [$out]!
+
+ vld1.64 {@XMM[8]}, [r0,:128] @ next round tweak
+ b .Lxts_dec_done
+.align 4
+.Lxts_dec_1:
+ mov r0, sp
+ veor @XMM[0], @XMM[8]
+ mov r1, sp
+ vst1.8 {@XMM[0]}, [sp,:128]
+ mov r2, $key
+ mov r4, $fp @ preserve fp
+ mov r5, $magic @ preserve magic
+
+ bl AES_decrypt
+
+ vld1.8 {@XMM[0]}, [sp,:128]
+ veor @XMM[0], @XMM[0], @XMM[8]
+ vst1.8 {@XMM[0]}, [$out]!
+ mov $fp, r4
+ mov $magic, r5
+
+ vmov @XMM[8], @XMM[9] @ next round tweak
+
+.Lxts_dec_done:
+#ifndef XTS_CHAIN_TWEAK
+ adds $len, #0x10
+ beq .Lxts_dec_ret
+
+ @ calculate one round of extra tweak for the stolen ciphertext
+ vldmia $magic, {$twmask}
+ vshr.s64 @XMM[6], @XMM[8], #63
+ vand @XMM[6], @XMM[6], $twmask
+ vadd.u64 @XMM[9], @XMM[8], @XMM[8]
+ vswp `&Dhi("@XMM[6]")`,`&Dlo("@XMM[6]")`
+ veor @XMM[9], @XMM[9], @XMM[6]
+
+ @ perform the final decryption with the last tweak value
+ vld1.8 {@XMM[0]}, [$inp]!
+ mov r0, sp
+ veor @XMM[0], @XMM[0], @XMM[9]
+ mov r1, sp
+ vst1.8 {@XMM[0]}, [sp,:128]
+ mov r2, $key
+ mov r4, $fp @ preserve fp
+
+ bl AES_decrypt
+
+ vld1.8 {@XMM[0]}, [sp,:128]
+ veor @XMM[0], @XMM[0], @XMM[9]
+ vst1.8 {@XMM[0]}, [$out]
+
+ mov r6, $out
+.Lxts_dec_steal:
+ ldrb r1, [$out]
+ ldrb r0, [$inp], #1
+ strb r1, [$out, #0x10]
+ strb r0, [$out], #1
+
+ subs $len, #1
+ bhi .Lxts_dec_steal
+
+ vld1.8 {@XMM[0]}, [r6]
+ mov r0, sp
+ veor @XMM[0], @XMM[8]
+ mov r1, sp
+ vst1.8 {@XMM[0]}, [sp,:128]
+ mov r2, $key
+
+ bl AES_decrypt
+
+ vld1.8 {@XMM[0]}, [sp,:128]
+ veor @XMM[0], @XMM[0], @XMM[8]
+ vst1.8 {@XMM[0]}, [r6]
+ mov $fp, r4
+#endif
+
+.Lxts_dec_ret:
+ bic r0, $fp, #0xf
+ vmov.i32 q0, #0
+ vmov.i32 q1, #0
+#ifdef XTS_CHAIN_TWEAK
+ ldr r1, [$fp, #0x20+VFP_ABI_FRAME] @ chain tweak
+#endif
+.Lxts_dec_bzero: @ wipe key schedule [if any]
+ vstmia sp!, {q0-q1}
+ cmp sp, r0
+ bne .Lxts_dec_bzero
+
+ mov sp, $fp
+#ifdef XTS_CHAIN_TWEAK
+ vst1.8 {@XMM[8]}, [r1]
+#endif
+ VFP_ABI_POP
+ ldmia sp!, {r4-r10, pc} @ return
+
+.size bsaes_xts_decrypt,.-bsaes_xts_decrypt
+___
+}
+$code.=<<___;
+#endif
+___
+
+$code =~ s/\`([^\`]*)\`/eval($1)/gem;
+
+open SELF,$0;
+while(<SELF>) {
+ next if (/^#!/);
+ last if (!s/^#/@/ and !/^$/);
+ print;
+}
+close SELF;
+
+print $code;
+
+close STDOUT;
generic-y += serial.h
generic-y += shmbuf.h
generic-y += siginfo.h
+generic-y += simd.h
generic-y += sizes.h
generic-y += socket.h
generic-y += sockios.h
#define __ASM_ARM_ATOMIC_H
#include <linux/compiler.h>
+#include <linux/prefetch.h>
#include <linux/types.h>
#include <linux/irqflags.h>
#include <asm/barrier.h>
unsigned long tmp;
int result;
+ prefetchw(&v->counter);
__asm__ __volatile__("@ atomic_add\n"
"1: ldrex %0, [%3]\n"
" add %0, %0, %4\n"
unsigned long tmp;
int result;
+ prefetchw(&v->counter);
__asm__ __volatile__("@ atomic_sub\n"
"1: ldrex %0, [%3]\n"
" sub %0, %0, %4\n"
{
unsigned long tmp, tmp2;
+ prefetchw(addr);
__asm__ __volatile__("@ atomic_clear_mask\n"
"1: ldrex %0, [%3]\n"
" bic %0, %0, %4\n"
{
u64 tmp;
+ prefetchw(&v->counter);
__asm__ __volatile__("@ atomic64_set\n"
"1: ldrexd %0, %H0, [%2]\n"
" strexd %0, %3, %H3, [%2]\n"
u64 result;
unsigned long tmp;
+ prefetchw(&v->counter);
__asm__ __volatile__("@ atomic64_add\n"
"1: ldrexd %0, %H0, [%3]\n"
" adds %0, %0, %4\n"
u64 result;
unsigned long tmp;
+ prefetchw(&v->counter);
__asm__ __volatile__("@ atomic64_sub\n"
"1: ldrexd %0, %H0, [%3]\n"
" subs %0, %0, %4\n"
--- /dev/null
+/*
+ * arch/arm/include/asm/bL_switcher.h
+ *
+ * Created by: Nicolas Pitre, April 2012
+ * Copyright: (C) 2012-2013 Linaro Limited
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#ifndef ASM_BL_SWITCHER_H
+#define ASM_BL_SWITCHER_H
+
+#include <linux/compiler.h>
+#include <linux/types.h>
+
+typedef void (*bL_switch_completion_handler)(void *cookie);
+
+int bL_switch_request_cb(unsigned int cpu, unsigned int new_cluster_id,
+ bL_switch_completion_handler completer,
+ void *completer_cookie);
+static inline int bL_switch_request(unsigned int cpu, unsigned int new_cluster_id)
+{
+ return bL_switch_request_cb(cpu, new_cluster_id, NULL, NULL);
+}
+
+/*
+ * Register here to be notified about runtime enabling/disabling of
+ * the switcher.
+ *
+ * The notifier chain is called with the switcher activation lock held:
+ * the switcher will not be enabled or disabled during callbacks.
+ * Callbacks must not call bL_switcher_{get,put}_enabled().
+ */
+#define BL_NOTIFY_PRE_ENABLE 0
+#define BL_NOTIFY_POST_ENABLE 1
+#define BL_NOTIFY_PRE_DISABLE 2
+#define BL_NOTIFY_POST_DISABLE 3
+
+#ifdef CONFIG_BL_SWITCHER
+
+int bL_switcher_register_notifier(struct notifier_block *nb);
+int bL_switcher_unregister_notifier(struct notifier_block *nb);
+
+/*
+ * Use these functions to temporarily prevent enabling/disabling of
+ * the switcher.
+ * bL_switcher_get_enabled() returns true if the switcher is currently
+ * enabled. Each call to bL_switcher_get_enabled() must be followed
+ * by a call to bL_switcher_put_enabled(). These functions are not
+ * recursive.
+ */
+bool bL_switcher_get_enabled(void);
+void bL_switcher_put_enabled(void);
+
+int bL_switcher_trace_trigger(void);
+int bL_switcher_get_logical_index(u32 mpidr);
+
+#else
+static inline int bL_switcher_register_notifier(struct notifier_block *nb)
+{
+ return 0;
+}
+
+static inline int bL_switcher_unregister_notifier(struct notifier_block *nb)
+{
+ return 0;
+}
+
+static inline bool bL_switcher_get_enabled(void) { return false; }
+static inline void bL_switcher_put_enabled(void) { }
+static inline int bL_switcher_trace_trigger(void) { return 0; }
+static inline int bL_switcher_get_logical_index(u32 mpidr) { return -EUNATCH; }
+#endif /* CONFIG_BL_SWITCHER */
+
+#endif
return ret;
}
+static inline unsigned long long __cmpxchg64(unsigned long long *ptr,
+ unsigned long long old,
+ unsigned long long new)
+{
+ unsigned long long oldval;
+ unsigned long res;
+
+ __asm__ __volatile__(
+"1: ldrexd %1, %H1, [%3]\n"
+" teq %1, %4\n"
+" teqeq %H1, %H4\n"
+" bne 2f\n"
+" strexd %0, %5, %H5, [%3]\n"
+" teq %0, #0\n"
+" bne 1b\n"
+"2:"
+ : "=&r" (res), "=&r" (oldval), "+Qo" (*ptr)
+ : "r" (ptr), "r" (old), "r" (new)
+ : "cc");
+
+ return oldval;
+}
+
+static inline unsigned long long __cmpxchg64_mb(unsigned long long *ptr,
+ unsigned long long old,
+ unsigned long long new)
+{
+ unsigned long long ret;
+
+ smp_mb();
+ ret = __cmpxchg64(ptr, old, new);
+ smp_mb();
+
+ return ret;
+}
+
#define cmpxchg_local(ptr,o,n) \
((__typeof__(*(ptr)))__cmpxchg_local((ptr), \
(unsigned long)(o), \
sizeof(*(ptr))))
#define cmpxchg64(ptr, o, n) \
- ((__typeof__(*(ptr)))atomic64_cmpxchg(container_of((ptr), \
- atomic64_t, \
- counter), \
- (unsigned long long)(o), \
- (unsigned long long)(n)))
-
-#define cmpxchg64_local(ptr, o, n) \
- ((__typeof__(*(ptr)))local64_cmpxchg(container_of((ptr), \
- local64_t, \
- a), \
- (unsigned long long)(o), \
- (unsigned long long)(n)))
+ ((__typeof__(*(ptr)))__cmpxchg64_mb((ptr), \
+ (unsigned long long)(o), \
+ (unsigned long long)(n)))
+
+#define cmpxchg64_relaxed(ptr, o, n) \
+ ((__typeof__(*(ptr)))__cmpxchg64((ptr), \
+ (unsigned long long)(o), \
+ (unsigned long long)(n)))
+
+#define cmpxchg64_local(ptr, o, n) cmpxchg64_relaxed((ptr), (o), (n))
#endif /* __LINUX_ARM_ARCH__ >= 6 */
#define CPUID_TLBTYPE 3
#define CPUID_MPUIR 4
#define CPUID_MPIDR 5
+#define CPUID_REVIDR 6
#ifdef CONFIG_CPU_V7M
#define CPUID_EXT_PFR0 0x40
{
return (dma_addr_t)__virt_to_bus((unsigned long)(addr));
}
+
#else
static inline dma_addr_t pfn_to_dma(struct device *dev, unsigned long pfn)
{
}
#endif
+/* The ARM override for dma_max_pfn() */
+static inline unsigned long dma_max_pfn(struct device *dev)
+{
+ return PHYS_PFN_OFFSET + dma_to_pfn(dev, *dev->dma_mask);
+}
+#define dma_max_pfn(dev) dma_max_pfn(dev)
+
/*
* DMA errors are defined by all-bits-set in the DMA address.
*/
#include <linux/threads.h>
#include <asm/irq.h>
-#define NR_IPI 6
+#define NR_IPI 7
typedef struct {
unsigned int __softirq_pending;
bool (*smp_init)(void);
void (*fixup)(struct tag *, char **,
struct meminfo *);
+ void (*init_meminfo)(void);
void (*reserve)(void);/* reserve mem blocks */
void (*map_io)(void);/* IO mapping function */
void (*init_early)(void);
*/
void mcpm_set_entry_vector(unsigned cpu, unsigned cluster, void *ptr);
+/*
+ * This sets an early poke i.e a value to be poked into some address
+ * from very early assembly code before the CPU is ungated. The
+ * address must be physical, and if 0 then nothing will happen.
+ */
+void mcpm_set_early_poke(unsigned cpu, unsigned cluster,
+ unsigned long poke_phys_addr, unsigned long poke_val);
+
/*
* CPU/cluster power operations API for higher subsystems to use.
*/
* so that all we need to do is modify the 8-bit constant field.
*/
#define __PV_BITS_31_24 0x81000000
+#define __PV_BITS_7_0 0x81
+
+extern phys_addr_t (*arch_virt_to_idmap) (unsigned long x);
+extern u64 __pv_phys_offset;
+extern u64 __pv_offset;
+extern void fixup_pv_table(const void *, unsigned long);
+extern const void *__pv_table_begin, *__pv_table_end;
-extern unsigned long __pv_phys_offset;
#define PHYS_OFFSET __pv_phys_offset
#define __pv_stub(from,to,instr,type) \
: "=r" (to) \
: "r" (from), "I" (type))
-static inline unsigned long __virt_to_phys(unsigned long x)
+#define __pv_stub_mov_hi(t) \
+ __asm__ volatile("@ __pv_stub_mov\n" \
+ "1: mov %R0, %1\n" \
+ " .pushsection .pv_table,\"a\"\n" \
+ " .long 1b\n" \
+ " .popsection\n" \
+ : "=r" (t) \
+ : "I" (__PV_BITS_7_0))
+
+#define __pv_add_carry_stub(x, y) \
+ __asm__ volatile("@ __pv_add_carry_stub\n" \
+ "1: adds %Q0, %1, %2\n" \
+ " adc %R0, %R0, #0\n" \
+ " .pushsection .pv_table,\"a\"\n" \
+ " .long 1b\n" \
+ " .popsection\n" \
+ : "+r" (y) \
+ : "r" (x), "I" (__PV_BITS_31_24) \
+ : "cc")
+
+static inline phys_addr_t __virt_to_phys(unsigned long x)
{
- unsigned long t;
- __pv_stub(x, t, "add", __PV_BITS_31_24);
+ phys_addr_t t;
+
+ if (sizeof(phys_addr_t) == 4) {
+ __pv_stub(x, t, "add", __PV_BITS_31_24);
+ } else {
+ __pv_stub_mov_hi(t);
+ __pv_add_carry_stub(x, t);
+ }
return t;
}
-static inline unsigned long __phys_to_virt(unsigned long x)
+static inline unsigned long __phys_to_virt(phys_addr_t x)
{
unsigned long t;
__pv_stub(x, t, "sub", __PV_BITS_31_24);
return t;
}
+
#else
-#define __virt_to_phys(x) ((x) - PAGE_OFFSET + PHYS_OFFSET)
-#define __phys_to_virt(x) ((x) - PHYS_OFFSET + PAGE_OFFSET)
+
+static inline phys_addr_t __virt_to_phys(unsigned long x)
+{
+ return (phys_addr_t)x - PAGE_OFFSET + PHYS_OFFSET;
+}
+
+static inline unsigned long __phys_to_virt(phys_addr_t x)
+{
+ return x - PHYS_OFFSET + PAGE_OFFSET;
+}
+
#endif
#endif
#endif /* __ASSEMBLY__ */
static inline void *phys_to_virt(phys_addr_t x)
{
- return (void *)(__phys_to_virt((unsigned long)(x)));
+ return (void *)__phys_to_virt(x);
}
/*
* Drivers should NOT use these either.
*/
#define __pa(x) __virt_to_phys((unsigned long)(x))
-#define __va(x) ((void *)__phys_to_virt((unsigned long)(x)))
+#define __va(x) ((void *)__phys_to_virt((phys_addr_t)(x)))
#define pfn_to_kaddr(pfn) __va((pfn) << PAGE_SHIFT)
+/*
+ * These are for systems that have a hardware interconnect supported alias of
+ * physical memory for idmap purposes. Most cases should leave these
+ * untouched.
+ */
+static inline phys_addr_t __virt_to_idmap(unsigned long x)
+{
+ if (arch_virt_to_idmap)
+ return arch_virt_to_idmap(x);
+ else
+ return __virt_to_phys(x);
+}
+
+#define virt_to_idmap(x) __virt_to_idmap((unsigned long)(x))
+
/*
* Virtual <-> DMA view memory address translations
* Again, these are *only* valid on the kernel direct mapped RAM
#define set_pte_ext(ptep,pte,ext) cpu_set_pte_ext(ptep,pte,ext)
+/*
+ * We don't have huge page support for short descriptors, for the moment
+ * define empty stubs for use by pin_page_for_write.
+ */
+#define pmd_hugewillfault(pmd) (0)
+#define pmd_thp_or_huge(pmd) (0)
+
#endif /* __ASSEMBLY__ */
#endif /* _ASM_PGTABLE_2LEVEL_H */
#define __HAVE_ARCH_PMD_WRITE
#define pmd_write(pmd) (!(pmd_val(pmd) & PMD_SECT_RDONLY))
+#define pmd_hugewillfault(pmd) (!pmd_young(pmd) || !pmd_write(pmd))
+#define pmd_thp_or_huge(pmd) (pmd_huge(pmd) || pmd_trans_huge(pmd))
+
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
#define pmd_trans_huge(pmd) (pmd_val(pmd) && !(pmd_val(pmd) & PMD_TABLE_BIT))
#define pmd_trans_splitting(pmd) (pmd_val(pmd) & PMD_SECT_SPLITTING)
#include <asm/hw_breakpoint.h>
#include <asm/ptrace.h>
#include <asm/types.h>
+#include <asm/unified.h>
#ifdef __KERNEL__
#define STACK_TOP ((current->personality & ADDR_LIMIT_32BIT) ? \
#define KSTK_EIP(tsk) task_pt_regs(tsk)->ARM_pc
#define KSTK_ESP(tsk) task_pt_regs(tsk)->ARM_sp
+#ifdef CONFIG_SMP
+#define __ALT_SMP_ASM(smp, up) \
+ "9998: " smp "\n" \
+ " .pushsection \".alt.smp.init\", \"a\"\n" \
+ " .long 9998b\n" \
+ " " up "\n" \
+ " .popsection\n"
+#else
+#define __ALT_SMP_ASM(smp, up) up
+#endif
+
/*
* Prefetching support - only ARMv5.
*/
{
__asm__ __volatile__(
"pld\t%a0"
- :
- : "p" (ptr)
- : "cc");
+ :: "p" (ptr));
}
+#if __LINUX_ARM_ARCH__ >= 7 && defined(CONFIG_SMP)
#define ARCH_HAS_PREFETCHW
-#define prefetchw(ptr) prefetch(ptr)
-
-#define ARCH_HAS_SPINLOCK_PREFETCH
-#define spin_lock_prefetch(x) do { } while (0)
-
+static inline void prefetchw(const void *ptr)
+{
+ __asm__ __volatile__(
+ ".arch_extension mp\n"
+ __ALT_SMP_ASM(
+ WASM(pldw) "\t%a0",
+ WASM(pld) "\t%a0"
+ )
+ :: "p" (ptr));
+}
+#endif
#endif
#define HAVE_ARCH_PICK_MMAP_LAYOUT
extern void arch_send_call_function_ipi_mask(const struct cpumask *mask);
extern void arch_send_wakeup_ipi_mask(const struct cpumask *mask);
+extern int register_ipi_completion(struct completion *completion, int cpu);
+
struct smp_operations {
#ifdef CONFIG_SMP
/*
#error SMP not supported on pre-ARMv6 CPUs
#endif
-#include <asm/processor.h>
+#include <linux/prefetch.h>
/*
* sev and wfe are ARMv6K extensions. Uniprocessor ARMv6 may not have the K
* extensions, so when running on UP, we have to patch these instructions away.
*/
-#define ALT_SMP(smp, up) \
- "9998: " smp "\n" \
- " .pushsection \".alt.smp.init\", \"a\"\n" \
- " .long 9998b\n" \
- " " up "\n" \
- " .popsection\n"
-
#ifdef CONFIG_THUMB2_KERNEL
-#define SEV ALT_SMP("sev.w", "nop.w")
/*
* For Thumb-2, special care is needed to ensure that the conditional WFE
* instruction really does assemble to exactly 4 bytes (as required by
* the assembler won't change IT instructions which are explicitly present
* in the input.
*/
-#define WFE(cond) ALT_SMP( \
+#define WFE(cond) __ALT_SMP_ASM( \
"it " cond "\n\t" \
"wfe" cond ".n", \
\
"nop.w" \
)
#else
-#define SEV ALT_SMP("sev", "nop")
-#define WFE(cond) ALT_SMP("wfe" cond, "nop")
+#define WFE(cond) __ALT_SMP_ASM("wfe" cond, "nop")
#endif
+#define SEV __ALT_SMP_ASM(WASM(sev), WASM(nop))
+
static inline void dsb_sev(void)
{
#if __LINUX_ARM_ARCH__ >= 7
u32 newval;
arch_spinlock_t lockval;
+ prefetchw(&lock->slock);
__asm__ __volatile__(
"1: ldrex %0, [%3]\n"
" add %1, %0, %4\n"
unsigned long contended, res;
u32 slock;
+ prefetchw(&lock->slock);
do {
__asm__ __volatile__(
" ldrex %0, [%3]\n"
dsb_sev();
}
+static inline int arch_spin_value_unlocked(arch_spinlock_t lock)
+{
+ return lock.tickets.owner == lock.tickets.next;
+}
+
static inline int arch_spin_is_locked(arch_spinlock_t *lock)
{
- struct __raw_tickets tickets = ACCESS_ONCE(lock->tickets);
- return tickets.owner != tickets.next;
+ return !arch_spin_value_unlocked(ACCESS_ONCE(*lock));
}
static inline int arch_spin_is_contended(arch_spinlock_t *lock)
{
unsigned long tmp;
+ prefetchw(&rw->lock);
__asm__ __volatile__(
"1: ldrex %0, [%1]\n"
" teq %0, #0\n"
{
unsigned long contended, res;
+ prefetchw(&rw->lock);
do {
__asm__ __volatile__(
" ldrex %0, [%2]\n"
}
/* write_can_lock - would write_trylock() succeed? */
-#define arch_write_can_lock(x) ((x)->lock == 0)
+#define arch_write_can_lock(x) (ACCESS_ONCE((x)->lock) == 0)
/*
* Read locks are a bit more hairy:
{
unsigned long tmp, tmp2;
+ prefetchw(&rw->lock);
__asm__ __volatile__(
"1: ldrex %0, [%2]\n"
" adds %0, %0, #1\n"
smp_mb();
+ prefetchw(&rw->lock);
__asm__ __volatile__(
"1: ldrex %0, [%2]\n"
" sub %0, %0, #1\n"
{
unsigned long contended, res;
+ prefetchw(&rw->lock);
do {
__asm__ __volatile__(
" ldrex %0, [%2]\n"
}
/* read_can_lock - would read_trylock() succeed? */
-#define arch_read_can_lock(x) ((x)->lock < 0x80000000)
+#define arch_read_can_lock(x) (ACCESS_ONCE((x)->lock) < 0x80000000)
#define arch_read_lock_flags(lock, flags) arch_read_lock(lock)
#define arch_write_lock_flags(lock, flags) arch_write_lock(lock)
#define __ARCH_SPIN_LOCK_UNLOCKED { { 0 } }
typedef struct {
- volatile unsigned int lock;
+ u32 lock;
} arch_rwlock_t;
#define __ARCH_RW_LOCK_UNLOCKED { 0 }
asm("mcr p15, 0, %0, c7, c1, 6" : : "r" (zero));
}
-#include <asm/cputype.h>
-#ifdef CONFIG_ARM_ERRATA_798181
-static inline int erratum_a15_798181(void)
-{
- unsigned int midr = read_cpuid_id();
-
- /* Cortex-A15 r0p0..r3p2 affected */
- if ((midr & 0xff0ffff0) != 0x410fc0f0 || midr > 0x413fc0f2)
- return 0;
- return 1;
-}
-
-static inline void dummy_flush_tlb_a15_erratum(void)
-{
- /*
- * Dummy TLBIMVAIS. Using the unmapped address 0 and ASID 0.
- */
- asm("mcr p15, 0, %0, c8, c3, 1" : : "r" (0));
- dsb(ish);
-}
-#else
-static inline int erratum_a15_798181(void)
-{
- return 0;
-}
-
-static inline void dummy_flush_tlb_a15_erratum(void)
-{
-}
-#endif
-
/*
* flush_pmd_entry
*
#endif
+#ifndef __ASSEMBLY__
+#ifdef CONFIG_ARM_ERRATA_798181
+extern void erratum_a15_798181_init(void);
+#else
+static inline void erratum_a15_798181_init(void) {}
+#endif
+extern bool (*erratum_a15_798181_handler)(void);
+
+static inline bool erratum_a15_798181(void)
+{
+ if (unlikely(IS_ENABLED(CONFIG_ARM_ERRATA_798181) &&
+ erratum_a15_798181_handler))
+ return erratum_a15_798181_handler();
+ return false;
+}
+#endif
+
#endif
#ifdef __ASSEMBLY__
#define W(instr) instr.w
#define BSYM(sym) sym + 1
+#else
+#define WASM(instr) #instr ".w"
#endif
#else /* !CONFIG_THUMB2_KERNEL */
#ifdef __ASSEMBLY__
#define W(instr) instr
#define BSYM(sym) sym
+#else
+#define WASM(instr) #instr
#endif
#endif /* CONFIG_THUMB2_KERNEL */
--- /dev/null
+/*
+ * Copyright (C) 2013 Pengutronix
+ * Uwe Kleine-Koenig <u.kleine-koenig@pengutronix.de>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#define UARTn_CMD 0x000c
+#define UARTn_CMD_TXEN 0x0004
+
+#define UARTn_STATUS 0x0010
+#define UARTn_STATUS_TXC 0x0020
+#define UARTn_STATUS_TXBL 0x0040
+
+#define UARTn_TXDATA 0x0034
+
+ .macro addruart, rx, tmp
+ ldr \rx, =(CONFIG_DEBUG_UART_PHYS)
+
+ /*
+ * enable TX. The driver might disable it to save energy. We
+ * don't care about disabling at the end as during debug power
+ * consumption isn't that important.
+ */
+ ldr \tmp, =(UARTn_CMD_TXEN)
+ str \tmp, [\rx, #UARTn_CMD]
+ .endm
+
+ .macro senduart,rd,rx
+ strb \rd, [\rx, #UARTn_TXDATA]
+ .endm
+
+ .macro waituart,rd,rx
+1001: ldr \rd, [\rx, #UARTn_STATUS]
+ tst \rd, #UARTn_STATUS_TXBL
+ beq 1001b
+ .endm
+
+ .macro busyuart,rd,rx
+1001: ldr \rd, [\rx, UARTn_STATUS]
+ tst \rd, #UARTn_STATUS_TXC
+ bne 1001b
+ .endm
header-y += ioctls.h
header-y += kvm_para.h
header-y += mman.h
+header-y += perf_regs.h
header-y += posix_types.h
header-y += ptrace.h
header-y += setup.h
--- /dev/null
+#ifndef _ASM_ARM_PERF_REGS_H
+#define _ASM_ARM_PERF_REGS_H
+
+enum perf_event_arm_regs {
+ PERF_REG_ARM_R0,
+ PERF_REG_ARM_R1,
+ PERF_REG_ARM_R2,
+ PERF_REG_ARM_R3,
+ PERF_REG_ARM_R4,
+ PERF_REG_ARM_R5,
+ PERF_REG_ARM_R6,
+ PERF_REG_ARM_R7,
+ PERF_REG_ARM_R8,
+ PERF_REG_ARM_R9,
+ PERF_REG_ARM_R10,
+ PERF_REG_ARM_FP,
+ PERF_REG_ARM_IP,
+ PERF_REG_ARM_SP,
+ PERF_REG_ARM_LR,
+ PERF_REG_ARM_PC,
+ PERF_REG_ARM_MAX,
+};
+#endif /* _ASM_ARM_PERF_REGS_H */
obj-$(CONFIG_CPU_MOHAWK) += xscale-cp0.o
obj-$(CONFIG_CPU_PJ4) += pj4-cp0.o
obj-$(CONFIG_IWMMXT) += iwmmxt.o
+obj-$(CONFIG_PERF_EVENTS) += perf_regs.o
obj-$(CONFIG_HW_PERF_EVENTS) += perf_event.o perf_event_cpu.o
AFLAGS_iwmmxt.o := -Wa,-mcpu=iwmmxt
obj-$(CONFIG_ARM_CPU_TOPOLOGY) += topology.o
#ifdef CONFIG_ARM_PATCH_PHYS_VIRT
EXPORT_SYMBOL(__pv_phys_offset);
+EXPORT_SYMBOL(__pv_offset);
#endif
ldmfd sp!, {r4 - r6, pc}
ENDPROC(fixup_smp)
+#ifdef __ARMEB_
+#define LOW_OFFSET 0x4
+#define HIGH_OFFSET 0x0
+#else
+#define LOW_OFFSET 0x0
+#define HIGH_OFFSET 0x4
+#endif
+
#ifdef CONFIG_ARM_PATCH_PHYS_VIRT
/* __fixup_pv_table - patch the stub instructions with the delta between
__HEAD
__fixup_pv_table:
adr r0, 1f
- ldmia r0, {r3-r5, r7}
- sub r3, r0, r3 @ PHYS_OFFSET - PAGE_OFFSET
+ ldmia r0, {r3-r7}
+ mvn ip, #0
+ subs r3, r0, r3 @ PHYS_OFFSET - PAGE_OFFSET
add r4, r4, r3 @ adjust table start address
add r5, r5, r3 @ adjust table end address
- add r7, r7, r3 @ adjust __pv_phys_offset address
- str r8, [r7] @ save computed PHYS_OFFSET to __pv_phys_offset
+ add r6, r6, r3 @ adjust __pv_phys_offset address
+ add r7, r7, r3 @ adjust __pv_offset address
+ str r8, [r6, #LOW_OFFSET] @ save computed PHYS_OFFSET to __pv_phys_offset
+ strcc ip, [r7, #HIGH_OFFSET] @ save to __pv_offset high bits
mov r6, r3, lsr #24 @ constant for add/sub instructions
teq r3, r6, lsl #24 @ must be 16MiB aligned
THUMB( it ne @ cross section branch )
bne __error
- str r6, [r7, #4] @ save to __pv_offset
+ str r3, [r7, #LOW_OFFSET] @ save to __pv_offset low bits
b __fixup_a_pv_table
ENDPROC(__fixup_pv_table)
.long __pv_table_begin
.long __pv_table_end
2: .long __pv_phys_offset
+ .long __pv_offset
.text
__fixup_a_pv_table:
+ adr r0, 3f
+ ldr r6, [r0]
+ add r6, r6, r3
+ ldr r0, [r6, #HIGH_OFFSET] @ pv_offset high word
+ ldr r6, [r6, #LOW_OFFSET] @ pv_offset low word
+ mov r6, r6, lsr #24
+ cmn r0, #1
#ifdef CONFIG_THUMB2_KERNEL
+ moveq r0, #0x200000 @ set bit 21, mov to mvn instruction
lsls r6, #24
beq 2f
clz r7, r6
b 2f
1: add r7, r3
ldrh ip, [r7, #2]
- and ip, 0x8f00
- orr ip, r6 @ mask in offset bits 31-24
+ tst ip, #0x4000
+ and ip, #0x8f00
+ orrne ip, r6 @ mask in offset bits 31-24
+ orreq ip, r0 @ mask in offset bits 7-0
strh ip, [r7, #2]
+ ldrheq ip, [r7]
+ biceq ip, #0x20
+ orreq ip, ip, r0, lsr #16
+ strheq ip, [r7]
2: cmp r4, r5
ldrcc r7, [r4], #4 @ use branch for delay slot
bcc 1b
bx lr
#else
+ moveq r0, #0x400000 @ set bit 22, mov to mvn instruction
b 2f
1: ldr ip, [r7, r3]
bic ip, ip, #0x000000ff
- orr ip, ip, r6 @ mask in offset bits 31-24
+ tst ip, #0xf00 @ check the rotation field
+ orrne ip, ip, r6 @ mask in offset bits 31-24
+ biceq ip, ip, #0x400000 @ clear bit 22
+ orreq ip, ip, r0 @ mask in offset bits 7-0
str ip, [r7, r3]
2: cmp r4, r5
ldrcc r7, [r4], #4 @ use branch for delay slot
#endif
ENDPROC(__fixup_a_pv_table)
+3: .long __pv_offset
+
ENTRY(fixup_pv_table)
stmfd sp!, {r4 - r7, lr}
- ldr r2, 2f @ get address of __pv_phys_offset
mov r3, #0 @ no offset
mov r4, r0 @ r0 = table start
add r5, r0, r1 @ r1 = table size
- ldr r6, [r2, #4] @ get __pv_offset
bl __fixup_a_pv_table
ldmfd sp!, {r4 - r7, pc}
ENDPROC(fixup_pv_table)
- .align
-2: .long __pv_phys_offset
-
.data
.globl __pv_phys_offset
.type __pv_phys_offset, %object
__pv_phys_offset:
- .long 0
- .size __pv_phys_offset, . - __pv_phys_offset
+ .quad 0
+ .size __pv_phys_offset, . -__pv_phys_offset
+
+ .globl __pv_offset
+ .type __pv_offset, %object
__pv_offset:
- .long 0
+ .quad 0
+ .size __pv_offset, . -__pv_offset
#endif
#include "head-common.S"
struct perf_event *event)
{
struct arm_pmu *armpmu = to_arm_pmu(event->pmu);
- struct pmu *leader_pmu = event->group_leader->pmu;
if (is_software_event(event))
return 1;
- if (event->pmu != leader_pmu || event->state < PERF_EVENT_STATE_OFF)
+ if (event->state < PERF_EVENT_STATE_OFF)
return 1;
if (event->state == PERF_EVENT_STATE_OFF && !event->attr.enable_on_exec)
--- /dev/null
+
+#include <linux/errno.h>
+#include <linux/kernel.h>
+#include <linux/perf_event.h>
+#include <linux/bug.h>
+#include <asm/perf_regs.h>
+#include <asm/ptrace.h>
+
+u64 perf_reg_value(struct pt_regs *regs, int idx)
+{
+ if (WARN_ON_ONCE((u32)idx >= PERF_REG_ARM_MAX))
+ return 0;
+
+ return regs->uregs[idx];
+}
+
+#define REG_RESERVED (~((1ULL << PERF_REG_ARM_MAX) - 1))
+
+int perf_reg_validate(u64 mask)
+{
+ if (!mask || mask & REG_RESERVED)
+ return -EINVAL;
+
+ return 0;
+}
+
+u64 perf_reg_abi(struct task_struct *task)
+{
+ return PERF_SAMPLE_REGS_ABI_32;
+}
#endif
extern void paging_init(const struct machine_desc *desc);
+extern void early_paging_init(const struct machine_desc *,
+ struct proc_info_list *);
extern void sanity_check_meminfo(void);
extern enum reboot_mode reboot_mode;
extern void setup_dma_zone(const struct machine_desc *desc);
elf_hwcap &= ~(HWCAP_THUMB | HWCAP_IDIVT);
#endif
+ erratum_a15_798181_init();
+
feat_v6_fixup();
cacheid_init();
parse_early_param();
sort(&meminfo.bank, meminfo.nr_banks, sizeof(meminfo.bank[0]), meminfo_cmp, NULL);
+
+ early_paging_init(mdesc, lookup_processor_type(read_cpuid_id()));
sanity_check_meminfo();
arm_memblock_init(&meminfo, mdesc);
* specific registers and some other data for resume.
* r0 = suspend function arg0
* r1 = suspend function
+ * r2 = MPIDR value the resuming CPU will use
*/
ENTRY(__cpu_suspend)
stmfd sp!, {r4 - r11, lr}
mov r5, sp @ current virtual SP
add r4, r4, #12 @ Space for pgd, virt sp, phys resume fn
sub sp, sp, r4 @ allocate CPU state on stack
- stmfd sp!, {r0, r1} @ save suspend func arg and pointer
- add r0, sp, #8 @ save pointer to save block
- mov r1, r4 @ size of save block
- mov r2, r5 @ virtual SP
ldr r3, =sleep_save_sp
+ stmfd sp!, {r0, r1} @ save suspend func arg and pointer
ldr r3, [r3, #SLEEP_SAVE_SP_VIRT]
- ALT_SMP(mrc p15, 0, r9, c0, c0, 5)
- ALT_UP_B(1f)
- ldr r8, =mpidr_hash
- /*
- * This ldmia relies on the memory layout of the mpidr_hash
- * struct mpidr_hash.
- */
- ldmia r8, {r4-r7} @ r4 = mpidr mask (r5,r6,r7) = l[0,1,2] shifts
- compute_mpidr_hash lr, r5, r6, r7, r9, r4
- add r3, r3, lr, lsl #2
-1:
+ ALT_SMP(ldr r0, =mpidr_hash)
+ ALT_UP_B(1f)
+ /* This ldmia relies on the memory layout of the mpidr_hash struct */
+ ldmia r0, {r1, r6-r8} @ r1 = mpidr mask (r6,r7,r8) = l[0,1,2] shifts
+ compute_mpidr_hash r0, r6, r7, r8, r2, r1
+ add r3, r3, r0, lsl #2
+1: mov r2, r5 @ virtual SP
+ mov r1, r4 @ size of save block
+ add r0, sp, #8 @ pointer to save block
bl __cpu_suspend_save
adr lr, BSYM(cpu_suspend_abort)
ldmfd sp!, {r0, pc} @ call suspend fn
IPI_CALL_FUNC,
IPI_CALL_FUNC_SINGLE,
IPI_CPU_STOP,
+ IPI_COMPLETION,
};
static DECLARE_COMPLETION(cpu_running);
static unsigned long get_arch_pgd(pgd_t *pgd)
{
- phys_addr_t pgdir = virt_to_phys(pgd);
+ phys_addr_t pgdir = virt_to_idmap(pgd);
BUG_ON(pgdir & ARCH_PGD_MASK);
return pgdir >> ARCH_PGD_SHIFT;
}
S(IPI_CALL_FUNC, "Function call interrupts"),
S(IPI_CALL_FUNC_SINGLE, "Single function call interrupts"),
S(IPI_CPU_STOP, "CPU stop interrupts"),
+ S(IPI_COMPLETION, "completion interrupts"),
};
void show_ipi_list(struct seq_file *p, int prec)
cpu_relax();
}
+static DEFINE_PER_CPU(struct completion *, cpu_completion);
+
+int register_ipi_completion(struct completion *completion, int cpu)
+{
+ per_cpu(cpu_completion, cpu) = completion;
+ return IPI_COMPLETION;
+}
+
+static void ipi_complete(unsigned int cpu)
+{
+ complete(per_cpu(cpu_completion, cpu));
+}
+
/*
* Main handler for inter-processor interrupts
*/
irq_exit();
break;
+ case IPI_COMPLETION:
+ irq_enter();
+ ipi_complete(cpu);
+ irq_exit();
+ break;
+
default:
printk(KERN_CRIT "CPU%u: Unknown IPI message 0x%x\n",
cpu, ipinr);
local_flush_bp_all();
}
+#ifdef CONFIG_ARM_ERRATA_798181
+bool (*erratum_a15_798181_handler)(void);
+
+static bool erratum_a15_798181_partial(void)
+{
+ asm("mcr p15, 0, %0, c8, c3, 1" : : "r" (0));
+ dsb(ish);
+ return false;
+}
+
+static bool erratum_a15_798181_broadcast(void)
+{
+ asm("mcr p15, 0, %0, c8, c3, 1" : : "r" (0));
+ dsb(ish);
+ return true;
+}
+
+void erratum_a15_798181_init(void)
+{
+ unsigned int midr = read_cpuid_id();
+ unsigned int revidr = read_cpuid(CPUID_REVIDR);
+
+ /* Cortex-A15 r0p0..r3p2 w/o ECO fix affected */
+ if ((midr & 0xff0ffff0) != 0x410fc0f0 || midr > 0x413fc0f2 ||
+ (revidr & 0x210) == 0x210) {
+ return;
+ }
+ if (revidr & 0x10)
+ erratum_a15_798181_handler = erratum_a15_798181_partial;
+ else
+ erratum_a15_798181_handler = erratum_a15_798181_broadcast;
+}
+#endif
+
static void ipi_flush_tlb_a15_erratum(void *arg)
{
dmb();
if (!erratum_a15_798181())
return;
- dummy_flush_tlb_a15_erratum();
smp_call_function(ipi_flush_tlb_a15_erratum, NULL, 1);
}
if (!erratum_a15_798181())
return;
- dummy_flush_tlb_a15_erratum();
this_cpu = get_cpu();
a15_erratum_get_cpumask(this_cpu, mm, &mask);
smp_call_function_many(&mask, ipi_flush_tlb_a15_erratum, NULL, 1);
#include <asm/suspend.h>
#include <asm/tlbflush.h>
-extern int __cpu_suspend(unsigned long, int (*)(unsigned long));
+extern int __cpu_suspend(unsigned long, int (*)(unsigned long), u32 cpuid);
extern void cpu_resume_mmu(void);
#ifdef CONFIG_MMU
int cpu_suspend(unsigned long arg, int (*fn)(unsigned long))
{
struct mm_struct *mm = current->active_mm;
+ u32 __mpidr = cpu_logical_map(smp_processor_id());
int ret;
if (!idmap_pgd)
* resume (indicated by a zero return code), we need to switch
* back to the correct page tables.
*/
- ret = __cpu_suspend(arg, fn);
+ ret = __cpu_suspend(arg, fn, __mpidr);
if (ret == 0) {
cpu_switch_mm(mm->pgd, mm);
local_flush_bp_all();
#else
int cpu_suspend(unsigned long arg, int (*fn)(unsigned long))
{
- return __cpu_suspend(arg, fn);
+ u32 __mpidr = cpu_logical_map(smp_processor_id());
+ return __cpu_suspend(arg, fn, __mpidr);
}
#define idmap_pgd NULL
#endif
and r3, r0, #31 @ Get bit offset
mov r0, r0, lsr #5
add r1, r1, r0, lsl #2 @ Get word offset
+#if __LINUX_ARM_ARCH__ >= 7
+ .arch_extension mp
+ ALT_SMP(W(pldw) [r1])
+ ALT_UP(W(nop))
+#endif
mov r3, r2, lsl r3
1: ldrex r2, [r1]
\instr r2, r2, r3
#include <linux/hardirq.h> /* for in_atomic() */
#include <linux/gfp.h>
#include <linux/highmem.h>
+#include <linux/hugetlb.h>
#include <asm/current.h>
#include <asm/page.h>
return 0;
pmd = pmd_offset(pud, addr);
- if (unlikely(pmd_none(*pmd) || pmd_bad(*pmd)))
+ if (unlikely(pmd_none(*pmd)))
+ return 0;
+
+ /*
+ * A pmd can be bad if it refers to a HugeTLB or THP page.
+ *
+ * Both THP and HugeTLB pages have the same pmd layout
+ * and should not be manipulated by the pte functions.
+ *
+ * Lock the page table for the destination and check
+ * to see that it's still huge and whether or not we will
+ * need to fault on write, or if we have a splitting THP.
+ */
+ if (unlikely(pmd_thp_or_huge(*pmd))) {
+ ptl = ¤t->mm->page_table_lock;
+ spin_lock(ptl);
+ if (unlikely(!pmd_thp_or_huge(*pmd)
+ || pmd_hugewillfault(*pmd)
+ || pmd_trans_splitting(*pmd))) {
+ spin_unlock(ptl);
+ return 0;
+ }
+
+ *ptep = NULL;
+ *ptlp = ptl;
+ return 1;
+ }
+
+ if (unlikely(pmd_bad(*pmd)))
return 0;
pte = pte_offset_map_lock(current->mm, pmd, addr, &ptl);
from += tocopy;
n -= tocopy;
- pte_unmap_unlock(pte, ptl);
+ if (pte)
+ pte_unmap_unlock(pte, ptl);
+ else
+ spin_unlock(ptl);
}
if (!atomic)
up_read(¤t->mm->mmap_sem);
addr += tocopy;
n -= tocopy;
- pte_unmap_unlock(pte, ptl);
+ if (pte)
+ pte_unmap_unlock(pte, ptl);
+ else
+ spin_unlock(ptl);
}
up_read(¤t->mm->mmap_sem);
* Its called GPCLKR0 in my SA1110 manual.
*/
Ser1SDCR0 |= SDCR0_SUS;
+ MSC1 = (MSC1 & ~0xffff) |
+ MSC_NonBrst | MSC_32BitStMem |
+ MSC_RdAcc(2) | MSC_WrAcc(2) | MSC_Rec(0);
if (!machine_has_neponset())
sa1100_register_uart_fns(&assabet_port_fns);
+++ /dev/null
-/*
- * arch/arm/mach-sa1100/include/mach/gpio.h
- *
- * SA1100 GPIO wrappers for arch-neutral GPIO calls
- *
- * Written by Philipp Zabel <philipp.zabel@gmail.com>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
- *
- */
-
-#ifndef __ASM_ARCH_SA1100_GPIO_H
-#define __ASM_ARCH_SA1100_GPIO_H
-
-#include <linux/io.h>
-#include <mach/hardware.h>
-#include <asm/irq.h>
-#include <asm-generic/gpio.h>
-
-#define __ARM_GPIOLIB_COMPLEX
-
-static inline int gpio_get_value(unsigned gpio)
-{
- if (__builtin_constant_p(gpio) && (gpio <= GPIO_MAX))
- return GPLR & GPIO_GPIO(gpio);
- else
- return __gpio_get_value(gpio);
-}
-
-static inline void gpio_set_value(unsigned gpio, int value)
-{
- if (__builtin_constant_p(gpio) && (gpio <= GPIO_MAX))
- if (value)
- GPSR = GPIO_GPIO(gpio);
- else
- GPCR = GPIO_GPIO(gpio);
- else
- __gpio_set_value(gpio, value);
-}
-
-#define gpio_cansleep __gpio_cansleep
-
-#endif
#ifndef _INCLUDE_H3XXX_H_
#define _INCLUDE_H3XXX_H_
+#include "hardware.h" /* Gives GPIO_MAX */
+
/* Physical memory regions corresponding to chip selects */
#define H3600_EGPIO_PHYS (SA1100_CS5_PHYS + 0x01000000)
#define H3600_BANK_2_PHYS SA1100_CS2_PHYS
static u64 get_coherent_dma_mask(struct device *dev)
{
- u64 mask = (u64)arm_dma_limit;
+ u64 mask = (u64)DMA_BIT_MASK(32);
if (dev) {
mask = dev->coherent_dma_mask;
return 0;
}
- if ((~mask) & (u64)arm_dma_limit) {
- dev_warn(dev, "coherent DMA mask %#llx is smaller "
- "than system GFP_DMA mask %#llx\n",
- mask, (u64)arm_dma_limit);
+ /*
+ * If the mask allows for more memory than we can address,
+ * and we actually have that much memory, then fail the
+ * allocation.
+ */
+ if (sizeof(mask) != sizeof(dma_addr_t) &&
+ mask > (dma_addr_t)~0 &&
+ dma_to_pfn(dev, ~0) > arm_dma_pfn_limit) {
+ dev_warn(dev, "Coherent DMA mask %#llx is larger than dma_addr_t allows\n",
+ mask);
+ dev_warn(dev, "Driver did not use or check the return value from dma_set_coherent_mask()?\n");
+ return 0;
+ }
+
+ /*
+ * Now check that the mask, when translated to a PFN,
+ * fits within the allowable addresses which we can
+ * allocate.
+ */
+ if (dma_to_pfn(dev, mask) < arm_dma_pfn_limit) {
+ dev_warn(dev, "Coherent DMA mask %#llx (pfn %#lx-%#lx) covers a smaller range of system memory than the DMA zone pfn 0x0-%#lx\n",
+ mask,
+ dma_to_pfn(dev, 0), dma_to_pfn(dev, mask) + 1,
+ arm_dma_pfn_limit + 1);
return 0;
}
}
*/
int dma_supported(struct device *dev, u64 mask)
{
- if (mask < (u64)arm_dma_limit)
+ unsigned long limit;
+
+ /*
+ * If the mask allows for more memory than we can address,
+ * and we actually have that much memory, then we must
+ * indicate that DMA to this device is not supported.
+ */
+ if (sizeof(mask) != sizeof(dma_addr_t) &&
+ mask > (dma_addr_t)~0 &&
+ dma_to_pfn(dev, ~0) > arm_dma_pfn_limit)
+ return 0;
+
+ /*
+ * Translate the device's DMA mask to a PFN limit. This
+ * PFN number includes the page which we can DMA to.
+ */
+ limit = dma_to_pfn(dev, mask);
+
+ if (limit < arm_dma_pfn_limit)
return 0;
+
return 1;
}
EXPORT_SYMBOL(dma_supported);
#include <asm/system_info.h>
pgd_t *idmap_pgd;
+phys_addr_t (*arch_virt_to_idmap) (unsigned long x);
#ifdef CONFIG_ARM_LPAE
static void idmap_add_pmd(pud_t *pud, unsigned long addr, unsigned long end,
unsigned long addr, end;
unsigned long next;
- addr = virt_to_phys(text_start);
- end = virt_to_phys(text_end);
+ addr = virt_to_idmap(text_start);
+ end = virt_to_idmap(text_end);
+ pr_info("Setting up static identity map for 0x%lx - 0x%lx\n", addr, end);
prot |= PMD_TYPE_SECT | PMD_SECT_AP_WRITE | PMD_SECT_AF;
if (!idmap_pgd)
return -ENOMEM;
- pr_info("Setting up static identity map for 0x%p - 0x%p\n",
- __idmap_text_start, __idmap_text_end);
identity_mapping_add(idmap_pgd, __idmap_text_start,
__idmap_text_end, 0);
* so a successful GFP_DMA allocation will always satisfy this.
*/
phys_addr_t arm_dma_limit;
+unsigned long arm_dma_pfn_limit;
static void __init arm_adjust_dma_zone(unsigned long *size, unsigned long *hole,
unsigned long dma_size)
arm_dma_limit = PHYS_OFFSET + arm_dma_zone_size - 1;
} else
arm_dma_limit = 0xffffffff;
+ arm_dma_pfn_limit = arm_dma_limit >> PAGE_SHIFT;
#endif
}
* This doesn't seem to be used by the Linux memory manager any
* more, but is used by ll_rw_block. If we can get rid of it, we
* also get rid of some of the stuff above as well.
- *
- * Note: max_low_pfn and max_pfn reflect the number of _pages_ in
- * the system, not the maximum PFN.
*/
- max_low_pfn = max_low - PHYS_PFN_OFFSET;
- max_pfn = max_high - PHYS_PFN_OFFSET;
+ min_low_pfn = min;
+ max_low_pfn = max_low;
+ max_pfn = max_high;
}
/*
static void __init free_highpages(void)
{
#ifdef CONFIG_HIGHMEM
- unsigned long max_low = max_low_pfn + PHYS_PFN_OFFSET;
+ unsigned long max_low = max_low_pfn;
struct memblock_region *mem, *res;
/* set highmem page free */
#ifdef CONFIG_ZONE_DMA
extern phys_addr_t arm_dma_limit;
+extern unsigned long arm_dma_pfn_limit;
#else
#define arm_dma_limit ((phys_addr_t)~0)
+#define arm_dma_pfn_limit (~0ul >> PAGE_SHIFT)
#endif
extern phys_addr_t arm_lowmem_limit;
}
/*
- * We don't use supersection mappings for mmap() on /dev/mem, which
- * means that we can't map the memory area above the 4G barrier into
- * userspace.
+ * Do not allow /dev/mem mappings beyond the supported physical range.
*/
int valid_mmap_phys_addr_range(unsigned long pfn, size_t size)
{
- return !(pfn + (size >> PAGE_SHIFT) > 0x00100000);
+ return (pfn + (size >> PAGE_SHIFT)) <= (1 + (PHYS_MASK >> PAGE_SHIFT));
}
#ifdef CONFIG_STRICT_DEVMEM
#include <asm/highmem.h>
#include <asm/system_info.h>
#include <asm/traps.h>
+#include <asm/procinfo.h>
+#include <asm/memory.h>
#include <asm/mach/arch.h>
#include <asm/mach/map.h>
}
}
+#ifdef CONFIG_ARM_LPAE
+/*
+ * early_paging_init() recreates boot time page table setup, allowing machines
+ * to switch over to a high (>4G) address space on LPAE systems
+ */
+void __init early_paging_init(const struct machine_desc *mdesc,
+ struct proc_info_list *procinfo)
+{
+ pmdval_t pmdprot = procinfo->__cpu_mm_mmu_flags;
+ unsigned long map_start, map_end;
+ pgd_t *pgd0, *pgdk;
+ pud_t *pud0, *pudk, *pud_start;
+ pmd_t *pmd0, *pmdk;
+ phys_addr_t phys;
+ int i;
+
+ if (!(mdesc->init_meminfo))
+ return;
+
+ /* remap kernel code and data */
+ map_start = init_mm.start_code;
+ map_end = init_mm.brk;
+
+ /* get a handle on things... */
+ pgd0 = pgd_offset_k(0);
+ pud_start = pud0 = pud_offset(pgd0, 0);
+ pmd0 = pmd_offset(pud0, 0);
+
+ pgdk = pgd_offset_k(map_start);
+ pudk = pud_offset(pgdk, map_start);
+ pmdk = pmd_offset(pudk, map_start);
+
+ mdesc->init_meminfo();
+
+ /* Run the patch stub to update the constants */
+ fixup_pv_table(&__pv_table_begin,
+ (&__pv_table_end - &__pv_table_begin) << 2);
+
+ /*
+ * Cache cleaning operations for self-modifying code
+ * We should clean the entries by MVA but running a
+ * for loop over every pv_table entry pointer would
+ * just complicate the code.
+ */
+ flush_cache_louis();
+ dsb();
+ isb();
+
+ /* remap level 1 table */
+ for (i = 0; i < PTRS_PER_PGD; pud0++, i++) {
+ set_pud(pud0,
+ __pud(__pa(pmd0) | PMD_TYPE_TABLE | L_PGD_SWAPPER));
+ pmd0 += PTRS_PER_PMD;
+ }
+
+ /* remap pmds for kernel mapping */
+ phys = __pa(map_start) & PMD_MASK;
+ do {
+ *pmdk++ = __pmd(phys | pmdprot);
+ phys += PMD_SIZE;
+ } while (phys < map_end);
+
+ flush_cache_all();
+ cpu_switch_mm(pgd0, &init_mm);
+ cpu_set_ttbr(1, __pa(pgd0) + TTBR1_OFFSET);
+ local_flush_bp_all();
+ local_flush_tlb_all();
+}
+
+#else
+
+void __init early_paging_init(const struct machine_desc *mdesc,
+ struct proc_info_list *procinfo)
+{
+ if (mdesc->init_meminfo)
+ mdesc->init_meminfo();
+}
+
+#endif
+
/*
* paging_init() sets up the page tables, initialises the zone memory
* maps, and sets up the zero page, bad page and bad page tables.
/* needed to ensure proper operation of coherent allocations
* later, in case driver doesn't set it explicitly */
- dma_set_mask(&viodev->dev, DMA_BIT_MASK(64));
- dma_set_coherent_mask(&viodev->dev, DMA_BIT_MASK(64));
+ dma_set_mask_and_coherent(&viodev->dev, DMA_BIT_MASK(64));
}
/* register with generic device framework */
/**
* blk_queue_bounce_limit - set bounce buffer limit for queue
* @q: the request queue for the device
- * @dma_mask: the maximum address the device can handle
+ * @max_addr: the maximum address the device can handle
*
* Description:
* Different hardware can have different requirements as to what pages
* it can do I/O directly to. A low level driver can call
* blk_queue_bounce_limit to have lower memory pages allocated as bounce
- * buffers for doing I/O to pages residing above @dma_mask.
+ * buffers for doing I/O to pages residing above @max_addr.
**/
-void blk_queue_bounce_limit(struct request_queue *q, u64 dma_mask)
+void blk_queue_bounce_limit(struct request_queue *q, u64 max_addr)
{
- unsigned long b_pfn = dma_mask >> PAGE_SHIFT;
+ unsigned long b_pfn = max_addr >> PAGE_SHIFT;
int dma = 0;
q->bounce_gfp = GFP_NOIO;
See <http://csrc.nist.gov/encryption/aes/> for more information.
+config CRYPTO_AES_ARM_BS
+ tristate "Bit sliced AES using NEON instructions"
+ depends on ARM && KERNEL_MODE_NEON
+ select CRYPTO_ALGAPI
+ select CRYPTO_AES_ARM
+ select CRYPTO_ABLK_HELPER
+ help
+ Use a faster and more secure NEON based implementation of AES in CBC,
+ CTR and XTS modes
+
+ Bit sliced AES gives around 45% speedup on Cortex-A15 for CTR mode
+ and for XTS mode encryption, CBC and XTS mode decryption speedup is
+ around 25%. (CBC encryption speed is not affected by this driver.)
+ This implementation does not rely on any lookup tables so it is
+ believed to be invulnerable to cache timing attacks.
+
config CRYPTO_ANUBIS
tristate "Anubis cipher algorithm"
select CRYPTO_ALGAPI
if (!dev)
return ERR_PTR(-ENOMEM);
- dev->dma_mask = dma_mask;
dev->dev.coherent_dma_mask = dma_mask;
dev->irq[0] = irq1;
dev->irq[1] = irq2;
dev_set_name(&dev->dev, "%s", name);
dev->dev.release = amba_device_release;
dev->dev.bus = &amba_bustype;
- dev->dev.dma_mask = &dev->dma_mask;
+ dev->dev.dma_mask = &dev->dev.coherent_dma_mask;
dev->res.name = dev_name(&dev->dev);
}
amba_device_initialize(dev, dev->dev.init_name);
dev->dev.init_name = NULL;
- if (!dev->dev.coherent_dma_mask && dev->dma_mask)
- dev_warn(&dev->dev, "coherent dma mask is unset\n");
-
return amba_device_add(dev, parent);
}
struct ata_host *host;
struct ata_port *ap;
struct ixp4xx_pata_data *data = dev_get_platdata(&pdev->dev);
+ int ret;
cs0 = platform_get_resource(pdev, IORESOURCE_MEM, 0);
cs1 = platform_get_resource(pdev, IORESOURCE_MEM, 1);
return -ENOMEM;
/* acquire resources and fill host */
- pdev->dev.coherent_dma_mask = DMA_BIT_MASK(32);
+ ret = dma_set_coherent_mask(&pdev->dev, DMA_BIT_MASK(32));
+ if (ret)
+ return ret;
data->cs0 = devm_ioremap(&pdev->dev, cs0->start, 0x1000);
data->cs1 = devm_ioremap(&pdev->dev, cs1->start, 0x1000);
}
cf_port->c0 = ap->ioaddr.ctl_addr;
- pdev->dev.coherent_dma_mask = DMA_BIT_MASK(64);
- pdev->dev.dma_mask = &pdev->dev.coherent_dma_mask;
+ rv = dma_coerce_mask_and_coherent(&pdev->dev, DMA_BIT_MASK(64));
+ if (rv)
+ return ret;
ata_port_desc(ap, "cmd %p ctl %p", base, ap->ioaddr.ctl_addr);
if (pci_request_selected_regions(pdev, bars, "nvme"))
goto disable_pci;
- if (!dma_set_mask(&pdev->dev, DMA_BIT_MASK(64)))
- dma_set_coherent_mask(&pdev->dev, DMA_BIT_MASK(64));
- else if (!dma_set_mask(&pdev->dev, DMA_BIT_MASK(32)))
- dma_set_coherent_mask(&pdev->dev, DMA_BIT_MASK(32));
- else
- goto disable_pci;
+ if (dma_set_mask_and_coherent(&pdev->dev, DMA_BIT_MASK(64)) &&
+ dma_set_mask_and_coherent(&pdev->dev, DMA_BIT_MASK(32)))
+ goto disable;
pci_set_drvdata(pdev, dev);
dev->bar = ioremap(pci_resource_start(pdev, 0), 8192);
INIT_LIST_HEAD(&dev->namespaces);
dev->pci_dev = pdev;
+
result = nvme_set_instance(dev);
if (result)
goto free;
static int support_aes = 1;
-static void dev_release(struct device *dev)
-{
- return;
-}
-
#define DRIVER_NAME "ixp4xx_crypto"
-static struct platform_device pseudo_dev = {
- .name = DRIVER_NAME,
- .id = 0,
- .num_resources = 0,
- .dev = {
- .coherent_dma_mask = DMA_BIT_MASK(32),
- .release = dev_release,
- }
-};
-static struct device *dev = &pseudo_dev.dev;
+static struct platform_device *pdev;
static inline dma_addr_t crypt_virt2phys(struct crypt_ctl *virt)
{
static int setup_crypt_desc(void)
{
+ struct device *dev = &pdev->dev;
BUILD_BUG_ON(sizeof(struct crypt_ctl) != 64);
crypt_virt = dma_alloc_coherent(dev,
NPE_QLEN * sizeof(struct crypt_ctl),
static void one_packet(dma_addr_t phys)
{
+ struct device *dev = &pdev->dev;
struct crypt_ctl *crypt;
struct ixp_ctx *ctx;
int failed;
tasklet_schedule(&crypto_done_tasklet);
}
-static int init_ixp_crypto(void)
+static int init_ixp_crypto(struct device *dev)
{
int ret = -ENODEV;
u32 msg[2] = { 0, 0 };
return ret;
}
-static void release_ixp_crypto(void)
+static void release_ixp_crypto(struct device *dev)
{
qmgr_disable_irq(RECV_QID);
tasklet_kill(&crypto_done_tasklet);
enum dma_data_direction src_direction = DMA_BIDIRECTIONAL;
struct ablk_ctx *req_ctx = ablkcipher_request_ctx(req);
struct buffer_desc src_hook;
+ struct device *dev = &pdev->dev;
gfp_t flags = req->base.flags & CRYPTO_TFM_REQ_MAY_SLEEP ?
GFP_KERNEL : GFP_ATOMIC;
unsigned int cryptlen;
struct buffer_desc *buf, src_hook;
struct aead_ctx *req_ctx = aead_request_ctx(req);
+ struct device *dev = &pdev->dev;
gfp_t flags = req->base.flags & CRYPTO_TFM_REQ_MAY_SLEEP ?
GFP_KERNEL : GFP_ATOMIC;
} };
#define IXP_POSTFIX "-ixp4xx"
+
+static const struct platform_device_info ixp_dev_info __initdata = {
+ .name = DRIVER_NAME,
+ .id = 0,
+ .dma_mask = DMA_BIT_MASK(32),
+};
+
static int __init ixp_module_init(void)
{
int num = ARRAY_SIZE(ixp4xx_algos);
- int i,err ;
+ int i, err ;
- if (platform_device_register(&pseudo_dev))
- return -ENODEV;
+ pdev = platform_device_register_full(&ixp_dev_info);
+ if (IS_ERR(pdev))
+ return PTR_ERR(pdev);
+
+ dev = &pdev->dev;
spin_lock_init(&desc_lock);
spin_lock_init(&emerg_lock);
- err = init_ixp_crypto();
+ err = init_ixp_crypto(&pdev->dev);
if (err) {
- platform_device_unregister(&pseudo_dev);
+ platform_device_unregister(pdev);
return err;
}
for (i=0; i< num; i++) {
if (ixp4xx_algos[i].registered)
crypto_unregister_alg(&ixp4xx_algos[i].crypto);
}
- release_ixp_crypto();
- platform_device_unregister(&pseudo_dev);
+ release_ixp_crypto(&pdev->dev);
+ platform_device_unregister(pdev);
}
module_init(ixp_module_init);
if (ret)
return ret;
+ /* Ensure that we can do DMA */
+ ret = dma_set_mask_and_coherent(&adev->dev, DMA_BIT_MASK(32));
+ if (ret)
+ goto out_no_pl08x;
+
/* Create the driver state holder */
pl08x = kzalloc(sizeof(*pl08x), GFP_KERNEL);
if (!pl08x) {
if (IS_ERR(chip->regs))
return PTR_ERR(chip->regs);
- /* Apply default dma_mask if needed */
- if (!dev->dma_mask) {
- dev->dma_mask = &dev->coherent_dma_mask;
- dev->coherent_dma_mask = DMA_BIT_MASK(32);
- }
+ err = dma_coerce_mask_and_coherent(&pdev->dev, DMA_BIT_MASK(32));
+ if (err)
+ return err;
pdata = dev_get_platdata(dev);
if (!pdata)
struct edma_cc *ecc;
int ret;
+ ret = dma_set_mask_and_coherent(&pdev->dev, DMA_BIT_MASK(32));
+ if (ret)
+ return ret;
+
ecc = devm_kzalloc(&pdev->dev, sizeof(*ecc), GFP_KERNEL);
if (!ecc) {
dev_err(&pdev->dev, "Can't allocate controller\n");
static const struct platform_device_info edma_dev_info0 = {
.name = "edma-dma-engine",
.id = 0,
+ .dma_mask = DMA_BIT_MASK(32),
};
static const struct platform_device_info edma_dev_info1 = {
.name = "edma-dma-engine",
.id = 1,
+ .dma_mask = DMA_BIT_MASK(32),
};
static int edma_init(void)
ret = PTR_ERR(pdev0);
goto out;
}
- pdev0->dev.dma_mask = &pdev0->dev.coherent_dma_mask;
- pdev0->dev.coherent_dma_mask = DMA_BIT_MASK(32);
}
if (EDMA_CTLRS == 2) {
platform_device_unregister(pdev0);
ret = PTR_ERR(pdev1);
}
- pdev1->dev.dma_mask = &pdev1->dev.coherent_dma_mask;
- pdev1->dev.coherent_dma_mask = DMA_BIT_MASK(32);
}
out:
return -EINVAL;
}
+ ret = dma_coerce_mask_and_coherent(&pdev->dev, DMA_BIT_MASK(32));
+ if (ret)
+ return ret;
+
sdma = kzalloc(sizeof(*sdma), GFP_KERNEL);
if (!sdma)
return -ENOMEM;
pdat = dev_get_platdata(&adev->dev);
+ ret = dma_set_mask_and_coherent(&adev->dev, DMA_BIT_MASK(32));
+ if (ret)
+ return ret;
+
/* Allocate a new DMAC and its Channels */
pdmac = devm_kzalloc(&adev->dev, sizeof(*pdmac), GFP_KERNEL);
if (!pdmac) {
host_control_action = HC_ACTION_NONE;
host_control_smi_type = HC_SMITYPE_NONE;
+ dcdbas_pdev = dev;
+
/*
* BIOS SMI calls require buffer addresses be in 32-bit address space.
* This is done by setting the DMA mask below.
*/
- dcdbas_pdev->dev.coherent_dma_mask = DMA_BIT_MASK(32);
- dcdbas_pdev->dev.dma_mask = &dcdbas_pdev->dev.coherent_dma_mask;
+ error = dma_set_coherent_mask(&dcdbas_pdev->dev, DMA_BIT_MASK(32));
+ if (error)
+ return error;
error = sysfs_create_group(&dev->dev.kobj, &dcdbas_attr_group);
if (error)
.remove = dcdbas_remove,
};
+static const struct platform_device_info dcdbas_dev_info __initdata = {
+ .name = DRIVER_NAME,
+ .id = -1,
+ .dma_mask = DMA_BIT_MASK(32),
+};
+
+static struct platform_device *dcdbas_pdev_reg;
+
/**
* dcdbas_init: initialize driver
*/
if (error)
return error;
- dcdbas_pdev = platform_device_alloc(DRIVER_NAME, -1);
- if (!dcdbas_pdev) {
- error = -ENOMEM;
+ dcdbas_pdev_reg = platform_device_register_full(&dcdbas_dev_info);
+ if (IS_ERR(dcdbas_pdev_reg)) {
+ error = PTR_ERR(dcdbas_pdev_reg);
goto err_unregister_driver;
}
- error = platform_device_add(dcdbas_pdev);
- if (error)
- goto err_free_device;
-
return 0;
- err_free_device:
- platform_device_put(dcdbas_pdev);
err_unregister_driver:
platform_driver_unregister(&dcdbas_driver);
return error;
* all sysfs attributes belonging to this module have been
* released.
*/
- smi_data_buf_free();
- platform_device_unregister(dcdbas_pdev);
+ if (dcdbas_pdev)
+ smi_data_buf_free();
+ platform_device_unregister(dcdbas_pdev_reg);
platform_driver_unregister(&dcdbas_driver);
}
static struct kobject *gsmi_kobj;
static struct efivars efivars;
+static const struct platform_device_info gsmi_dev_info = {
+ .name = "gsmi",
+ .id = -1,
+ /* SMI callbacks require 32bit addresses */
+ .dma_mask = DMA_BIT_MASK(32),
+};
+
static __init int gsmi_init(void)
{
unsigned long flags;
gsmi_dev.smi_cmd = acpi_gbl_FADT.smi_command;
/* register device */
- gsmi_dev.pdev = platform_device_register_simple("gsmi", -1, NULL, 0);
+ gsmi_dev.pdev = platform_device_register_full(&gsmi_dev_info);
if (IS_ERR(gsmi_dev.pdev)) {
printk(KERN_ERR "gsmi: unable to register platform device\n");
return PTR_ERR(gsmi_dev.pdev);
/* SMI access needs to be serialized */
spin_lock_init(&gsmi_dev.lock);
- /* SMI callbacks require 32bit addresses */
- gsmi_dev.pdev->dev.coherent_dma_mask = DMA_BIT_MASK(32);
- gsmi_dev.pdev->dev.dma_mask =
- &gsmi_dev.pdev->dev.coherent_dma_mask;
ret = -ENOMEM;
gsmi_dev.dma_pool = dma_pool_create("gsmi", &gsmi_dev.pdev->dev,
GSMI_BUF_SIZE, GSMI_BUF_ALIGN, 0);
#include <linux/gpio.h>
#include <linux/init.h>
#include <linux/module.h>
-
+#include <linux/io.h>
#include <mach/hardware.h>
#include <mach/irqs.h>
static int exynos_drm_platform_probe(struct platform_device *pdev)
{
- pdev->dev.coherent_dma_mask = DMA_BIT_MASK(32);
+ int ret;
+
+ ret = dma_set_coherent_mask(&pdev->dev, DMA_BIT_MASK(32));
+ if (ret)
+ return ret;
return drm_platform_init(&exynos_drm_driver, pdev);
}
}
/* set dma mask for device */
- /* NOTE: this is a workaround for the hwmod not initializing properly */
- dev->dev.coherent_dma_mask = DMA_BIT_MASK(32);
+ ret = dma_set_coherent_mask(&dev->dev, DMA_BIT_MASK(32));
+ if (ret)
+ goto fail;
omap_dmm->dummy_pa = page_to_phys(omap_dmm->dummy_page);
if (cpu >= NR_GIC_CPU_IF || cpu >= nr_cpu_ids)
return -EINVAL;
+ raw_spin_lock(&irq_controller_lock);
mask = 0xff << shift;
bit = gic_cpu_map[cpu] << shift;
-
- raw_spin_lock(&irq_controller_lock);
val = readl_relaxed(reg) & ~mask;
writel_relaxed(val | bit, reg);
raw_spin_unlock(&irq_controller_lock);
void gic_raise_softirq(const struct cpumask *mask, unsigned int irq)
{
int cpu;
- unsigned long map = 0;
+ unsigned long flags, map = 0;
+
+ raw_spin_lock_irqsave(&irq_controller_lock, flags);
/* Convert our logical CPU mask into a physical one. */
for_each_cpu(cpu, mask)
/* this always happens on GIC0 */
writel_relaxed(map << 16 | irq, gic_data_dist_base(&gic_data[0]) + GIC_DIST_SOFTINT);
+
+ raw_spin_unlock_irqrestore(&irq_controller_lock, flags);
+}
+#endif
+
+#ifdef CONFIG_BL_SWITCHER
+/*
+ * gic_send_sgi - send a SGI directly to given CPU interface number
+ *
+ * cpu_id: the ID for the destination CPU interface
+ * irq: the IPI number to send a SGI for
+ */
+void gic_send_sgi(unsigned int cpu_id, unsigned int irq)
+{
+ BUG_ON(cpu_id >= NR_GIC_CPU_IF);
+ cpu_id = 1 << cpu_id;
+ /* this always happens on GIC0 */
+ writel_relaxed((cpu_id << 16) | irq, gic_data_dist_base(&gic_data[0]) + GIC_DIST_SOFTINT);
+}
+
+/*
+ * gic_get_cpu_id - get the CPU interface ID for the specified CPU
+ *
+ * @cpu: the logical CPU number to get the GIC ID for.
+ *
+ * Return the CPU interface ID for the given logical CPU number,
+ * or -1 if the CPU number is too large or the interface ID is
+ * unknown (more than one bit set).
+ */
+int gic_get_cpu_id(unsigned int cpu)
+{
+ unsigned int cpu_bit;
+
+ if (cpu >= NR_GIC_CPU_IF)
+ return -1;
+ cpu_bit = gic_cpu_map[cpu];
+ if (cpu_bit & (cpu_bit - 1))
+ return -1;
+ return __ffs(cpu_bit);
}
+
+/*
+ * gic_migrate_target - migrate IRQs to another CPU interface
+ *
+ * @new_cpu_id: the CPU target ID to migrate IRQs to
+ *
+ * Migrate all peripheral interrupts with a target matching the current CPU
+ * to the interface corresponding to @new_cpu_id. The CPU interface mapping
+ * is also updated. Targets to other CPU interfaces are unchanged.
+ * This must be called with IRQs locally disabled.
+ */
+void gic_migrate_target(unsigned int new_cpu_id)
+{
+ unsigned int cur_cpu_id, gic_irqs, gic_nr = 0;
+ void __iomem *dist_base;
+ int i, ror_val, cpu = smp_processor_id();
+ u32 val, cur_target_mask, active_mask;
+
+ if (gic_nr >= MAX_GIC_NR)
+ BUG();
+
+ dist_base = gic_data_dist_base(&gic_data[gic_nr]);
+ if (!dist_base)
+ return;
+ gic_irqs = gic_data[gic_nr].gic_irqs;
+
+ cur_cpu_id = __ffs(gic_cpu_map[cpu]);
+ cur_target_mask = 0x01010101 << cur_cpu_id;
+ ror_val = (cur_cpu_id - new_cpu_id) & 31;
+
+ raw_spin_lock(&irq_controller_lock);
+
+ /* Update the target interface for this logical CPU */
+ gic_cpu_map[cpu] = 1 << new_cpu_id;
+
+ /*
+ * Find all the peripheral interrupts targetting the current
+ * CPU interface and migrate them to the new CPU interface.
+ * We skip DIST_TARGET 0 to 7 as they are read-only.
+ */
+ for (i = 8; i < DIV_ROUND_UP(gic_irqs, 4); i++) {
+ val = readl_relaxed(dist_base + GIC_DIST_TARGET + i * 4);
+ active_mask = val & cur_target_mask;
+ if (active_mask) {
+ val &= ~active_mask;
+ val |= ror32(active_mask, ror_val);
+ writel_relaxed(val, dist_base + GIC_DIST_TARGET + i*4);
+ }
+ }
+
+ raw_spin_unlock(&irq_controller_lock);
+
+ /*
+ * Now let's migrate and clear any potential SGIs that might be
+ * pending for us (cur_cpu_id). Since GIC_DIST_SGI_PENDING_SET
+ * is a banked register, we can only forward the SGI using
+ * GIC_DIST_SOFTINT. The original SGI source is lost but Linux
+ * doesn't use that information anyway.
+ *
+ * For the same reason we do not adjust SGI source information
+ * for previously sent SGIs by us to other CPUs either.
+ */
+ for (i = 0; i < 16; i += 4) {
+ int j;
+ val = readl_relaxed(dist_base + GIC_DIST_SGI_PENDING_SET + i);
+ if (!val)
+ continue;
+ writel_relaxed(val, dist_base + GIC_DIST_SGI_PENDING_CLEAR + i);
+ for (j = i; j < i + 4; j++) {
+ if (val & 0xff)
+ writel_relaxed((1 << (new_cpu_id + 16)) | j,
+ dist_base + GIC_DIST_SOFTINT);
+ val >>= 8;
+ }
+ }
+}
+
+/*
+ * gic_get_sgir_physaddr - get the physical address for the SGI register
+ *
+ * REturn the physical address of the SGI register to be used
+ * by some early assembly code when the kernel is not yet available.
+ */
+static unsigned long gic_dist_physaddr;
+
+unsigned long gic_get_sgir_physaddr(void)
+{
+ if (!gic_dist_physaddr)
+ return 0;
+ return gic_dist_physaddr + GIC_DIST_SOFTINT;
+}
+
+void __init gic_init_physaddr(struct device_node *node)
+{
+ struct resource res;
+ if (of_address_to_resource(node, 0, &res) == 0) {
+ gic_dist_physaddr = res.start;
+ pr_info("GIC physical location is %#lx\n", gic_dist_physaddr);
+ }
+}
+
+#else
+#define gic_init_physaddr(node) do { } while (0)
#endif
static int gic_irq_domain_map(struct irq_domain *d, unsigned int irq,
percpu_offset = 0;
gic_init_bases(gic_cnt, -1, dist_base, cpu_base, percpu_offset, node);
+ if (!gic_cnt)
+ gic_init_physaddr(node);
if (parent) {
irq = irq_of_parse_and_map(node, 0);
isp->pdata = pdata;
isp->ref_count = 0;
- isp->raw_dmamask = DMA_BIT_MASK(32);
- isp->dev->dma_mask = &isp->raw_dmamask;
- isp->dev->coherent_dma_mask = DMA_BIT_MASK(32);
+ ret = dma_coerce_mask_and_coherent(isp->dev, DMA_BIT_MASK(32));
+ if (ret)
+ return ret;
platform_set_drvdata(pdev, isp);
* @mmio_base_phys: Array with physical L4 bus addresses for ISP register
* regions.
* @mmio_size: Array with ISP register regions size in bytes.
- * @raw_dmamask: Raw DMA mask
* @stat_lock: Spinlock for handling statistics
* @isp_mutex: Mutex for serializing requests to ISP.
* @crashed: Bitmask of crashed entities (indexed by entity ID)
unsigned long mmio_base_phys[OMAP3_ISP_IOMEM_LAST];
resource_size_t mmio_size[OMAP3_ISP_IOMEM_LAST];
- u64 raw_dmamask;
-
/* ISP Obj */
spinlock_t stat_lock; /* common lock for statistic drivers */
struct mutex isp_mutex; /* For handling ref_count field */
#include <linux/freezer.h>
#include <linux/kthread.h>
#include <linux/scatterlist.h>
+#include <linux/dma-mapping.h>
#include <linux/mmc/card.h>
#include <linux/mmc/host.h>
struct mmc_queue_req *mqrq_prev = &mq->mqrq[1];
if (mmc_dev(host)->dma_mask && *mmc_dev(host)->dma_mask)
- limit = *mmc_dev(host)->dma_mask;
+ limit = dma_max_pfn(mmc_dev(host)) << PAGE_SHIFT;
mq->card = card;
mq->queue = blk_init_queue(mmc_request_fn, lock);
* @signal_direction: input/out direction of bus signals can be indicated
* @pwrreg_clkgate: MMCIPOWER register must be used to gate the clock
* @busy_detect: true if busy detection on dat0 is supported
+ * @pwrreg_nopower: bits in MMCIPOWER don't controls ext. power supply
*/
struct variant_data {
unsigned int clkreg;
bool signal_direction;
bool pwrreg_clkgate;
bool busy_detect;
+ bool pwrreg_nopower;
};
static struct variant_data variant_arm = {
.pwrreg_powerup = MCI_PWR_ON,
.signal_direction = true,
.pwrreg_clkgate = true,
+ .pwrreg_nopower = true,
};
static struct variant_data variant_nomadik = {
.pwrreg_powerup = MCI_PWR_ON,
.signal_direction = true,
.pwrreg_clkgate = true,
+ .pwrreg_nopower = true,
};
static struct variant_data variant_ux500 = {
.signal_direction = true,
.pwrreg_clkgate = true,
.busy_detect = true,
+ .pwrreg_nopower = true,
};
static struct variant_data variant_ux500v2 = {
.signal_direction = true,
.pwrreg_clkgate = true,
.busy_detect = true,
+ .pwrreg_nopower = true,
};
static int mmci_card_busy(struct mmc_host *mmc)
return 0;
}
+static void mmci_reg_delay(struct mmci_host *host)
+{
+ /*
+ * According to the spec, at least three feedback clock cycles
+ * of max 52 MHz must pass between two writes to the MMCICLOCK reg.
+ * Three MCLK clock cycles must pass between two MMCIPOWER reg writes.
+ * Worst delay time during card init is at 100 kHz => 30 us.
+ * Worst delay time when up and running is at 25 MHz => 120 ns.
+ */
+ if (host->cclk < 25000000)
+ udelay(30);
+ else
+ ndelay(120);
+}
+
/*
* This must be called with host->lock held
*/
mmci_set_clkreg(host, ios->clock);
mmci_write_pwrreg(host, pwr);
+ mmci_reg_delay(host);
spin_unlock_irqrestore(&host->lock, flags);
mmc->f_max = min(host->mclk, fmax);
dev_dbg(mmc_dev(mmc), "clocking block at %u Hz\n", mmc->f_max);
- host->pinctrl = devm_pinctrl_get(&dev->dev);
- if (IS_ERR(host->pinctrl)) {
- ret = PTR_ERR(host->pinctrl);
- goto clk_disable;
- }
-
- host->pins_default = pinctrl_lookup_state(host->pinctrl,
- PINCTRL_STATE_DEFAULT);
-
- /* enable pins to be muxed in and configured */
- if (!IS_ERR(host->pins_default)) {
- ret = pinctrl_select_state(host->pinctrl, host->pins_default);
- if (ret)
- dev_warn(&dev->dev, "could not set default pins\n");
- } else
- dev_warn(&dev->dev, "could not get default pinstate\n");
-
/* Get regulators and the supported OCR mask */
mmc_regulator_get_supply(mmc);
if (!mmc->ocr_avail)
#endif
#ifdef CONFIG_PM_RUNTIME
+static void mmci_save(struct mmci_host *host)
+{
+ unsigned long flags;
+
+ if (host->variant->pwrreg_nopower) {
+ spin_lock_irqsave(&host->lock, flags);
+
+ writel(0, host->base + MMCIMASK0);
+ writel(0, host->base + MMCIDATACTRL);
+ writel(0, host->base + MMCIPOWER);
+ writel(0, host->base + MMCICLOCK);
+ mmci_reg_delay(host);
+
+ spin_unlock_irqrestore(&host->lock, flags);
+ }
+
+}
+
+static void mmci_restore(struct mmci_host *host)
+{
+ unsigned long flags;
+
+ if (host->variant->pwrreg_nopower) {
+ spin_lock_irqsave(&host->lock, flags);
+
+ writel(host->clk_reg, host->base + MMCICLOCK);
+ writel(host->datactrl_reg, host->base + MMCIDATACTRL);
+ writel(host->pwr_reg, host->base + MMCIPOWER);
+ writel(MCI_IRQENABLE, host->base + MMCIMASK0);
+ mmci_reg_delay(host);
+
+ spin_unlock_irqrestore(&host->lock, flags);
+ }
+}
+
static int mmci_runtime_suspend(struct device *dev)
{
struct amba_device *adev = to_amba_device(dev);
if (mmc) {
struct mmci_host *host = mmc_priv(mmc);
+ pinctrl_pm_select_sleep_state(dev);
+ mmci_save(host);
clk_disable_unprepare(host->clk);
}
if (mmc) {
struct mmci_host *host = mmc_priv(mmc);
clk_prepare_enable(host->clk);
+ mmci_restore(host);
+ pinctrl_pm_select_default_state(dev);
}
return 0;
struct sg_mapping_iter sg_miter;
unsigned int size;
- /* pinctrl handles */
- struct pinctrl *pinctrl;
- struct pinctrl_state *pins_default;
-
#ifdef CONFIG_DMA_ENGINE
/* DMA stuff */
struct dma_chan *dma_current;
dma_mask = DMA_BIT_MASK(32);
}
- dev->dma_mask = &dev->coherent_dma_mask;
- dev->coherent_dma_mask = dma_mask;
+ err = dma_coerce_mask_and_coherent(dev, dma_mask);
+ if (err)
+ goto err_free;
}
if (c->slot) {
goto err_out_free_dev;
}
- if (dma_set_mask(sdev->dma_dev, DMA_BIT_MASK(30)) ||
- dma_set_coherent_mask(sdev->dma_dev, DMA_BIT_MASK(30))) {
+ if (dma_set_mask_and_coherent(sdev->dma_dev, DMA_BIT_MASK(30))) {
dev_err(sdev->dev,
"Required 30BIT DMA mask unsupported by the system\n");
goto err_out_powerdown;
{
struct device *dev = &bp->pdev->dev;
- if (dma_set_mask(dev, DMA_BIT_MASK(64)) == 0) {
- if (dma_set_coherent_mask(dev, DMA_BIT_MASK(64)) != 0) {
- dev_err(dev, "dma_set_coherent_mask failed, aborting\n");
- return -EIO;
- }
- } else if (dma_set_mask(dev, DMA_BIT_MASK(32)) != 0) {
+ if (dma_set_mask_and_coherent(dev, DMA_BIT_MASK(64)) != 0 &&
+ dma_set_mask_and_coherent(dev, DMA_BIT_MASK(32)) != 0) {
dev_err(dev, "System does not support DMA, aborting\n");
return -EIO;
}
err = pci_request_regions(pdev, BNAD_NAME);
if (err)
goto disable_device;
- if (!dma_set_mask(&pdev->dev, DMA_BIT_MASK(64)) &&
- !dma_set_coherent_mask(&pdev->dev, DMA_BIT_MASK(64))) {
+ if (!dma_set_mask_and_coherent(&pdev->dev, DMA_BIT_MASK(64))) {
*using_dac = true;
} else {
- err = dma_set_mask(&pdev->dev, DMA_BIT_MASK(32));
- if (err) {
- err = dma_set_coherent_mask(&pdev->dev,
- DMA_BIT_MASK(32));
- if (err)
- goto release_regions;
- }
+ err = dma_set_mask_and_coherent(&pdev->dev, DMA_BIT_MASK(32));
+ if (err)
+ goto release_regions;
*using_dac = false;
}
pci_set_master(pdev);
adapter->netdev = netdev;
SET_NETDEV_DEV(netdev, &pdev->dev);
- status = dma_set_mask(&pdev->dev, DMA_BIT_MASK(64));
+ status = dma_set_mask_and_coherent(&pdev->dev, DMA_BIT_MASK(64));
if (!status) {
- status = dma_set_coherent_mask(&pdev->dev, DMA_BIT_MASK(64));
- if (status < 0) {
- dev_err(&pdev->dev, "dma_set_coherent_mask failed\n");
- goto free_netdev;
- }
netdev->features |= NETIF_F_HIGHDMA;
} else {
- status = dma_set_mask(&pdev->dev, DMA_BIT_MASK(32));
- if (!status)
- status = dma_set_coherent_mask(&pdev->dev,
- DMA_BIT_MASK(32));
+ status = dma_set_mask_and_coherent(&pdev->dev, DMA_BIT_MASK(32));
if (status) {
dev_err(&pdev->dev, "Could not set PCI DMA Mask\n");
goto free_netdev;
*/
pci_using_dac = 0;
if ((hw->bus_type == e1000_bus_type_pcix) &&
- !dma_set_mask(&pdev->dev, DMA_BIT_MASK(64))) {
- /* according to DMA-API-HOWTO, coherent calls will always
- * succeed if the set call did
- */
- dma_set_coherent_mask(&pdev->dev, DMA_BIT_MASK(64));
+ !dma_set_mask_and_coherent(&pdev->dev, DMA_BIT_MASK(64))) {
pci_using_dac = 1;
} else {
- err = dma_set_mask(&pdev->dev, DMA_BIT_MASK(32));
+ err = dma_set_mask_and_coherent(&pdev->dev, DMA_BIT_MASK(32));
if (err) {
pr_err("No usable DMA config, aborting\n");
goto err_dma;
}
- dma_set_coherent_mask(&pdev->dev, DMA_BIT_MASK(32));
}
netdev->netdev_ops = &e1000_netdev_ops;
return err;
pci_using_dac = 0;
- err = dma_set_mask(&pdev->dev, DMA_BIT_MASK(64));
+ err = dma_set_mask_and_coherent(&pdev->dev, DMA_BIT_MASK(64));
if (!err) {
- err = dma_set_coherent_mask(&pdev->dev, DMA_BIT_MASK(64));
- if (!err)
- pci_using_dac = 1;
+ pci_using_dac = 1;
} else {
- err = dma_set_mask(&pdev->dev, DMA_BIT_MASK(32));
+ err = dma_set_mask_and_coherent(&pdev->dev, DMA_BIT_MASK(32));
if (err) {
- err = dma_set_coherent_mask(&pdev->dev,
- DMA_BIT_MASK(32));
- if (err) {
- dev_err(&pdev->dev,
- "No usable DMA configuration, aborting\n");
- goto err_dma;
- }
+ dev_err(&pdev->dev,
+ "No usable DMA configuration, aborting\n");
+ goto err_dma;
}
}
return err;
pci_using_dac = 0;
- err = dma_set_mask(&pdev->dev, DMA_BIT_MASK(64));
+ err = dma_set_mask_and_coherent(&pdev->dev, DMA_BIT_MASK(64));
if (!err) {
- err = dma_set_coherent_mask(&pdev->dev, DMA_BIT_MASK(64));
- if (!err)
- pci_using_dac = 1;
+ pci_using_dac = 1;
} else {
- err = dma_set_mask(&pdev->dev, DMA_BIT_MASK(32));
+ err = dma_set_mask_and_coherent(&pdev->dev, DMA_BIT_MASK(32));
if (err) {
- err = dma_set_coherent_mask(&pdev->dev,
- DMA_BIT_MASK(32));
- if (err) {
- dev_err(&pdev->dev,
- "No usable DMA configuration, aborting\n");
- goto err_dma;
- }
+ dev_err(&pdev->dev,
+ "No usable DMA configuration, aborting\n");
+ goto err_dma;
}
}
return err;
pci_using_dac = 0;
- err = dma_set_mask(&pdev->dev, DMA_BIT_MASK(64));
+ err = dma_set_mask_and_coherent(&pdev->dev, DMA_BIT_MASK(64));
if (!err) {
- err = dma_set_coherent_mask(&pdev->dev, DMA_BIT_MASK(64));
- if (!err)
- pci_using_dac = 1;
+ pci_using_dac = 1;
} else {
- err = dma_set_mask(&pdev->dev, DMA_BIT_MASK(32));
+ err = dma_set_mask_and_coherent(&pdev->dev, DMA_BIT_MASK(32));
if (err) {
- err = dma_set_coherent_mask(&pdev->dev,
- DMA_BIT_MASK(32));
- if (err) {
- dev_err(&pdev->dev, "No usable DMA "
- "configuration, aborting\n");
- goto err_dma;
- }
+ dev_err(&pdev->dev, "No usable DMA "
+ "configuration, aborting\n");
+ goto err_dma;
}
}
return err;
pci_using_dac = 0;
- err = dma_set_mask(&pdev->dev, DMA_BIT_MASK(64));
+ err = dma_set_mask_and_coherent(&pdev->dev, DMA_BIT_MASK(64));
if (!err) {
- err = dma_set_coherent_mask(&pdev->dev, DMA_BIT_MASK(64));
- if (!err)
- pci_using_dac = 1;
+ pci_using_dac = 1;
} else {
- err = dma_set_mask(&pdev->dev, DMA_BIT_MASK(32));
+ err = dma_set_mask_and_coherent(&pdev->dev, DMA_BIT_MASK(32));
if (err) {
- err = dma_set_coherent_mask(&pdev->dev,
- DMA_BIT_MASK(32));
- if (err) {
- pr_err("No usable DMA configuration, aborting\n");
- goto err_dma_mask;
- }
+ pr_err("No usable DMA configuration, aborting\n");
+ goto err_dma_mask;
}
}
if (err)
return err;
- if (!dma_set_mask(&pdev->dev, DMA_BIT_MASK(64)) &&
- !dma_set_coherent_mask(&pdev->dev, DMA_BIT_MASK(64))) {
+ if (!dma_set_mask_and_coherent(&pdev->dev, DMA_BIT_MASK(64))) {
pci_using_dac = 1;
} else {
- err = dma_set_mask(&pdev->dev, DMA_BIT_MASK(32));
+ err = dma_set_mask_and_coherent(&pdev->dev, DMA_BIT_MASK(32));
if (err) {
- err = dma_set_coherent_mask(&pdev->dev,
- DMA_BIT_MASK(32));
- if (err) {
- dev_err(&pdev->dev,
- "No usable DMA configuration, aborting\n");
- goto err_dma;
- }
+ dev_err(&pdev->dev,
+ "No usable DMA configuration, aborting\n");
+ goto err_dma;
}
pci_using_dac = 0;
}
if (err)
return err;
- if (!dma_set_mask(&pdev->dev, DMA_BIT_MASK(64)) &&
- !dma_set_coherent_mask(&pdev->dev, DMA_BIT_MASK(64))) {
+ if (!dma_set_mask_and_coherent(&pdev->dev, DMA_BIT_MASK(64))) {
pci_using_dac = 1;
} else {
- err = dma_set_mask(&pdev->dev, DMA_BIT_MASK(32));
+ err = dma_set_mask_and_coherent(&pdev->dev, DMA_BIT_MASK(32));
if (err) {
- err = dma_set_coherent_mask(&pdev->dev,
- DMA_BIT_MASK(32));
- if (err) {
- dev_err(&pdev->dev, "No usable DMA "
- "configuration, aborting\n");
- goto err_dma;
- }
+ dev_err(&pdev->dev, "No usable DMA "
+ "configuration, aborting\n");
+ goto err_dma;
}
pci_using_dac = 0;
}
}
if (pldat->dma_buff_base_v == 0) {
- pldat->pdev->dev.coherent_dma_mask = 0xFFFFFFFF;
- pldat->pdev->dev.dma_mask = &pldat->pdev->dev.coherent_dma_mask;
+ ret = dma_coerce_mask_and_coherent(&pdev->dev, DMA_BIT_MASK(32));
+ if (ret)
+ goto err_out_free_irq;
+
pldat->dma_buff_size = PAGE_ALIGN(pldat->dma_buff_size);
/* Allocate a chunk of memory for the DMA ethernet buffers
p->phy_np = of_parse_phandle(pdev->dev.of_node, "phy-handle", 0);
- pdev->dev.coherent_dma_mask = DMA_BIT_MASK(64);
- pdev->dev.dma_mask = &pdev->dev.coherent_dma_mask;
+ result = dma_coerce_mask_and_coherent(&pdev->dev, DMA_BIT_MASK(64));
+ if (result)
+ goto err;
netif_carrier_off(netdev);
result = register_netdev(netdev);
*/
while (dma_mask > 0x7fffffffUL) {
if (dma_supported(&pci_dev->dev, dma_mask)) {
- rc = dma_set_mask(&pci_dev->dev, dma_mask);
+ rc = dma_set_mask_and_coherent(&pci_dev->dev, dma_mask);
if (rc == 0)
break;
}
}
netif_dbg(efx, probe, efx->net_dev,
"using DMA mask %llx\n", (unsigned long long) dma_mask);
- rc = dma_set_coherent_mask(&pci_dev->dev, dma_mask);
- if (rc) {
- /* dma_set_coherent_mask() is not *allowed* to
- * fail with a mask that dma_set_mask() accepted,
- * but just in case...
- */
- netif_err(efx, probe, efx->net_dev,
- "failed to set consistent DMA mask\n");
- goto fail2;
- }
efx->membase_phys = pci_resource_start(efx->pci_dev, EFX_MEM_BAR);
rc = pci_request_region(pci_dev, EFX_MEM_BAR, "sfc");
/* Try to set the DMA mask. If it fails, try falling back to a
* lower mask, as we can always also support a lower one. */
while (1) {
- err = dma_set_mask(dev->dev->dma_dev, mask);
- if (!err) {
- err = dma_set_coherent_mask(dev->dev->dma_dev, mask);
- if (!err)
- break;
- }
+ err = dma_set_mask_and_coherent(dev->dev->dma_dev, mask);
+ if (!err)
+ break;
if (mask == DMA_BIT_MASK(64)) {
mask = DMA_BIT_MASK(32);
fallback = true;
/* Try to set the DMA mask. If it fails, try falling back to a
* lower mask, as we can always also support a lower one. */
while (1) {
- err = dma_set_mask(dev->dev->dma_dev, mask);
- if (!err) {
- err = dma_set_coherent_mask(dev->dev->dma_dev, mask);
- if (!err)
- break;
- }
+ err = dma_set_mask_and_coherent(dev->dev->dma_dev, mask);
+ if (!err)
+ break;
if (mask == DMA_BIT_MASK(64)) {
mask = DMA_BIT_MASK(32);
fallback = true;
else
of_device_make_bus_id(&dev->dev);
- /* setup amba-specific device info */
- dev->dma_mask = ~0;
-
/* Allow the HW Peripheral ID to be overridden */
prop = of_get_property(node, "arm,primecell-periphid", NULL);
if (prop)
struct resource *ECR_res = NULL;
struct resource *EPP_res = NULL;
struct platform_device *pdev = NULL;
+ int ret;
if (!dev) {
/* We need a physical device to attach to, but none was
return NULL;
dev = &pdev->dev;
- dev->coherent_dma_mask = DMA_BIT_MASK(24);
- dev->dma_mask = &dev->coherent_dma_mask;
+ ret = dma_coerce_mask_and_coherent(dev, DMA_BIT_MASK(24));
+ if (ret) {
+ dev_err(dev, "Unable to set coherent dma mask: disabling DMA\n");
+ dma = PARPORT_DMA_NONE;
+ }
}
ops = kmalloc(sizeof(struct parport_operations), GFP_KERNEL);
host_dev = scsi_get_device(shost);
if (host_dev && host_dev->dma_mask)
- bounce_limit = *host_dev->dma_mask;
+ bounce_limit = dma_max_pfn(host_dev) << PAGE_SHIFT;
return bounce_limit;
}
*/
if (!dev->dev.dma_mask)
dev->dev.dma_mask = &dev->dev.coherent_dma_mask;
- if (!dev->dev.coherent_dma_mask)
- dev->dev.coherent_dma_mask = DMA_BIT_MASK(32);
+ retval = dma_set_coherent_mask(&dev->dev, DMA_BIT_MASK(32));
+ if (retval)
+ return retval;
irq = platform_get_irq(dev, 0);
if (irq < 0) {
pci_set_master(pdev);
/* Check the DMA addressing support of this device */
- if (!dma_set_mask(&pdev->dev, DMA_BIT_MASK(64))) {
- rc = dma_set_coherent_mask(&pdev->dev, DMA_BIT_MASK(64));
- if (rc < 0) {
- dev_err(&pdev->dev,
- "Unable to obtain 64 bit DMA for consistent allocations\n");
- goto err_release_res;
- }
- } else if (!dma_set_mask(&pdev->dev, DMA_BIT_MASK(32))) {
- rc = dma_set_coherent_mask(&pdev->dev, DMA_BIT_MASK(32));
- if (rc < 0) {
- dev_err(&pdev->dev,
- "Unable to obtain 32 bit DMA for consistent allocations\n");
- goto err_release_res;
- }
- } else {
+ if (dma_set_mask_and_coherent(&pdev->dev, DMA_BIT_MASK(64)) &&
+ dma_set_mask_and_coherent(&pdev->dev, DMA_BIT_MASK(32))) {
dev_err(&pdev->dev, "No usable DMA addressing method\n");
rc = -EIO;
goto err_release_res;
static int imx_drm_platform_probe(struct platform_device *pdev)
{
+ int ret;
+
+ ret = dma_set_coherent_mask(&pdev->dev, DMA_BIT_MASK(32));
+ if (ret)
+ return ret;
+
imx_drm_device->dev = &pdev->dev;
return drm_platform_init(&imx_drm_driver, pdev);
goto err_pdev;
}
- imx_drm_pdev->dev.coherent_dma_mask = DMA_BIT_MASK(32),
-
ret = platform_driver_register(&imx_drm_pdrv);
if (ret)
goto err_pdrv;
if (!pdata)
return -EINVAL;
- pdev->dev.coherent_dma_mask = DMA_BIT_MASK(32);
+ ret = dma_set_coherent_mask(&pdev->dev, DMA_BIT_MASK(32));
+ if (ret)
+ return ret;
ipu_crtc = devm_kzalloc(&pdev->dev, sizeof(*ipu_crtc), GFP_KERNEL);
if (!ipu_crtc)
int err;
struct dt3155_priv *pd;
- err = dma_set_mask(&pdev->dev, DMA_BIT_MASK(32));
- if (err)
- return -ENODEV;
- err = dma_set_coherent_mask(&pdev->dev, DMA_BIT_MASK(32));
+ err = dma_set_mask_and_coherent(&pdev->dev, DMA_BIT_MASK(32));
if (err)
return -ENODEV;
pd = kzalloc(sizeof(*pd), GFP_KERNEL);
pdata.phy = data->phy;
- if (!pdev->dev.dma_mask)
- pdev->dev.dma_mask = &pdev->dev.coherent_dma_mask;
- if (!pdev->dev.coherent_dma_mask)
- pdev->dev.coherent_dma_mask = DMA_BIT_MASK(32);
+ ret = dma_coerce_mask_and_coherent(&pdev->dev, DMA_BIT_MASK(32));
+ if (ret)
+ goto err_clk;
if (data->usbmisc_data) {
ret = imx_usbmisc_init(data->usbmisc_data);
* Since shared usb code relies on it, set it here for now.
* Once we move to full device tree support this will vanish off.
*/
- if (!dev->dma_mask)
- dev->dma_mask = &dev->coherent_dma_mask;
- if (!dev->coherent_dma_mask)
- dev->coherent_dma_mask = DMA_BIT_MASK(32);
+ ret = dma_coerce_mask_and_coherent(dev, DMA_BIT_MASK(32));
+ if (ret)
+ goto err1;
platform_set_drvdata(pdev, exynos);
udc->isp1301_i2c_client->addr);
pdev->dev.dma_mask = &lpc32xx_usbd_dmamask;
- pdev->dev.coherent_dma_mask = DMA_BIT_MASK(32);
+ retval = dma_set_coherent_mask(&pdev->dev, DMA_BIT_MASK(32));
+ if (retval)
+ goto resource_fail;
udc->board = &lpc32xx_usbddata;
/* TODO: Probably need checks here; is the core connected? */
- if (dma_set_mask(dev->dma_dev, DMA_BIT_MASK(32)) ||
- dma_set_coherent_mask(dev->dma_dev, DMA_BIT_MASK(32)))
+ if (dma_set_mask_and_coherent(dev->dma_dev, DMA_BIT_MASK(32)))
return -EOPNOTSUPP;
usb_dev = kzalloc(sizeof(struct bcma_hcd_device), GFP_KERNEL);
* Since shared usb code relies on it, set it here for now.
* Once we have dma capability bindings this can go away.
*/
- if (!pdev->dev.dma_mask)
- pdev->dev.dma_mask = &pdev->dev.coherent_dma_mask;
- if (!pdev->dev.coherent_dma_mask)
- pdev->dev.coherent_dma_mask = DMA_BIT_MASK(32);
+ retval = dma_coerce_mask_and_coherent(&pdev->dev, DMA_BIT_MASK(32));
+ if (retval)
+ goto fail_create_hcd;
hcd = usb_create_hcd(driver, &pdev->dev, dev_name(&pdev->dev));
if (!hcd) {
* We can DMA from anywhere. But the descriptors must be in
* the lower 4GB.
*/
- pdev->dev.coherent_dma_mask = DMA_BIT_MASK(32);
pdev->dev.dma_mask = &ehci_octeon_dma_mask;
+ ret = dma_set_coherent_mask(&pdev->dev, DMA_BIT_MASK(32));
+ if (ret)
+ return ret;
hcd = usb_create_hcd(&ehci_octeon_hc_driver, &pdev->dev, "octeon");
if (!hcd)
struct resource *res;
struct usb_hcd *hcd;
void __iomem *regs;
- int ret = -ENODEV;
+ int ret;
int irq;
int i;
struct omap_hcd *omap;
* Since shared usb code relies on it, set it here for now.
* Once we have dma capability bindings this can go away.
*/
- if (!dev->dma_mask)
- dev->dma_mask = &dev->coherent_dma_mask;
- if (!dev->coherent_dma_mask)
- dev->coherent_dma_mask = DMA_BIT_MASK(32);
+ ret = dma_coerce_mask_and_coherent(dev, DMA_BIT_MASK(32));
+ if (ret)
+ return ret;
+ ret = -ENODEV;
hcd = usb_create_hcd(&ehci_omap_hc_driver, dev,
dev_name(dev));
if (!hcd) {
* set. Since shared usb code relies on it, set it here for
* now. Once we have dma capability bindings this can go away.
*/
- if (!pdev->dev.dma_mask)
- pdev->dev.dma_mask = &pdev->dev.coherent_dma_mask;
- if (!pdev->dev.coherent_dma_mask)
- pdev->dev.coherent_dma_mask = DMA_BIT_MASK(32);
+ err = dma_coerce_mask_and_coherent(&pdev->dev, DMA_BIT_MASK(32));
+ if (err)
+ goto err1;
if (!request_mem_region(res->start, resource_size(res),
ehci_orion_hc_driver.description)) {
struct resource *res_mem;
struct usb_ehci_pdata *pdata;
int irq;
- int err = -ENOMEM;
+ int err;
if (usb_disabled())
return -ENODEV;
*/
if (!dev_get_platdata(&dev->dev))
dev->dev.platform_data = &ehci_platform_defaults;
- if (!dev->dev.dma_mask)
- dev->dev.dma_mask = &dev->dev.coherent_dma_mask;
- if (!dev->dev.coherent_dma_mask)
- dev->dev.coherent_dma_mask = DMA_BIT_MASK(32);
+
+ err = dma_coerce_mask_and_coherent(&dev->dev, DMA_BIT_MASK(32));
+ if (err)
+ return err;
pdata = dev_get_platdata(&dev->dev);
* Since shared usb code relies on it, set it here for now.
* Once we move to full device tree support this will vanish off.
*/
- if (!pdev->dev.dma_mask)
- pdev->dev.dma_mask = &pdev->dev.coherent_dma_mask;
- if (!pdev->dev.coherent_dma_mask)
- pdev->dev.coherent_dma_mask = DMA_BIT_MASK(32);
+ err = dma_coerce_mask_and_coherent(&pdev->dev, DMA_BIT_MASK(32));
+ if (err)
+ return err;
s5p_setup_vbus_gpio(pdev);
* Since shared usb code relies on it, set it here for now.
* Once we have dma capability bindings this can go away.
*/
- if (!pdev->dev.dma_mask)
- pdev->dev.dma_mask = &pdev->dev.coherent_dma_mask;
- if (!pdev->dev.coherent_dma_mask)
- pdev->dev.coherent_dma_mask = DMA_BIT_MASK(32);
+ retval = dma_coerce_mask_and_coherent(&pdev->dev, DMA_BIT_MASK(32));
+ if (retval)
+ goto fail;
usbh_clk = devm_clk_get(&pdev->dev, NULL);
if (IS_ERR(usbh_clk)) {
* Since shared usb code relies on it, set it here for now.
* Once we have dma capability bindings this can go away.
*/
- if (!pdev->dev.dma_mask)
- pdev->dev.dma_mask = &pdev->dev.coherent_dma_mask;
- if (!pdev->dev.coherent_dma_mask)
- pdev->dev.coherent_dma_mask = DMA_BIT_MASK(32);
+ err = dma_coerce_mask_and_coherent(&pdev->dev, DMA_BIT_MASK(32));
+ if (err)
+ return err;
hcd = usb_create_hcd(&tegra_ehci_hc_driver, &pdev->dev,
dev_name(&pdev->dev));
static int ohci_at91_of_init(struct platform_device *pdev)
{
struct device_node *np = pdev->dev.of_node;
- int i, gpio;
+ int i, gpio, ret;
enum of_gpio_flags flags;
struct at91_usbh_data *pdata;
u32 ports;
* Since shared usb code relies on it, set it here for now.
* Once we have dma capability bindings this can go away.
*/
- if (!pdev->dev.dma_mask)
- pdev->dev.dma_mask = &pdev->dev.coherent_dma_mask;
- if (!pdev->dev.coherent_dma_mask)
- pdev->dev.coherent_dma_mask = DMA_BIT_MASK(32);
+ ret = dma_coerce_mask_and_coherent(&pdev->dev, DMA_BIT_MASK(32));
+ if (ret)
+ return ret;
pdata = devm_kzalloc(&pdev->dev, sizeof(*pdata), GFP_KERNEL);
if (!pdata)
* Since shared usb code relies on it, set it here for now.
* Once we move to full device tree support this will vanish off.
*/
- if (!pdev->dev.dma_mask)
- pdev->dev.dma_mask = &pdev->dev.coherent_dma_mask;
- if (!pdev->dev.coherent_dma_mask)
- pdev->dev.coherent_dma_mask = DMA_BIT_MASK(32);
+ err = dma_coerce_mask_and_coherent(&pdev->dev, DMA_BIT_MASK(32));
+ if (err)
+ return err;
exynos_ohci = devm_kzalloc(&pdev->dev, sizeof(struct exynos_ohci_hcd),
GFP_KERNEL);
return -EPROBE_DEFER;
}
- pdev->dev.coherent_dma_mask = DMA_BIT_MASK(32);
- pdev->dev.dma_mask = &pdev->dev.coherent_dma_mask;
+ ret = dma_coerce_mask_and_coherent(&pdev->dev, DMA_BIT_MASK(32));
+ if (ret)
+ goto fail_disable;
dev_dbg(&pdev->dev, "%s: " DRIVER_DESC " (nxp)\n", hcd_name);
if (usb_disabled()) {
}
/* Ohci is a 32-bit device. */
- pdev->dev.coherent_dma_mask = DMA_BIT_MASK(32);
- pdev->dev.dma_mask = &pdev->dev.coherent_dma_mask;
+ ret = dma_coerce_mask_and_coherent(&pdev->dev, DMA_BIT_MASK(32));
+ if (ret)
+ return ret;
hcd = usb_create_hcd(&ohci_octeon_hc_driver, &pdev->dev, "octeon");
if (!hcd)
struct usb_hcd *hcd = NULL;
void __iomem *regs = NULL;
struct resource *res;
- int ret = -ENODEV;
+ int ret;
int irq;
if (usb_disabled())
* Since shared usb code relies on it, set it here for now.
* Once we have dma capability bindings this can go away.
*/
- if (!dev->dma_mask)
- dev->dma_mask = &dev->coherent_dma_mask;
- if (!dev->coherent_dma_mask)
- dev->coherent_dma_mask = DMA_BIT_MASK(32);
+ ret = dma_coerce_mask_and_coherent(dev, DMA_BIT_MASK(32));
+ if (ret)
+ goto err_io;
+ ret = -ENODEV;
hcd = usb_create_hcd(&ohci_omap3_hc_driver, dev,
dev_name(dev));
if (!hcd) {
struct device_node *np = pdev->dev.of_node;
struct pxaohci_platform_data *pdata;
u32 tmp;
+ int ret;
if (!np)
return 0;
* Since shared usb code relies on it, set it here for now.
* Once we have dma capability bindings this can go away.
*/
- if (!pdev->dev.dma_mask)
- pdev->dev.dma_mask = &pdev->dev.coherent_dma_mask;
- if (!pdev->dev.coherent_dma_mask)
- pdev->dev.coherent_dma_mask = DMA_BIT_MASK(32);
+ ret = dma_coerce_mask_and_coherent(&pdev->dev, DMA_BIT_MASK(32));
+ if (ret)
+ return ret;
pdata = devm_kzalloc(&pdev->dev, sizeof(*pdata), GFP_KERNEL);
if (!pdata)
if (usb_disabled())
return -ENODEV;
+ /*
+ * We don't call dma_set_mask_and_coherent() here because the
+ * DMA mask has already been appropraitely setup by the core
+ * SA-1111 bus code (which includes bug workarounds.)
+ */
+
hcd = usb_create_hcd(&ohci_sa1111_hc_driver, &dev->dev, "sa1111");
if (!hcd)
return -ENOMEM;
* Since shared usb code relies on it, set it here for now.
* Once we have dma capability bindings this can go away.
*/
- if (!pdev->dev.dma_mask)
- pdev->dev.dma_mask = &pdev->dev.coherent_dma_mask;
- if (!pdev->dev.coherent_dma_mask)
- pdev->dev.coherent_dma_mask = DMA_BIT_MASK(32);
+ retval = dma_coerce_mask_and_coherent(&pdev->dev, DMA_BIT_MASK(32));
+ if (retval)
+ goto fail;
usbh_clk = devm_clk_get(&pdev->dev, NULL);
if (IS_ERR(usbh_clk)) {
/* TODO: Probably need checks here; is the core connected? */
- if (dma_set_mask(dev->dma_dev, DMA_BIT_MASK(32)) ||
- dma_set_coherent_mask(dev->dma_dev, DMA_BIT_MASK(32)))
+ if (dma_set_mask_and_coherent(dev->dma_dev, DMA_BIT_MASK(32)))
return -EOPNOTSUPP;
usb_dev = kzalloc(sizeof(struct ssb_hcd_device), GFP_KERNEL);
* Since shared usb code relies on it, set it here for now.
* Once we have dma capability bindings this can go away.
*/
- if (!pdev->dev.dma_mask)
- pdev->dev.dma_mask = &pdev->dev.coherent_dma_mask;
- if (!pdev->dev.coherent_dma_mask)
- pdev->dev.coherent_dma_mask = DMA_BIT_MASK(32);
+ ret = dma_coerce_mask_and_coherent(&pdev->dev, DMA_BIT_MASK(32));
+ if (ret)
+ return ret;
hcd = usb_create_hcd(&uhci_platform_hc_driver, &pdev->dev,
pdev->name);
struct clk *phy_clk;
struct clk *clk;
};
-#define glue_to_musb(g) platform_get_drvdata(g->musb)
/*
* am35x_musb_enable - enable interrupts
.set_vbus = am35x_musb_set_vbus,
};
-static u64 am35x_dmamask = DMA_BIT_MASK(32);
+static const struct platform_device_info am35x_dev_info = {
+ .name = "musb-hdrc",
+ .id = PLATFORM_DEVID_AUTO,
+ .dma_mask = DMA_BIT_MASK(32),
+};
static int am35x_probe(struct platform_device *pdev)
{
struct musb_hdrc_platform_data *pdata = dev_get_platdata(&pdev->dev);
struct platform_device *musb;
struct am35x_glue *glue;
-
+ struct platform_device_info pinfo;
struct clk *phy_clk;
struct clk *clk;
goto err0;
}
- musb = platform_device_alloc("musb-hdrc", PLATFORM_DEVID_AUTO);
- if (!musb) {
- dev_err(&pdev->dev, "failed to allocate musb device\n");
- goto err1;
- }
-
phy_clk = clk_get(&pdev->dev, "fck");
if (IS_ERR(phy_clk)) {
dev_err(&pdev->dev, "failed to get PHY clock\n");
goto err6;
}
- musb->dev.parent = &pdev->dev;
- musb->dev.dma_mask = &am35x_dmamask;
- musb->dev.coherent_dma_mask = am35x_dmamask;
-
glue->dev = &pdev->dev;
- glue->musb = musb;
glue->phy_clk = phy_clk;
glue->clk = clk;
platform_set_drvdata(pdev, glue);
- ret = platform_device_add_resources(musb, pdev->resource,
- pdev->num_resources);
- if (ret) {
- dev_err(&pdev->dev, "failed to add resources\n");
- goto err7;
- }
-
- ret = platform_device_add_data(musb, pdata, sizeof(*pdata));
- if (ret) {
- dev_err(&pdev->dev, "failed to add platform_data\n");
- goto err7;
- }
-
- ret = platform_device_add(musb);
- if (ret) {
- dev_err(&pdev->dev, "failed to register musb device\n");
+ pinfo = am35x_dev_info;
+ pinfo.parent = &pdev->dev;
+ pinfo.res = pdev->resource;
+ pinfo.num_res = pdev->num_resources;
+ pinfo.data = pdata;
+ pinfo.size_data = sizeof(*pdata);
+
+ glue->musb = musb = platform_device_register_full(&pinfo);
+ if (IS_ERR(musb)) {
+ ret = PTR_ERR(musb);
+ dev_err(&pdev->dev, "failed to register musb device: %d\n", ret);
goto err7;
}
clk_put(phy_clk);
err3:
- platform_device_put(musb);
-
-err1:
kfree(glue);
err0:
.set_vbus = da8xx_musb_set_vbus,
};
-static u64 da8xx_dmamask = DMA_BIT_MASK(32);
+static const struct platform_device_info da8xx_dev_info = {
+ .name = "musb-hdrc",
+ .id = PLATFORM_DEVID_AUTO,
+ .dma_mask = DMA_BIT_MASK(32),
+};
static int da8xx_probe(struct platform_device *pdev)
{
struct musb_hdrc_platform_data *pdata = dev_get_platdata(&pdev->dev);
struct platform_device *musb;
struct da8xx_glue *glue;
-
+ struct platform_device_info pinfo;
struct clk *clk;
int ret = -ENOMEM;
goto err0;
}
- musb = platform_device_alloc("musb-hdrc", PLATFORM_DEVID_AUTO);
- if (!musb) {
- dev_err(&pdev->dev, "failed to allocate musb device\n");
- goto err1;
- }
-
clk = clk_get(&pdev->dev, "usb20");
if (IS_ERR(clk)) {
dev_err(&pdev->dev, "failed to get clock\n");
goto err4;
}
- musb->dev.parent = &pdev->dev;
- musb->dev.dma_mask = &da8xx_dmamask;
- musb->dev.coherent_dma_mask = da8xx_dmamask;
-
glue->dev = &pdev->dev;
- glue->musb = musb;
glue->clk = clk;
pdata->platform_ops = &da8xx_ops;
musb_resources[1].end = pdev->resource[1].end;
musb_resources[1].flags = pdev->resource[1].flags;
- ret = platform_device_add_resources(musb, musb_resources,
- ARRAY_SIZE(musb_resources));
- if (ret) {
- dev_err(&pdev->dev, "failed to add resources\n");
- goto err5;
- }
-
- ret = platform_device_add_data(musb, pdata, sizeof(*pdata));
- if (ret) {
- dev_err(&pdev->dev, "failed to add platform_data\n");
- goto err5;
- }
-
- ret = platform_device_add(musb);
- if (ret) {
- dev_err(&pdev->dev, "failed to register musb device\n");
+ pinfo = da8xx_dev_info;
+ pinfo.parent = &pdev->dev;
+ pinfo.res = musb_resources;
+ pinfo.num_res = ARRAY_SIZE(musb_resources);
+ pinfo.data = pdata;
+ pinfo.size_data = sizeof(*pdata);
+
+ glue->musb = musb = platform_device_register_full(&pinfo);
+ if (IS_ERR(musb)) {
+ ret = PTR_ERR(musb);
+ dev_err(&pdev->dev, "failed to register musb device: %d\n", ret);
goto err5;
}
clk_put(clk);
err3:
- platform_device_put(musb);
-
-err1:
kfree(glue);
err0:
.set_vbus = davinci_musb_set_vbus,
};
-static u64 davinci_dmamask = DMA_BIT_MASK(32);
+static const struct platform_device_info davinci_dev_info = {
+ .name = "musb-hdrc",
+ .id = PLATFORM_DEVID_AUTO,
+ .dma_mask = DMA_BIT_MASK(32),
+};
static int davinci_probe(struct platform_device *pdev)
{
struct musb_hdrc_platform_data *pdata = dev_get_platdata(&pdev->dev);
struct platform_device *musb;
struct davinci_glue *glue;
+ struct platform_device_info pinfo;
struct clk *clk;
int ret = -ENOMEM;
goto err0;
}
- musb = platform_device_alloc("musb-hdrc", PLATFORM_DEVID_AUTO);
- if (!musb) {
- dev_err(&pdev->dev, "failed to allocate musb device\n");
- goto err1;
- }
-
clk = clk_get(&pdev->dev, "usb");
if (IS_ERR(clk)) {
dev_err(&pdev->dev, "failed to get clock\n");
goto err4;
}
- musb->dev.parent = &pdev->dev;
- musb->dev.dma_mask = &davinci_dmamask;
- musb->dev.coherent_dma_mask = davinci_dmamask;
-
glue->dev = &pdev->dev;
- glue->musb = musb;
glue->clk = clk;
pdata->platform_ops = &davinci_ops;
musb_resources[1].end = pdev->resource[1].end;
musb_resources[1].flags = pdev->resource[1].flags;
- ret = platform_device_add_resources(musb, musb_resources,
- ARRAY_SIZE(musb_resources));
- if (ret) {
- dev_err(&pdev->dev, "failed to add resources\n");
- goto err5;
- }
-
- ret = platform_device_add_data(musb, pdata, sizeof(*pdata));
- if (ret) {
- dev_err(&pdev->dev, "failed to add platform_data\n");
- goto err5;
- }
-
- ret = platform_device_add(musb);
- if (ret) {
- dev_err(&pdev->dev, "failed to register musb device\n");
+ pinfo = davinci_dev_info;
+ pinfo.parent = &pdev->dev;
+ pinfo.res = musb_resources;
+ pinfo.num_res = ARRAY_SIZE(musb_resources);
+ pinfo.data = pdata;
+ pinfo.size_data = sizeof(*pdata);
+
+ glue->musb = musb = platform_device_register_full(&pinfo);
+ if (IS_ERR(musb)) {
+ ret = PTR_ERR(musb);
+ dev_err(&pdev->dev, "failed to register musb device: %d\n", ret);
goto err5;
}
clk_put(clk);
err3:
- platform_device_put(musb);
-
-err1:
kfree(glue);
err0:
.set_vbus = tusb_musb_set_vbus,
};
-static u64 tusb_dmamask = DMA_BIT_MASK(32);
+static const struct platform_device_info tusb_dev_info = {
+ .name = "musb-hdrc",
+ .id = PLATFORM_DEVID_AUTO,
+ .dma_mask = DMA_BIT_MASK(32),
+};
static int tusb_probe(struct platform_device *pdev)
{
struct musb_hdrc_platform_data *pdata = dev_get_platdata(&pdev->dev);
struct platform_device *musb;
struct tusb6010_glue *glue;
-
+ struct platform_device_info pinfo;
int ret = -ENOMEM;
glue = kzalloc(sizeof(*glue), GFP_KERNEL);
goto err0;
}
- musb = platform_device_alloc("musb-hdrc", PLATFORM_DEVID_AUTO);
- if (!musb) {
- dev_err(&pdev->dev, "failed to allocate musb device\n");
- goto err1;
- }
-
- musb->dev.parent = &pdev->dev;
- musb->dev.dma_mask = &tusb_dmamask;
- musb->dev.coherent_dma_mask = tusb_dmamask;
-
glue->dev = &pdev->dev;
- glue->musb = musb;
pdata->platform_ops = &tusb_ops;
musb_resources[2].end = pdev->resource[2].end;
musb_resources[2].flags = pdev->resource[2].flags;
- ret = platform_device_add_resources(musb, musb_resources,
- ARRAY_SIZE(musb_resources));
- if (ret) {
- dev_err(&pdev->dev, "failed to add resources\n");
- goto err3;
- }
-
- ret = platform_device_add_data(musb, pdata, sizeof(*pdata));
- if (ret) {
- dev_err(&pdev->dev, "failed to add platform_data\n");
- goto err3;
- }
-
- ret = platform_device_add(musb);
- if (ret) {
- dev_err(&pdev->dev, "failed to register musb device\n");
+ pinfo = tusb_dev_info;
+ pinfo.parent = &pdev->dev;
+ pinfo.res = musb_resources;
+ pinfo.num_res = ARRAY_SIZE(musb_resources);
+ pinfo.data = pdata;
+ pinfo.size_data = sizeof(*pdata);
+
+ glue->musb = musb = platform_device_register_full(&pinfo);
+ if (IS_ERR(musb)) {
+ ret = PTR_ERR(musb);
+ dev_err(&pdev->dev, "failed to register musb device: %d\n", ret);
goto err3;
}
return 0;
err3:
- platform_device_put(musb);
-
-err1:
kfree(glue);
err0:
*
* ARM PrimeCell PL110 Color LCD Controller
*/
+#include <linux/dma-mapping.h>
#include <linux/module.h>
#include <linux/kernel.h>
#include <linux/errno.h>
if (!board)
return -EINVAL;
+ ret = dma_set_mask_and_coherent(&dev->dev, DMA_BIT_MASK(32));
+ if (ret)
+ goto out;
+
ret = amba_request_regions(dev, NULL);
if (ret) {
printk(KERN_ERR "CLCD: unable to reserve regs region\n");
struct device dev;
struct resource res;
struct clk *pclk;
- u64 dma_mask;
unsigned int periphid;
unsigned int irq[AMBA_NR_IRQS];
};
struct amba_device name##_device = { \
.dev = __AMBA_DEV(busid, data, ~0ULL), \
.res = DEFINE_RES_MEM(base, SZ_4K), \
- .dma_mask = ~0ULL, \
.irq = irqs, \
.periphid = id, \
}
}
#endif
+/*
+ * Set both the DMA mask and the coherent DMA mask to the same thing.
+ * Note that we don't check the return value from dma_set_coherent_mask()
+ * as the DMA API guarantees that the coherent DMA mask can be set to
+ * the same or smaller than the streaming DMA mask.
+ */
+static inline int dma_set_mask_and_coherent(struct device *dev, u64 mask)
+{
+ int rc = dma_set_mask(dev, mask);
+ if (rc == 0)
+ dma_set_coherent_mask(dev, mask);
+ return rc;
+}
+
+/*
+ * Similar to the above, except it deals with the case where the device
+ * does not have dev->dma_mask appropriately setup.
+ */
+static inline int dma_coerce_mask_and_coherent(struct device *dev, u64 mask)
+{
+ dev->dma_mask = &dev->coherent_dma_mask;
+ return dma_set_mask_and_coherent(dev, mask);
+}
+
extern u64 dma_get_required_mask(struct device *dev);
static inline unsigned int dma_get_max_seg_size(struct device *dev)
return -EIO;
}
+#ifndef dma_max_pfn
+static inline unsigned long dma_max_pfn(struct device *dev)
+{
+ return *dev->dma_mask >> PAGE_SHIFT;
+}
+#endif
+
static inline void *dma_zalloc_coherent(struct device *dev, size_t size,
dma_addr_t *dma_handle, gfp_t flag)
{
#define GIC_DIST_TARGET 0x800
#define GIC_DIST_CONFIG 0xc00
#define GIC_DIST_SOFTINT 0xf00
+#define GIC_DIST_SGI_PENDING_CLEAR 0xf10
+#define GIC_DIST_SGI_PENDING_SET 0xf20
#define GICH_HCR 0x0
#define GICH_VTR 0x4
gic_init_bases(nr, start, dist, cpu, 0, NULL);
}
+void gic_send_sgi(unsigned int cpu_id, unsigned int irq);
+int gic_get_cpu_id(unsigned int cpu);
+void gic_migrate_target(unsigned int new_cpu_id);
+unsigned long gic_get_sgir_physaddr(void);
+
#endif /* __ASSEMBLY */
#endif
--- /dev/null
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM power
+
+#if !defined(_TRACE_POWER_CPU_MIGRATE_H) || defined(TRACE_HEADER_MULTI_READ)
+#define _TRACE_POWER_CPU_MIGRATE_H
+
+#include <linux/tracepoint.h>
+
+#define __cpu_migrate_proto \
+ TP_PROTO(u64 timestamp, \
+ u32 cpu_hwid)
+#define __cpu_migrate_args \
+ TP_ARGS(timestamp, \
+ cpu_hwid)
+
+DECLARE_EVENT_CLASS(cpu_migrate,
+
+ __cpu_migrate_proto,
+ __cpu_migrate_args,
+
+ TP_STRUCT__entry(
+ __field(u64, timestamp )
+ __field(u32, cpu_hwid )
+ ),
+
+ TP_fast_assign(
+ __entry->timestamp = timestamp;
+ __entry->cpu_hwid = cpu_hwid;
+ ),
+
+ TP_printk("timestamp=%llu cpu_hwid=0x%08lX",
+ (unsigned long long)__entry->timestamp,
+ (unsigned long)__entry->cpu_hwid
+ )
+);
+
+#define __define_cpu_migrate_event(name) \
+ DEFINE_EVENT(cpu_migrate, cpu_migrate_##name, \
+ __cpu_migrate_proto, \
+ __cpu_migrate_args \
+ )
+
+__define_cpu_migrate_event(begin);
+__define_cpu_migrate_event(finish);
+__define_cpu_migrate_event(current);
+
+#undef __define_cpu_migrate
+#undef __cpu_migrate_proto
+#undef __cpu_migrate_args
+
+/* This file can get included multiple times, TRACE_HEADER_MULTI_READ at top */
+#ifndef _PWR_CPU_MIGRATE_EVENT_AVOID_DOUBLE_DEFINING
+#define _PWR_CPU_MIGRATE_EVENT_AVOID_DOUBLE_DEFINING
+
+/*
+ * Set from_phys_cpu and to_phys_cpu to CPU_MIGRATE_ALL_CPUS to indicate
+ * a whole-cluster migration:
+ */
+#define CPU_MIGRATE_ALL_CPUS 0x80000000U
+#endif
+
+#endif /* _TRACE_POWER_CPU_MIGRATE_H */
+
+/* This part must be outside protection */
+#undef TRACE_INCLUDE_FILE
+#define TRACE_INCLUDE_FILE power_cpu_migrate
+#include <trace/define_trace.h>
*/
#include <linux/module.h>
+#include <linux/dma-mapping.h>
#include <linux/dmaengine.h>
#include <sound/core.h>
.mmap = pxa2xx_pcm_mmap,
};
-static u64 pxa2xx_pcm_dmamask = 0xffffffff;
-
int pxa2xx_pcm_new(struct snd_card *card, struct pxa2xx_pcm_client *client,
struct snd_pcm **rpcm)
{
pcm->private_data = client;
pcm->private_free = pxa2xx_pcm_free_dma_buffers;
- if (!card->dev->dma_mask)
- card->dev->dma_mask = &pxa2xx_pcm_dmamask;
- if (!card->dev->coherent_dma_mask)
- card->dev->coherent_dma_mask = 0xffffffff;
+ ret = dma_coerce_mask_and_coherent(card->dev, DMA_BIT_MASK(32));
+ if (ret)
+ goto out;
if (play) {
int stream = SNDRV_PCM_STREAM_PLAYBACK;
}
EXPORT_SYMBOL_GPL(atmel_pcm_mmap);
-static u64 atmel_pcm_dmamask = DMA_BIT_MASK(32);
-
int atmel_pcm_new(struct snd_soc_pcm_runtime *rtd)
{
struct snd_card *card = rtd->card->snd_card;
struct snd_pcm *pcm = rtd->pcm;
- int ret = 0;
+ int ret;
- if (!card->dev->dma_mask)
- card->dev->dma_mask = &atmel_pcm_dmamask;
- if (!card->dev->coherent_dma_mask)
- card->dev->coherent_dma_mask = DMA_BIT_MASK(32);
+ ret = dma_coerce_mask_and_coherent(card->dev, DMA_BIT_MASK(32));
+ if (ret)
+ return ret;
if (pcm->streams[SNDRV_PCM_STREAM_PLAYBACK].substream) {
pr_debug("atmel-pcm: allocating PCM playback DMA buffer\n");
}
}
-static u64 bf5xx_pcm_dmamask = DMA_BIT_MASK(32);
-
static int bf5xx_pcm_ac97_new(struct snd_soc_pcm_runtime *rtd)
{
struct snd_card *card = rtd->card->snd_card;
struct snd_pcm *pcm = rtd->pcm;
- int ret = 0;
+ int ret;
pr_debug("%s enter\n", __func__);
- if (!card->dev->dma_mask)
- card->dev->dma_mask = &bf5xx_pcm_dmamask;
- if (!card->dev->coherent_dma_mask)
- card->dev->coherent_dma_mask = DMA_BIT_MASK(32);
+ ret = dma_coerce_mask_and_coherent(card->dev, DMA_BIT_MASK(32));
+ if (ret)
+ return ret;
if (pcm->streams[SNDRV_PCM_STREAM_PLAYBACK].substream) {
ret = bf5xx_pcm_preallocate_dma_buffer(pcm,
.silence = bf5xx_pcm_silence,
};
-static u64 bf5xx_pcm_dmamask = DMA_BIT_MASK(32);
-
static int bf5xx_pcm_i2s_new(struct snd_soc_pcm_runtime *rtd)
{
struct snd_card *card = rtd->card->snd_card;
size_t size = bf5xx_pcm_hardware.buffer_bytes_max;
+ int ret;
pr_debug("%s enter\n", __func__);
- if (!card->dev->dma_mask)
- card->dev->dma_mask = &bf5xx_pcm_dmamask;
- if (!card->dev->coherent_dma_mask)
- card->dev->coherent_dma_mask = DMA_BIT_MASK(32);
+ ret = dma_coerce_mask_and_coherent(card->dev, DMA_BIT_MASK(32));
+ if (ret)
+ return ret;
return snd_pcm_lib_preallocate_pages_for_all(rtd->pcm,
SNDRV_DMA_TYPE_DEV, card->dev, size, size);
}
}
-static u64 davinci_pcm_dmamask = DMA_BIT_MASK(32);
-
static int davinci_pcm_new(struct snd_soc_pcm_runtime *rtd)
{
struct snd_card *card = rtd->card->snd_card;
struct snd_pcm *pcm = rtd->pcm;
int ret;
- if (!card->dev->dma_mask)
- card->dev->dma_mask = &davinci_pcm_dmamask;
- if (!card->dev->coherent_dma_mask)
- card->dev->coherent_dma_mask = DMA_BIT_MASK(32);
+ ret = dma_coerce_mask_and_coherent(card->dev, DMA_BIT_MASK(32));
+ if (ret)
+ return ret;
if (pcm->streams[SNDRV_PCM_STREAM_PLAYBACK].substream) {
ret = davinci_pcm_preallocate_dma_buffer(pcm,
{
struct snd_card *card = rtd->card->snd_card;
struct snd_pcm *pcm = rtd->pcm;
- static u64 fsl_dma_dmamask = DMA_BIT_MASK(36);
int ret;
- if (!card->dev->dma_mask)
- card->dev->dma_mask = &fsl_dma_dmamask;
-
- if (!card->dev->coherent_dma_mask)
- card->dev->coherent_dma_mask = fsl_dma_dmamask;
+ ret = dma_coerce_mask_and_coherent(card->dev, DMA_BIT_MASK(36));
+ if (ret)
+ return ret;
/* Some codecs have separate DAIs for playback and capture, so we
* should allocate a DMA buffer only for the streams that are valid.
return 0;
}
-static u64 imx_pcm_dmamask = DMA_BIT_MASK(32);
-
static int imx_pcm_new(struct snd_soc_pcm_runtime *rtd)
{
struct snd_card *card = rtd->card->snd_card;
struct snd_pcm *pcm = rtd->pcm;
- int ret = 0;
+ int ret;
+
+ ret = dma_coerce_mask_and_coherent(card->dev, DMA_BIT_MASK(32));
+ if (ret)
+ return ret;
- if (!card->dev->dma_mask)
- card->dev->dma_mask = &imx_pcm_dmamask;
- if (!card->dev->coherent_dma_mask)
- card->dev->coherent_dma_mask = DMA_BIT_MASK(32);
if (pcm->streams[SNDRV_PCM_STREAM_PLAYBACK].substream) {
ret = imx_pcm_preallocate_dma_buffer(pcm,
SNDRV_PCM_STREAM_PLAYBACK);
.hw_params = psc_dma_hw_params,
};
-static u64 psc_dma_dmamask = DMA_BIT_MASK(32);
static int psc_dma_new(struct snd_soc_pcm_runtime *rtd)
{
struct snd_card *card = rtd->card->snd_card;
struct snd_pcm *pcm = rtd->pcm;
struct psc_dma *psc_dma = snd_soc_dai_get_drvdata(rtd->cpu_dai);
size_t size = psc_dma_hardware.buffer_bytes_max;
- int rc = 0;
+ int rc;
dev_dbg(rtd->platform->dev, "psc_dma_new(card=%p, dai=%p, pcm=%p)\n",
card, dai, pcm);
- if (!card->dev->dma_mask)
- card->dev->dma_mask = &psc_dma_dmamask;
- if (!card->dev->coherent_dma_mask)
- card->dev->coherent_dma_mask = DMA_BIT_MASK(32);
+ rc = dma_coerce_mask_and_coherent(card->dev, DMA_BIT_MASK(32));
+ if (rc)
+ return rc;
if (pcm->streams[SNDRV_PCM_STREAM_PLAYBACK].substream) {
rc = snd_dma_alloc_pages(SNDRV_DMA_TYPE_DEV, pcm->card->dev,
}
}
-static u64 jz4740_pcm_dmamask = DMA_BIT_MASK(32);
-
static int jz4740_pcm_new(struct snd_soc_pcm_runtime *rtd)
{
struct snd_card *card = rtd->card->snd_card;
struct snd_pcm *pcm = rtd->pcm;
- int ret = 0;
-
- if (!card->dev->dma_mask)
- card->dev->dma_mask = &jz4740_pcm_dmamask;
+ int ret;
- if (!card->dev->coherent_dma_mask)
- card->dev->coherent_dma_mask = DMA_BIT_MASK(32);
+ ret = dma_coerce_mask_and_coherent(card->dev, DMA_BIT_MASK(32));
+ if (ret)
+ return ret;
if (pcm->streams[SNDRV_PCM_STREAM_PLAYBACK].substream) {
ret = jz4740_pcm_preallocate_dma_buffer(pcm,
.fifo_size = 0,
};
-static u64 kirkwood_dma_dmamask = DMA_BIT_MASK(32);
-
static irqreturn_t kirkwood_dma_irq(int irq, void *dev_id)
{
struct kirkwood_dma_data *priv = dev_id;
struct snd_pcm *pcm = rtd->pcm;
int ret;
- if (!card->dev->dma_mask)
- card->dev->dma_mask = &kirkwood_dma_dmamask;
- if (!card->dev->coherent_dma_mask)
- card->dev->coherent_dma_mask = DMA_BIT_MASK(32);
+ ret = dma_coerce_mask_and_coherent(card->dev, DMA_BIT_MASK(32));
+ if (ret)
+ return ret;
if (pcm->streams[SNDRV_PCM_STREAM_PLAYBACK].substream) {
ret = kirkwood_dma_preallocate_dma_buffer(pcm,
snd_pcm_lib_preallocate_free_for_all(pcm);
}
-static u64 nuc900_pcm_dmamask = DMA_BIT_MASK(32);
static int nuc900_dma_new(struct snd_soc_pcm_runtime *rtd)
{
struct snd_card *card = rtd->card->snd_card;
struct snd_pcm *pcm = rtd->pcm;
+ int ret;
- if (!card->dev->dma_mask)
- card->dev->dma_mask = &nuc900_pcm_dmamask;
- if (!card->dev->coherent_dma_mask)
- card->dev->coherent_dma_mask = DMA_BIT_MASK(32);
+ ret = dma_coerce_mask_and_coherent(card->dev, DMA_BIT_MASK(32));
+ if (ret)
+ return ret;
snd_pcm_lib_preallocate_pages_for_all(pcm, SNDRV_DMA_TYPE_DEV,
card->dev, 4 * 1024, (4 * 1024) - 1);
.mmap = omap_pcm_mmap,
};
-static u64 omap_pcm_dmamask = DMA_BIT_MASK(64);
-
static int omap_pcm_preallocate_dma_buffer(struct snd_pcm *pcm,
int stream)
{
{
struct snd_card *card = rtd->card->snd_card;
struct snd_pcm *pcm = rtd->pcm;
- int ret = 0;
+ int ret;
- if (!card->dev->dma_mask)
- card->dev->dma_mask = &omap_pcm_dmamask;
- if (!card->dev->coherent_dma_mask)
- card->dev->coherent_dma_mask = DMA_BIT_MASK(64);
+ ret = dma_coerce_mask_and_coherent(card->dev, DMA_BIT_MASK(64));
+ if (ret)
+ return ret;
if (pcm->streams[SNDRV_PCM_STREAM_PLAYBACK].substream) {
ret = omap_pcm_preallocate_dma_buffer(pcm,
.mmap = pxa2xx_pcm_mmap,
};
-static u64 pxa2xx_pcm_dmamask = DMA_BIT_MASK(32);
-
static int pxa2xx_soc_pcm_new(struct snd_soc_pcm_runtime *rtd)
{
struct snd_card *card = rtd->card->snd_card;
struct snd_pcm *pcm = rtd->pcm;
- int ret = 0;
+ int ret;
- if (!card->dev->dma_mask)
- card->dev->dma_mask = &pxa2xx_pcm_dmamask;
- if (!card->dev->coherent_dma_mask)
- card->dev->coherent_dma_mask = DMA_BIT_MASK(32);
+ ret = dma_coerce_mask_and_coherent(card->dev, DMA_BIT_MASK(32));
+ if (ret)
+ return ret;
if (pcm->streams[SNDRV_PCM_STREAM_PLAYBACK].substream) {
ret = pxa2xx_pcm_preallocate_dma_buffer(pcm,
snd_pcm_lib_preallocate_free_for_all(pcm);
}
-static u64 s6000_pcm_dmamask = DMA_BIT_MASK(32);
-
static int s6000_pcm_new(struct snd_soc_pcm_runtime *runtime)
{
struct snd_card *card = runtime->card->snd_card;
params = snd_soc_dai_get_dma_data(runtime->cpu_dai,
pcm->streams[SNDRV_PCM_STREAM_PLAYBACK].substream);
- if (!card->dev->dma_mask)
- card->dev->dma_mask = &s6000_pcm_dmamask;
- if (!card->dev->coherent_dma_mask)
- card->dev->coherent_dma_mask = DMA_BIT_MASK(32);
+ res = dma_coerce_mask_and_coherent(card->dev, DMA_BIT_MASK(32));
+ if (res)
+ return res;
if (params->dma_in) {
s6dmac_disable_chan(DMA_MASK_DMAC(params->dma_in),
}
}
-static u64 dma_mask = DMA_BIT_MASK(32);
-
static int dma_new(struct snd_soc_pcm_runtime *rtd)
{
struct snd_card *card = rtd->card->snd_card;
struct snd_pcm *pcm = rtd->pcm;
- int ret = 0;
+ int ret;
pr_debug("Entered %s\n", __func__);
- if (!card->dev->dma_mask)
- card->dev->dma_mask = &dma_mask;
- if (!card->dev->coherent_dma_mask)
- card->dev->coherent_dma_mask = DMA_BIT_MASK(32);
+ ret = dma_coerce_mask_and_coherent(card->dev, DMA_BIT_MASK(32));
+ if (ret)
+ return ret;
if (pcm->streams[SNDRV_PCM_STREAM_PLAYBACK].substream) {
ret = preallocate_dma_buffer(pcm,
return 0;
}
-static u64 idma_mask = DMA_BIT_MASK(32);
-
static int idma_new(struct snd_soc_pcm_runtime *rtd)
{
struct snd_card *card = rtd->card->snd_card;
struct snd_pcm *pcm = rtd->pcm;
- int ret = 0;
+ int ret;
- if (!card->dev->dma_mask)
- card->dev->dma_mask = &idma_mask;
- if (!card->dev->coherent_dma_mask)
- card->dev->coherent_dma_mask = DMA_BIT_MASK(32);
+ ret = dma_coerce_mask_and_coherent(card->dev, DMA_BIT_MASK(32));
+ if (ret)
+ return ret;
if (pcm->streams[SNDRV_PCM_STREAM_PLAYBACK].substream) {
ret = preallocate_idma_buffer(pcm,
PERF_HAVE_DWARF_REGS := 1
LIB_OBJS += $(OUTPUT)arch/$(ARCH)/util/dwarf-regs.o
endif
+ifndef NO_LIBUNWIND
+LIB_OBJS += $(OUTPUT)arch/$(ARCH)/util/unwind.o
+endif
--- /dev/null
+#ifndef ARCH_PERF_REGS_H
+#define ARCH_PERF_REGS_H
+
+#include <stdlib.h>
+#include "../../util/types.h"
+#include <asm/perf_regs.h>
+
+#define PERF_REGS_MASK ((1ULL << PERF_REG_ARM_MAX) - 1)
+#define PERF_REG_IP PERF_REG_ARM_PC
+#define PERF_REG_SP PERF_REG_ARM_SP
+
+static inline const char *perf_reg_name(int id)
+{
+ switch (id) {
+ case PERF_REG_ARM_R0:
+ return "r0";
+ case PERF_REG_ARM_R1:
+ return "r1";
+ case PERF_REG_ARM_R2:
+ return "r2";
+ case PERF_REG_ARM_R3:
+ return "r3";
+ case PERF_REG_ARM_R4:
+ return "r4";
+ case PERF_REG_ARM_R5:
+ return "r5";
+ case PERF_REG_ARM_R6:
+ return "r6";
+ case PERF_REG_ARM_R7:
+ return "r7";
+ case PERF_REG_ARM_R8:
+ return "r8";
+ case PERF_REG_ARM_R9:
+ return "r9";
+ case PERF_REG_ARM_R10:
+ return "r10";
+ case PERF_REG_ARM_FP:
+ return "fp";
+ case PERF_REG_ARM_IP:
+ return "ip";
+ case PERF_REG_ARM_SP:
+ return "sp";
+ case PERF_REG_ARM_LR:
+ return "lr";
+ case PERF_REG_ARM_PC:
+ return "pc";
+ default:
+ return NULL;
+ }
+
+ return NULL;
+}
+
+#endif /* ARCH_PERF_REGS_H */
--- /dev/null
+
+#include <errno.h>
+#include <libunwind.h>
+#include "perf_regs.h"
+#include "../../util/unwind.h"
+
+int unwind__arch_reg_id(int regnum)
+{
+ switch (regnum) {
+ case UNW_ARM_R0:
+ return PERF_REG_ARM_R0;
+ case UNW_ARM_R1:
+ return PERF_REG_ARM_R1;
+ case UNW_ARM_R2:
+ return PERF_REG_ARM_R2;
+ case UNW_ARM_R3:
+ return PERF_REG_ARM_R3;
+ case UNW_ARM_R4:
+ return PERF_REG_ARM_R4;
+ case UNW_ARM_R5:
+ return PERF_REG_ARM_R5;
+ case UNW_ARM_R6:
+ return PERF_REG_ARM_R6;
+ case UNW_ARM_R7:
+ return PERF_REG_ARM_R7;
+ case UNW_ARM_R8:
+ return PERF_REG_ARM_R8;
+ case UNW_ARM_R9:
+ return PERF_REG_ARM_R9;
+ case UNW_ARM_R10:
+ return PERF_REG_ARM_R10;
+ case UNW_ARM_R11:
+ return PERF_REG_ARM_FP;
+ case UNW_ARM_R12:
+ return PERF_REG_ARM_IP;
+ case UNW_ARM_R13:
+ return PERF_REG_ARM_SP;
+ case UNW_ARM_R14:
+ return PERF_REG_ARM_LR;
+ case UNW_ARM_R15:
+ return PERF_REG_ARM_PC;
+ default:
+ pr_err("unwind: invalid reg id %d\n", regnum);
+ return -EINVAL;
+ }
+
+ return -EINVAL;
+}
NO_PERF_REGS := 0
LIBUNWIND_LIBS = -lunwind -lunwind-x86_64
endif
+ifeq ($(ARCH),arm)
+ NO_PERF_REGS := 0
+ LIBUNWIND_LIBS = -lunwind -lunwind-arm
+endif
ifeq ($(NO_PERF_REGS),0)
CFLAGS += -DHAVE_PERF_REGS
endif # try-cc
endif # NO_LIBELF
-# There's only x86 (both 32 and 64) support for CFI unwind so far
-ifneq ($(ARCH),x86)
+ifeq ($(LIBUNWIND_LIBS),)
NO_LIBUNWIND := 1
endif
FLAGS_UNWIND=$(LIBUNWIND_CFLAGS) $(CFLAGS) $(LIBUNWIND_LDFLAGS) $(LDFLAGS) $(EXTLIBS) $(LIBUNWIND_LIBS)
ifneq ($(call try-cc,$(SOURCE_LIBUNWIND),$(FLAGS_UNWIND),libunwind),y)
- msg := $(warning No libunwind found, disabling post unwind support. Please install libunwind-dev[el] >= 0.99);
+ msg := $(warning No libunwind found, disabling post unwind support. Please install libunwind-dev[el] >= 1.1);
NO_LIBUNWIND := 1
endif # Libunwind support
+ifneq ($(call try-cc,$(SOURCE_LIBUNWIND_DEBUG_FRAME),$(FLAGS_UNWIND),libunwind debug_frame),y)
+ msg := $(warning No debug_frame support found in libunwind);
+CFLAGS += -DNO_LIBUNWIND_DEBUG_FRAME
+endif # debug_frame support in libunwind
endif # NO_LIBUNWIND
ifndef NO_LIBUNWIND
unw_proc_info_t *pi,
int need_unwind_info, void *arg);
-
#define dwarf_search_unwind_table UNW_OBJ(dwarf_search_unwind_table)
int main(void)
return 0;
}
endef
+
+define SOURCE_LIBUNWIND_DEBUG_FRAME
+#include <libunwind.h>
+#include <stdlib.h>
+
+extern int
+UNW_OBJ(dwarf_find_debug_frame) (int found, unw_dyn_info_t *di_debug,
+ unw_word_t ip, unw_word_t segbase,
+ const char *obj_name, unw_word_t start,
+ unw_word_t end);
+
+#define dwarf_find_debug_frame UNW_OBJ(dwarf_find_debug_frame)
+
+int main(void)
+{
+ dwarf_find_debug_frame(0, NULL, 0, 0, NULL, 0, 0);
+ return 0;
+}
+endef
+
endif
ifndef NO_BACKTRACE
#define dwarf_search_unwind_table UNW_OBJ(dwarf_search_unwind_table)
+extern int
+UNW_OBJ(dwarf_find_debug_frame) (int found, unw_dyn_info_t *di_debug,
+ unw_word_t ip,
+ unw_word_t segbase,
+ const char *obj_name, unw_word_t start,
+ unw_word_t end);
+
+#define dwarf_find_debug_frame UNW_OBJ(dwarf_find_debug_frame)
+
#define DW_EH_PE_FORMAT_MASK 0x0f /* format of the encoded value */
#define DW_EH_PE_APPL_MASK 0x70 /* how the value is to be applied */
return 0;
}
-static int read_unwind_spec(struct dso *dso, struct machine *machine,
- u64 *table_data, u64 *segbase, u64 *fde_count)
+static int read_unwind_spec_eh_frame(struct dso *dso, struct machine *machine,
+ u64 *table_data, u64 *segbase,
+ u64 *fde_count)
{
int ret = -EINVAL, fd;
u64 offset;
if (fd < 0)
return -EINVAL;
+ /* Check the .eh_frame section for unwinding info */
offset = elf_section_offset(fd, ".eh_frame_hdr");
close(fd);
table_data, segbase,
fde_count);
- /* TODO .debug_frame check if eh_frame_hdr fails */
return ret;
}
+#ifndef NO_LIBUNWIND_DEBUG_FRAME
+static int read_unwind_spec_debug_frame(struct dso *dso,
+ struct machine *machine, u64 *offset)
+{
+ int fd = dso__data_fd(dso, machine);
+
+ if (fd < 0)
+ return -EINVAL;
+
+ /* Check the .debug_frame section for unwinding info */
+ *offset = elf_section_offset(fd, ".debug_frame");
+ close(fd);
+
+ if (*offset)
+ return 0;
+
+ return -EINVAL;
+}
+#endif
+
static struct map *find_map(unw_word_t ip, struct unwind_info *ui)
{
struct addr_location al;
pr_debug("unwind: find_proc_info dso %s\n", map->dso->name);
- if (read_unwind_spec(map->dso, ui->machine,
- &table_data, &segbase, &fde_count))
- return -EINVAL;
+ /* Check the .eh_frame section for unwinding info */
+ if (!read_unwind_spec_eh_frame(map->dso, ui->machine,
+ &table_data, &segbase, &fde_count)) {
+ memset(&di, 0, sizeof(di));
+ di.format = UNW_INFO_FORMAT_REMOTE_TABLE;
+ di.start_ip = map->start;
+ di.end_ip = map->end;
+ di.u.rti.segbase = map->start + segbase;
+ di.u.rti.table_data = map->start + table_data;
+ di.u.rti.table_len = fde_count * sizeof(struct table_entry)
+ / sizeof(unw_word_t);
+ return dwarf_search_unwind_table(as, ip, &di, pi,
+ need_unwind_info, arg);
+ }
+
+#ifndef NO_LIBUNWIND_DEBUG_FRAME
+ /* Check the .debug_frame section for unwinding info */
+ if (!read_unwind_spec_debug_frame(map->dso, ui->machine, &segbase)) {
+ memset(&di, 0, sizeof(di));
+ dwarf_find_debug_frame(0, &di, ip, 0, map->dso->name,
+ map->start, map->end);
+ return dwarf_search_unwind_table(as, ip, &di, pi,
+ need_unwind_info, arg);
+ }
+#endif
- memset(&di, 0, sizeof(di));
- di.format = UNW_INFO_FORMAT_REMOTE_TABLE;
- di.start_ip = map->start;
- di.end_ip = map->end;
- di.u.rti.segbase = map->start + segbase;
- di.u.rti.table_data = map->start + table_data;
- di.u.rti.table_len = fde_count * sizeof(struct table_entry)
- / sizeof(unw_word_t);
- return dwarf_search_unwind_table(as, ip, &di, pi,
- need_unwind_info, arg);
+ return -EINVAL;
}
static int access_fpreg(unw_addr_space_t __maybe_unused as,