/*
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
 *
 * Copyright (C) IBM Corporation, 2011
 *
 * Author: Anton Blanchard <anton@au.ibm.com>
 */
#include <asm/ppc_asm.h>
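
/*
 * User-memory accesses below are wrapped in err1/err2/err3/err4
 * annotations; each one emits a fixup entry into an __ex_table section
 * so that a fault on the user address branches to the matching error
 * handler instead of oopsing the kernel.
 */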
	.section __ex_table,"a"

	.section __ex_table,"a"

	.section __ex_table,"a"

	.section __ex_table,"a"
	ld	r16,STK_REG(R16)(r1)
	ld	r15,STK_REG(R15)(r1)
	ld	r14,STK_REG(R14)(r1)

	ld	r0,STACKFRAMESIZE+16(r1)

#endif /* CONFIG_ALTIVEC */

	ld	r22,STK_REG(R22)(r1)
	ld	r21,STK_REG(R21)(r1)
	ld	r20,STK_REG(R20)(r1)
	ld	r19,STK_REG(R19)(r1)
	ld	r18,STK_REG(R18)(r1)
	ld	r17,STK_REG(R17)(r1)
	ld	r16,STK_REG(R16)(r1)
	ld	r15,STK_REG(R15)(r1)
	ld	r14,STK_REG(R14)(r1)

	addi	r1,r1,STACKFRAMESIZE

	b	__copy_tofrom_user_base
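
	/* Defer to the generic __copy_tofrom_user_base copy loop. */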
_GLOBAL(__copy_tofrom_user_power7)

	/* Get the source 8B aligned */

	stdu	r1,-STACKFRAMESIZE(r1)
	std	r14,STK_REG(R14)(r1)
	std	r15,STK_REG(R15)(r1)
	std	r16,STK_REG(R16)(r1)
	std	r17,STK_REG(R17)(r1)
	std	r18,STK_REG(R18)(r1)
	std	r19,STK_REG(R19)(r1)
	std	r20,STK_REG(R20)(r1)
	std	r21,STK_REG(R21)(r1)
	std	r22,STK_REG(R22)(r1)
	std	r0,STACKFRAMESIZE+16(r1)
	/* Now do cacheline (128B) sized loads and stores. */
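	/*
	 * One iteration of the loop moves a full 128B cacheline: sixteen
	 * doublewords loaded and stored via r0 and the nonvolatile GPRs
	 * saved above.
	 */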
err2;	std	r19,104(r3)
err2;	std	r20,112(r3)
err2;	std	r21,120(r3)

	ld	r14,STK_REG(R14)(r1)
	ld	r15,STK_REG(R15)(r1)
	ld	r16,STK_REG(R16)(r1)
	ld	r17,STK_REG(R17)(r1)
	ld	r18,STK_REG(R18)(r1)
	ld	r19,STK_REG(R19)(r1)
	ld	r20,STK_REG(R20)(r1)
	ld	r21,STK_REG(R21)(r1)
	ld	r22,STK_REG(R22)(r1)
	addi	r1,r1,STACKFRAMESIZE

	/* Up to 127B to go */

	/* Up to 63B to go */

	/* Up to 31B to go */

9:	clrldi	r5,r5,(64-4)

	/* Up to 15B to go */

err1;	lwz	r0,0(r4)	/* Less chance of a reject with word ops */
.Lunwind_stack_nonvmx_copy:
	addi	r1,r1,STACKFRAMESIZE

#ifdef CONFIG_ALTIVEC

	stdu	r1,-STACKFRAMESIZE(r1)
	bl	.enter_vmx_usercopy

	ld	r0,STACKFRAMESIZE+16(r1)
	ld	r3,STACKFRAMESIZE+48(r1)
	ld	r4,STACKFRAMESIZE+56(r1)
	ld	r5,STACKFRAMESIZE+64(r1)

	/*
	 * We prefetch both the source and destination using enhanced touch
	 * instructions. We use a stream ID of 0 for the load side and
	 * 1 for the store side.
	 */

	ori	r9,r9,1		/* stream=1 */

	srdi	r7,r5,7		/* length in cachelines, capped at 0x3FF */

1:	lis	r0,0x0E00	/* depth=7 */

	ori	r10,r7,1	/* stream=1 */

	lis	r8,0x8000	/* GO=1 */

	dcbtst	r0,r10,0b01010

	dcbt	r0,r8,0b01010	/* GO */
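
	/*
	 * dcbt/dcbtst with TH=0b01010 treat the "EA" operand as a stream
	 * descriptor (stream ID, depth, length) rather than an address;
	 * the final dcbt with GO=1 starts both programmed streams.
	 */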
	beq	cr1,.Lunwind_stack_nonvmx_copy

	/*
	 * If source and destination are not relatively aligned we use a
	 * slower permute loop.
	 */

	rldicl.	r6,r6,0,(64-4)
	bne	.Lvmx_unaligned_copy
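
	/*
	 * The rldicl. above keeps only the low four bits of r6, i.e. the
	 * relative 16B misalignment of source and destination; nonzero
	 * means the plain aligned lvx/stvx loop cannot be used.
	 */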
	/* Get the destination 16B aligned */

	/* Get the destination 128B aligned */
err3;	stvx	vr1,r3,r10
err3;	stvx	vr0,r3,r11

	std	r14,STK_REG(R14)(r1)
	std	r15,STK_REG(R15)(r1)
	std	r16,STK_REG(R16)(r1)

	/*
	 * Now do cacheline sized loads and stores. By this stage the
	 * cacheline stores are also cacheline aligned.
	 */
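	/*
	 * Each iteration below streams one full 128B cacheline as eight
	 * 16B lvx/stvx pairs.
	 */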
err4;	stvx	vr5,r3,r10
err4;	stvx	vr4,r3,r11
err4;	stvx	vr3,r3,r12
err4;	stvx	vr2,r3,r14
err4;	stvx	vr1,r3,r15
err4;	stvx	vr0,r3,r16

	ld	r14,STK_REG(R14)(r1)
	ld	r15,STK_REG(R15)(r1)
	ld	r16,STK_REG(R16)(r1)

	/* Up to 127B to go */

err3;	stvx	vr1,r3,r10
err3;	stvx	vr0,r3,r11

	/* Up to 15B to go */
11:	clrldi	r5,r5,(64-4)

15:	addi	r1,r1,STACKFRAMESIZE
	b	.exit_vmx_usercopy	/* tail call optimise */
.Lvmx_unaligned_copy:
	/* Get the destination 16B aligned */

err3;	lwz	r0,0(r4)	/* Less chance of a reject with word ops */

	/* Get the destination 128B aligned */

	lvsl	vr16,0,r4	/* Setup permute control vector */
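
	/*
	 * lvsl converts the source misalignment into a permute control
	 * vector: vperm of two consecutive aligned 16B loads through it
	 * reconstructs the unaligned source data, keeping every lvx and
	 * stvx in the loop naturally aligned.
	 */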
	vperm	vr8,vr0,vr1,vr16

	vperm	vr8,vr0,vr1,vr16

	vperm	vr9,vr1,vr0,vr16

	vperm	vr8,vr0,vr3,vr16

	vperm	vr9,vr3,vr2,vr16

	vperm	vr10,vr2,vr1,vr16

	vperm	vr11,vr1,vr0,vr16

err3;	stvx	vr10,r3,r10
err3;	stvx	vr11,r3,r11

	std	r14,STK_REG(R14)(r1)
	std	r15,STK_REG(R15)(r1)
	std	r16,STK_REG(R16)(r1)

	/*
	 * Now do cacheline sized loads and stores. By this stage the
	 * cacheline stores are also cacheline aligned.
	 */

	vperm	vr8,vr0,vr7,vr16

	vperm	vr9,vr7,vr6,vr16

	vperm	vr10,vr6,vr5,vr16

	vperm	vr11,vr5,vr4,vr16

	vperm	vr12,vr4,vr3,vr16

	vperm	vr13,vr3,vr2,vr16

	vperm	vr14,vr2,vr1,vr16

	vperm	vr15,vr1,vr0,vr16

err4;	stvx	vr10,r3,r10
err4;	stvx	vr11,r3,r11
err4;	stvx	vr12,r3,r12
err4;	stvx	vr13,r3,r14
err4;	stvx	vr14,r3,r15
err4;	stvx	vr15,r3,r16

	ld	r14,STK_REG(R14)(r1)
	ld	r15,STK_REG(R15)(r1)
	ld	r16,STK_REG(R16)(r1)

	/* Up to 127B to go */

	vperm	vr8,vr0,vr3,vr16

	vperm	vr9,vr3,vr2,vr16

	vperm	vr10,vr2,vr1,vr16

	vperm	vr11,vr1,vr0,vr16

err3;	stvx	vr10,r3,r10
err3;	stvx	vr11,r3,r11

	vperm	vr8,vr0,vr1,vr16

	vperm	vr9,vr1,vr0,vr16

	vperm	vr8,vr0,vr1,vr16

	/* Up to 15B to go */
11:	clrldi	r5,r5,(64-4)
	addi	r4,r4,-16	/* Unwind the +16 load offset */
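
	/*
	 * The permute loop keeps one 16B quadword of source lookahead in
	 * flight, hence the fix-up of r4 above before the scalar tail copy.
	 */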
err3;	lwz	r0,0(r4)	/* Less chance of a reject with word ops */

15:	addi	r1,r1,STACKFRAMESIZE
	b	.exit_vmx_usercopy	/* tail call optimise */
#endif /* CONFIG_ALTIVEC */