arch/powerpc/lib/memcpy_power7.S
/*
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
 *
 * Copyright (C) IBM Corporation, 2012
 *
 * Author: Anton Blanchard <anton@au.ibm.com>
 */
#include <asm/ppc_asm.h>

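/*
 * memcpy_power7: r3 = destination, r4 = source, r5 = length; the
 * destination is also the return value.  The incoming r3 is saved at
 * 48(r1), the first parameter save slot in the caller's frame, so it
 * can be reloaded before returning.  Copies of fewer than 16 bytes
 * branch straight to .Lshort_copy; with CONFIG_ALTIVEC, copies larger
 * than 4096 bytes take the VMX path; everything else falls through to
 * the GPR copy below.
 */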
_GLOBAL(memcpy_power7)
#ifdef CONFIG_ALTIVEC
        cmpldi  r5,16
        cmpldi  cr1,r5,4096

        std     r3,48(r1)

        blt     .Lshort_copy
        bgt     cr1,.Lvmx_copy
#else
        cmpldi  r5,16

        std     r3,48(r1)

        blt     .Lshort_copy
#endif

.Lnonvmx_copy:
        /* Get the source 8B aligned */
        neg     r6,r4
        mtocrf  0x01,r6
        clrldi  r6,r6,(64-3)

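        /*
         * CR7 now holds the low bits of the negated source address:
         * bit 3 means a byte, bit 2 a halfword and bit 1 a word must
         * be copied to reach 8B alignment.
         */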
        bf      cr7*4+3,1f
        lbz     r0,0(r4)
        addi    r4,r4,1
        stb     r0,0(r3)
        addi    r3,r3,1

1:      bf      cr7*4+2,2f
        lhz     r0,0(r4)
        addi    r4,r4,2
        sth     r0,0(r3)
        addi    r3,r3,2

2:      bf      cr7*4+1,3f
        lwz     r0,0(r4)
        addi    r4,r4,4
        stw     r0,0(r3)
        addi    r3,r3,4

3:      sub     r5,r5,r6
        cmpldi  r5,128
        blt     5f

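        /*
         * At least 128B remain: save the non-volatile GPRs (and the
         * LR) so the main loop can move a full cacheline per
         * iteration in r0, r6-r12 and r14-r21.
         */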
        mflr    r0
        stdu    r1,-STACKFRAMESIZE(r1)
        std     r14,STK_REG(R14)(r1)
        std     r15,STK_REG(R15)(r1)
        std     r16,STK_REG(R16)(r1)
        std     r17,STK_REG(R17)(r1)
        std     r18,STK_REG(R18)(r1)
        std     r19,STK_REG(R19)(r1)
        std     r20,STK_REG(R20)(r1)
        std     r21,STK_REG(R21)(r1)
        std     r22,STK_REG(R22)(r1)
        std     r0,STACKFRAMESIZE+16(r1)

        srdi    r6,r5,7
        mtctr   r6

        /* Now do cacheline (128B) sized loads and stores. */
        .align  5
4:
        ld      r0,0(r4)
        ld      r6,8(r4)
        ld      r7,16(r4)
        ld      r8,24(r4)
        ld      r9,32(r4)
        ld      r10,40(r4)
        ld      r11,48(r4)
        ld      r12,56(r4)
        ld      r14,64(r4)
        ld      r15,72(r4)
        ld      r16,80(r4)
        ld      r17,88(r4)
        ld      r18,96(r4)
        ld      r19,104(r4)
        ld      r20,112(r4)
        ld      r21,120(r4)
        addi    r4,r4,128
        std     r0,0(r3)
        std     r6,8(r3)
        std     r7,16(r3)
        std     r8,24(r3)
        std     r9,32(r3)
        std     r10,40(r3)
        std     r11,48(r3)
        std     r12,56(r3)
        std     r14,64(r3)
        std     r15,72(r3)
        std     r16,80(r3)
        std     r17,88(r3)
        std     r18,96(r3)
        std     r19,104(r3)
        std     r20,112(r3)
        std     r21,120(r3)
        addi    r3,r3,128
        bdnz    4b

        clrldi  r5,r5,(64-7)

        ld      r14,STK_REG(R14)(r1)
        ld      r15,STK_REG(R15)(r1)
        ld      r16,STK_REG(R16)(r1)
        ld      r17,STK_REG(R17)(r1)
        ld      r18,STK_REG(R18)(r1)
        ld      r19,STK_REG(R19)(r1)
        ld      r20,STK_REG(R20)(r1)
        ld      r21,STK_REG(R21)(r1)
        ld      r22,STK_REG(R22)(r1)
        addi    r1,r1,STACKFRAMESIZE

        /* Up to 127B to go */
5:      srdi    r6,r5,4
        mtocrf  0x01,r6

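        /*
         * CR7 now holds the remaining length in 16B units: bit 1
         * selects a 64B block, bit 2 a 32B block and bit 3 a 16B
         * block.
         */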
6:      bf      cr7*4+1,7f
        ld      r0,0(r4)
        ld      r6,8(r4)
        ld      r7,16(r4)
        ld      r8,24(r4)
        ld      r9,32(r4)
        ld      r10,40(r4)
        ld      r11,48(r4)
        ld      r12,56(r4)
        addi    r4,r4,64
        std     r0,0(r3)
        std     r6,8(r3)
        std     r7,16(r3)
        std     r8,24(r3)
        std     r9,32(r3)
        std     r10,40(r3)
        std     r11,48(r3)
        std     r12,56(r3)
        addi    r3,r3,64

        /* Up to 63B to go */
7:      bf      cr7*4+2,8f
        ld      r0,0(r4)
        ld      r6,8(r4)
        ld      r7,16(r4)
        ld      r8,24(r4)
        addi    r4,r4,32
        std     r0,0(r3)
        std     r6,8(r3)
        std     r7,16(r3)
        std     r8,24(r3)
        addi    r3,r3,32

        /* Up to 31B to go */
8:      bf      cr7*4+3,9f
        ld      r0,0(r4)
        ld      r6,8(r4)
        addi    r4,r4,16
        std     r0,0(r3)
        std     r6,8(r3)
        addi    r3,r3,16

9:      clrldi  r5,r5,(64-4)

        /* Up to 15B to go */
.Lshort_copy:
        mtocrf  0x01,r5
        bf      cr7*4+0,12f
        lwz     r0,0(r4)        /* Less chance of a reject with word ops */
        lwz     r6,4(r4)
        addi    r4,r4,8
        stw     r0,0(r3)
        stw     r6,4(r3)
        addi    r3,r3,8

12:     bf      cr7*4+1,13f
        lwz     r0,0(r4)
        addi    r4,r4,4
        stw     r0,0(r3)
        addi    r3,r3,4

13:     bf      cr7*4+2,14f
        lhz     r0,0(r4)
        addi    r4,r4,2
        sth     r0,0(r3)
        addi    r3,r3,2

14:     bf      cr7*4+3,15f
        lbz     r0,0(r4)
        stb     r0,0(r3)

15:     ld      r3,48(r1)
        blr

.Lunwind_stack_nonvmx_copy:
        addi    r1,r1,STACKFRAMESIZE
        b       .Lnonvmx_copy

#ifdef CONFIG_ALTIVEC
.Lvmx_copy:
        mflr    r0
        std     r4,56(r1)
        std     r5,64(r1)
        std     r0,16(r1)
        stdu    r1,-STACKFRAMESIZE(r1)
        bl      .enter_vmx_copy
        cmpwi   cr1,r3,0
        ld      r0,STACKFRAMESIZE+16(r1)
        ld      r3,STACKFRAMESIZE+48(r1)
        ld      r4,STACKFRAMESIZE+56(r1)
        ld      r5,STACKFRAMESIZE+64(r1)
        mtlr    r0

        /*
         * We prefetch both the source and destination using enhanced touch
         * instructions. We use a stream ID of 0 for the load side and
         * 1 for the store side.
         */
        clrrdi  r6,r4,7
        clrrdi  r9,r3,7
        ori     r9,r9,1         /* stream=1 */

        srdi    r7,r5,7         /* length in cachelines, capped at 0x3FF */
        cmpldi  r7,0x3FF
        ble     1f
        li      r7,0x3FF
1:      lis     r0,0x0E00       /* depth=7 */
        sldi    r7,r7,7
        or      r7,r7,r0
        ori     r10,r7,1        /* stream=1 */

        lis     r8,0x8000       /* GO=1 */
        clrldi  r8,r8,32

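        /*
         * r6/r9 carry the stream start addresses (source/destination),
         * r7/r10 the stream attributes (length in cachelines, depth,
         * stream ID) and r8 the GO bit that starts the nominated
         * streams.
         */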
.machine push
.machine "power4"
        dcbt    r0,r6,0b01000
        dcbt    r0,r7,0b01010
        dcbtst  r0,r9,0b01000
        dcbtst  r0,r10,0b01010
        eieio
        dcbt    r0,r8,0b01010   /* GO */
.machine pop

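        /*
         * If enter_vmx_copy() returned 0 (tested via cr1 above), VMX
         * cannot be used here; unwind the stack frame and fall back
         * to the GPR copy.
         */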
        beq     cr1,.Lunwind_stack_nonvmx_copy

        /*
         * If source and destination are not relatively aligned we use a
         * slower permute loop.
         */
        xor     r6,r4,r3
        rldicl. r6,r6,0,(64-4)
        bne     .Lvmx_unaligned_copy

        /* Get the destination 16B aligned */
        neg     r6,r3
        mtocrf  0x01,r6
        clrldi  r6,r6,(64-4)

        bf      cr7*4+3,1f
        lbz     r0,0(r4)
        addi    r4,r4,1
        stb     r0,0(r3)
        addi    r3,r3,1

1:      bf      cr7*4+2,2f
        lhz     r0,0(r4)
        addi    r4,r4,2
        sth     r0,0(r3)
        addi    r3,r3,2

2:      bf      cr7*4+1,3f
        lwz     r0,0(r4)
        addi    r4,r4,4
        stw     r0,0(r3)
        addi    r3,r3,4

3:      bf      cr7*4+0,4f
        ld      r0,0(r4)
        addi    r4,r4,8
        std     r0,0(r3)
        addi    r3,r3,8

4:      sub     r5,r5,r6

        /* Get the destination 128B aligned */
        neg     r6,r3
        srdi    r7,r6,4
        mtocrf  0x01,r7
        clrldi  r6,r6,(64-7)

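        /*
         * r7 is the number of 16B quadwords needed to reach 128B
         * alignment; CR7 bit 3 selects one quadword, bit 2 two and
         * bit 1 four.
         */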
        li      r9,16
        li      r10,32
        li      r11,48

        bf      cr7*4+3,5f
        lvx     vr1,r0,r4
        addi    r4,r4,16
        stvx    vr1,r0,r3
        addi    r3,r3,16

5:      bf      cr7*4+2,6f
        lvx     vr1,r0,r4
        lvx     vr0,r4,r9
        addi    r4,r4,32
        stvx    vr1,r0,r3
        stvx    vr0,r3,r9
        addi    r3,r3,32

6:      bf      cr7*4+1,7f
        lvx     vr3,r0,r4
        lvx     vr2,r4,r9
        lvx     vr1,r4,r10
        lvx     vr0,r4,r11
        addi    r4,r4,64
        stvx    vr3,r0,r3
        stvx    vr2,r3,r9
        stvx    vr1,r3,r10
        stvx    vr0,r3,r11
        addi    r3,r3,64

7:      sub     r5,r5,r6
        srdi    r6,r5,7

        std     r14,STK_REG(R14)(r1)
        std     r15,STK_REG(R15)(r1)
        std     r16,STK_REG(R16)(r1)

        li      r12,64
        li      r14,80
        li      r15,96
        li      r16,112

        mtctr   r6

        /*
         * Now do cacheline sized loads and stores. By this stage the
         * cacheline stores are also cacheline aligned.
         */
        .align  5
8:
        lvx     vr7,r0,r4
        lvx     vr6,r4,r9
        lvx     vr5,r4,r10
        lvx     vr4,r4,r11
        lvx     vr3,r4,r12
        lvx     vr2,r4,r14
        lvx     vr1,r4,r15
        lvx     vr0,r4,r16
        addi    r4,r4,128
        stvx    vr7,r0,r3
        stvx    vr6,r3,r9
        stvx    vr5,r3,r10
        stvx    vr4,r3,r11
        stvx    vr3,r3,r12
        stvx    vr2,r3,r14
        stvx    vr1,r3,r15
        stvx    vr0,r3,r16
        addi    r3,r3,128
        bdnz    8b

        ld      r14,STK_REG(R14)(r1)
        ld      r15,STK_REG(R15)(r1)
        ld      r16,STK_REG(R16)(r1)

        /* Up to 127B to go */
        clrldi  r5,r5,(64-7)
        srdi    r6,r5,4
        mtocrf  0x01,r6

        bf      cr7*4+1,9f
        lvx     vr3,r0,r4
        lvx     vr2,r4,r9
        lvx     vr1,r4,r10
        lvx     vr0,r4,r11
        addi    r4,r4,64
        stvx    vr3,r0,r3
        stvx    vr2,r3,r9
        stvx    vr1,r3,r10
        stvx    vr0,r3,r11
        addi    r3,r3,64

9:      bf      cr7*4+2,10f
        lvx     vr1,r0,r4
        lvx     vr0,r4,r9
        addi    r4,r4,32
        stvx    vr1,r0,r3
        stvx    vr0,r3,r9
        addi    r3,r3,32

10:     bf      cr7*4+3,11f
        lvx     vr1,r0,r4
        addi    r4,r4,16
        stvx    vr1,r0,r3
        addi    r3,r3,16

        /* Up to 15B to go */
11:     clrldi  r5,r5,(64-4)
        mtocrf  0x01,r5
        bf      cr7*4+0,12f
        ld      r0,0(r4)
        addi    r4,r4,8
        std     r0,0(r3)
        addi    r3,r3,8

12:     bf      cr7*4+1,13f
        lwz     r0,0(r4)
        addi    r4,r4,4
        stw     r0,0(r3)
        addi    r3,r3,4

13:     bf      cr7*4+2,14f
        lhz     r0,0(r4)
        addi    r4,r4,2
        sth     r0,0(r3)
        addi    r3,r3,2

14:     bf      cr7*4+3,15f
        lbz     r0,0(r4)
        stb     r0,0(r3)

15:     addi    r1,r1,STACKFRAMESIZE
        ld      r3,48(r1)
        b       .exit_vmx_copy          /* tail call optimise */

.Lvmx_unaligned_copy:
        /* Get the destination 16B aligned */
        neg     r6,r3
        mtocrf  0x01,r6
        clrldi  r6,r6,(64-4)

        bf      cr7*4+3,1f
        lbz     r0,0(r4)
        addi    r4,r4,1
        stb     r0,0(r3)
        addi    r3,r3,1

1:      bf      cr7*4+2,2f
        lhz     r0,0(r4)
        addi    r4,r4,2
        sth     r0,0(r3)
        addi    r3,r3,2

2:      bf      cr7*4+1,3f
        lwz     r0,0(r4)
        addi    r4,r4,4
        stw     r0,0(r3)
        addi    r3,r3,4

3:      bf      cr7*4+0,4f
        lwz     r0,0(r4)        /* Less chance of a reject with word ops */
        lwz     r7,4(r4)
        addi    r4,r4,8
        stw     r0,0(r3)
        stw     r7,4(r3)
        addi    r3,r3,8

4:      sub     r5,r5,r6

        /* Get the destination 128B aligned */
        neg     r6,r3
        srdi    r7,r6,4
        mtocrf  0x01,r7
        clrldi  r6,r6,(64-7)

        li      r9,16
        li      r10,32
        li      r11,48

        lvsl    vr16,0,r4       /* Setup permute control vector */
        lvx     vr0,0,r4
        addi    r4,r4,16

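        /*
         * vr0 holds the previously loaded (aligned) quadword.  Each
         * step loads the next aligned quadword and vperm combines the
         * pair through vr16 to produce the shifted source data; the
         * last quadword loaded is carried forward in vr0 for the
         * following step.
         */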
        bf      cr7*4+3,5f
        lvx     vr1,r0,r4
        vperm   vr8,vr0,vr1,vr16
        addi    r4,r4,16
        stvx    vr8,r0,r3
        addi    r3,r3,16
        vor     vr0,vr1,vr1

5:      bf      cr7*4+2,6f
        lvx     vr1,r0,r4
        vperm   vr8,vr0,vr1,vr16
        lvx     vr0,r4,r9
        vperm   vr9,vr1,vr0,vr16
        addi    r4,r4,32
        stvx    vr8,r0,r3
        stvx    vr9,r3,r9
        addi    r3,r3,32

6:      bf      cr7*4+1,7f
        lvx     vr3,r0,r4
        vperm   vr8,vr0,vr3,vr16
        lvx     vr2,r4,r9
        vperm   vr9,vr3,vr2,vr16
        lvx     vr1,r4,r10
        vperm   vr10,vr2,vr1,vr16
        lvx     vr0,r4,r11
        vperm   vr11,vr1,vr0,vr16
        addi    r4,r4,64
        stvx    vr8,r0,r3
        stvx    vr9,r3,r9
        stvx    vr10,r3,r10
        stvx    vr11,r3,r11
        addi    r3,r3,64

7:      sub     r5,r5,r6
        srdi    r6,r5,7

        std     r14,STK_REG(R14)(r1)
        std     r15,STK_REG(R15)(r1)
        std     r16,STK_REG(R16)(r1)

        li      r12,64
        li      r14,80
        li      r15,96
        li      r16,112

        mtctr   r6

        /*
         * Now do cacheline sized loads and stores. By this stage the
         * cacheline stores are also cacheline aligned.
         */
        .align  5
8:
        lvx     vr7,r0,r4
        vperm   vr8,vr0,vr7,vr16
        lvx     vr6,r4,r9
        vperm   vr9,vr7,vr6,vr16
        lvx     vr5,r4,r10
        vperm   vr10,vr6,vr5,vr16
        lvx     vr4,r4,r11
        vperm   vr11,vr5,vr4,vr16
        lvx     vr3,r4,r12
        vperm   vr12,vr4,vr3,vr16
        lvx     vr2,r4,r14
        vperm   vr13,vr3,vr2,vr16
        lvx     vr1,r4,r15
        vperm   vr14,vr2,vr1,vr16
        lvx     vr0,r4,r16
        vperm   vr15,vr1,vr0,vr16
        addi    r4,r4,128
        stvx    vr8,r0,r3
        stvx    vr9,r3,r9
        stvx    vr10,r3,r10
        stvx    vr11,r3,r11
        stvx    vr12,r3,r12
        stvx    vr13,r3,r14
        stvx    vr14,r3,r15
        stvx    vr15,r3,r16
        addi    r3,r3,128
        bdnz    8b

        ld      r14,STK_REG(R14)(r1)
        ld      r15,STK_REG(R15)(r1)
        ld      r16,STK_REG(R16)(r1)

        /* Up to 127B to go */
        clrldi  r5,r5,(64-7)
        srdi    r6,r5,4
        mtocrf  0x01,r6

        bf      cr7*4+1,9f
        lvx     vr3,r0,r4
        vperm   vr8,vr0,vr3,vr16
        lvx     vr2,r4,r9
        vperm   vr9,vr3,vr2,vr16
        lvx     vr1,r4,r10
        vperm   vr10,vr2,vr1,vr16
        lvx     vr0,r4,r11
        vperm   vr11,vr1,vr0,vr16
        addi    r4,r4,64
        stvx    vr8,r0,r3
        stvx    vr9,r3,r9
        stvx    vr10,r3,r10
        stvx    vr11,r3,r11
        addi    r3,r3,64

9:      bf      cr7*4+2,10f
        lvx     vr1,r0,r4
        vperm   vr8,vr0,vr1,vr16
        lvx     vr0,r4,r9
        vperm   vr9,vr1,vr0,vr16
        addi    r4,r4,32
        stvx    vr8,r0,r3
        stvx    vr9,r3,r9
        addi    r3,r3,32

10:     bf      cr7*4+3,11f
        lvx     vr1,r0,r4
        vperm   vr8,vr0,vr1,vr16
        addi    r4,r4,16
        stvx    vr8,r0,r3
        addi    r3,r3,16

        /* Up to 15B to go */
11:     clrldi  r5,r5,(64-4)
        addi    r4,r4,-16       /* Unwind the +16 load offset */
        mtocrf  0x01,r5
        bf      cr7*4+0,12f
        lwz     r0,0(r4)        /* Less chance of a reject with word ops */
        lwz     r6,4(r4)
        addi    r4,r4,8
        stw     r0,0(r3)
        stw     r6,4(r3)
        addi    r3,r3,8

12:     bf      cr7*4+1,13f
        lwz     r0,0(r4)
        addi    r4,r4,4
        stw     r0,0(r3)
        addi    r3,r3,4

13:     bf      cr7*4+2,14f
        lhz     r0,0(r4)
        addi    r4,r4,2
        sth     r0,0(r3)
        addi    r3,r3,2

14:     bf      cr7*4+3,15f
        lbz     r0,0(r4)
        stb     r0,0(r3)

15:     addi    r1,r1,STACKFRAMESIZE
        ld      r3,48(r1)
        b       .exit_vmx_copy          /* tail call optimise */
#endif /* CONFIG_ALTIVEC */