3 The following code is the file support.s from the FreeBSD 2.6
4 distribution for i386. I included the entire file so you can
5 pick and choose as you like and you can pick up the license.
6 There's a generic bcopy that does overlapping, uses rep movs
7 in the largest chunk possible, etc. That might do the trick.
8 There are a few macros around, but hopefully you can decipher them.
22 * Copyright (c) 1993 The Regents of the University of California.
23 * All rights reserved.
25 * Redistribution and use in source and binary forms, with or without
26 * modification, are permitted provided that the following conditions
28 * 1. Redistributions of source code must retain the above copyright
29 * notice, this list of conditions and the following disclaimer.
30 * 2. Redistributions in binary form must reproduce the above copyright
31 * notice, this list of conditions and the following disclaimer in the
32 * documentation and/or other materials provided with the distribution.
33 * 3. All advertising materials mentioning features or use of this software
34 * must display the following acknowledgement:
35 * This product includes software developed by the University of
36 * California, Berkeley and its contributors.
37 * 4. Neither the name of the University nor the names of its contributors
38 * may be used to endorse or promote products derived from this software
39 * without specific prior written permission.
41 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
42 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
43 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
44 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
45 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
46 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
47 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
48 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
49 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
50 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
   * SUCH DAMAGE.
59 #include <machine/asmacros.h>
60 #include <machine/cputypes.h>
61 #include <machine/pmap.h>
62 #include <machine/specialreg.h>
66 #define KDSEL 0x10 /* kernel data selector */
79 .globl _copyout_vector
81 .long _generic_copyout
82 .globl _ovbcopy_vector
85 #if defined(I586_CPU) && NNPX > 0
95 * void bzero(void *buf, u_int len)
114 #if defined(I486_CPU)
120 * do 64 byte chunks first
122 * XXX this is probably over-unrolled at least for DX2's
179 * a jump table seems to be faster than a loop or more range reductions
181 * XXX need a const section for non-text
216 #if defined(I586_CPU) && NNPX > 0
222 * The FPU register method is twice as fast as the integer register
223 * method unless the target is in the L1 cache and we pre-allocate a
224 * cache line for it (then the integer register method is 4-5 times
225 * faster). However, we never pre-allocate cache lines, since that
226 * would make the integer method 25% or more slower for the common
227 * case when the target isn't in either the L1 cache or the L2 cache.
228 * Thus we normally use the FPU register method unless the overhead
229 * would be too large.
231 cmpl $256,%ecx /* empirical; clts, fninit, smsw cost a lot */
235 * The FPU registers may belong to an application or to fastmove()
236 * or to another invocation of bcopy() or ourself in a higher level
237 * interrupt or trap handler. Preserving the registers is
238 * complicated since we avoid it if possible at all levels. We
239 * want to localize the complications even when that increases them.
240 * Here the extra work involves preserving CR0_TS in TS.
241 * `npxproc != NULL' is supposed to be the condition that all the
242 * FPU resources belong to an application, but npxproc and CR0_TS
243 * aren't set atomically enough for this condition to work in
244 * interrupt handlers.
246 * Case 1: FPU registers belong to the application: we must preserve
247 * the registers if we use them, so we only use the FPU register
248 * method if the target size is large enough to amortize the extra
249 * overhead for preserving them. CR0_TS must be preserved although
250 * it is very likely to end up as set.
252 * Case 2: FPU registers belong to fastmove(): fastmove() currently
253 * makes the registers look like they belong to an application so
254 * that cpu_switch() and savectx() don't have to know about it, so
255 * this case reduces to case 1.
257 * Case 3: FPU registers belong to the kernel: don't use the FPU
258 * register method. This case is unlikely, and supporting it would
259 * be more complicated and might take too much stack.
261 * Case 4: FPU registers don't belong to anyone: the FPU registers
262 * don't need to be preserved, so we always use the FPU register
263 * method. CR0_TS must be preserved although it is very likely to
264 * always end up as clear.
268 cmpl $256+184,%ecx /* empirical; not quite 2*108 more */
270 sarb $1,kernel_fpu_lock
279 sarb $1,kernel_fpu_lock
283 fninit /* XXX should avoid needing this */
288 * Align to an 8 byte boundary (misalignment in the main loop would
289 * cost a factor of >= 2). Avoid jumps (at little cost if it is
290 * already aligned) by always zeroing 8 bytes and using the part up
291 * to the _next_ alignment position.
294 addl %edx,%ecx /* part of %ecx -= new_%edx - %edx */
300 * Similarly align `len' to a multiple of 8.
307 * This wouldn't be any faster if it were unrolled, since the loop
308 * control instructions are much faster than the fstl and/or done
309 * in parallel with it so their overhead is insignificant.
311 fpureg_i586_bzero_loop:
316 jae fpureg_i586_bzero_loop
323 movb $0xfe,kernel_fpu_lock
329 movb $0xfe,kernel_fpu_lock
334 * `rep stos' seems to be the best method in practice for small
335 * counts. Fancy methods usually take too long to start up due
336 * to cache and BTB misses.
356 #endif /* I586_CPU && NNPX > 0 */
358 /* fillw(pat, base, cnt) */
379 cmpl %ecx,%eax /* overlapping && src < dst? */
381 cld /* nope, copy forwards */
390 addl %ecx,%edi /* copy backwards. */
411 * generic_bcopy(src, dst, cnt)
412 * ws@tools.de (Wolfgang Solfrank, TooLs GmbH) +49-228-985800
423 cmpl %ecx,%eax /* overlapping && src < dst? */
426 shrl $2,%ecx /* copy by 32-bit words */
427 cld /* nope, copy forwards */
431 andl $3,%ecx /* any bytes left? */
440 addl %ecx,%edi /* copy backwards */
444 andl $3,%ecx /* any fractional bytes? */
448 movl 20(%esp),%ecx /* copy remainder by 32-bit words */
459 #if defined(I586_CPU) && NNPX > 0
469 cmpl %ecx,%eax /* overlapping && src < dst? */
475 sarb $1,kernel_fpu_lock
488 fninit /* XXX should avoid needing this */
493 #define DCACHE_SIZE 8192
494 cmpl $(DCACHE_SIZE-512)/2,%ecx
496 movl $(DCACHE_SIZE-512)/2,%ecx
500 jb 5f /* XXX should prefetch if %ecx >= 32 */
521 large_i586_bcopy_loop:
542 jae large_i586_bcopy_loop
554 movb $0xfe,kernel_fpu_lock
557 * This is a duplicate of the main part of generic_bcopy. See the comments
558 * there. Jumping into generic_bcopy would cost a whole 0-1 cycles and
559 * would mess up high resolution profiling.
595 #endif /* I586_CPU && NNPX > 0 */
598 * Note: memcpy does not support overlapping copies
607 shrl $2,%ecx /* copy by 32-bit words */
608 cld /* nope, copy forwards */
612 andl $3,%ecx /* any bytes left? */
620 /*****************************************************************************/
621 /* copyout and fubyte family */
622 /*****************************************************************************/
624 * Access user memory from inside the kernel. These routines and possibly
625 * the math- and DOS emulators should be the only places that do this.
627 * We have to access the memory with user's permissions, so use a segment
628 * selector with RPL 3. For writes to user space we have to additionally
629 * check the PTE for write permission, because the 386 does not check
630 * write permissions when we are executing with EPL 0. The 486 does check
631 * this if the WP bit is set in CR0, so we can use a simpler version here.
633 * These routines set curpcb->onfault for the time they execute. When a
634 * protection violation occurs inside the functions, the trap handler
635 * returns to *curpcb->onfault instead of the function.
638 /* copyout(from_kernel, to_user, len) */
643 ENTRY(generic_copyout)
645 movl $copyout_fault,PCB_ONFAULT(%eax)
652 testl %ebx,%ebx /* anything to do? */
656 * Check explicitly for non-user addresses. If 486 write protection
657 * is being used, this check is essential because we are in kernel
658 * mode so the h/w does not provide any protection against writing
663 * First, prevent address wrapping.
669 * XXX STOP USING VM_MAXUSER_ADDRESS.
670 * It is an end address, not a max, so every time it is used correctly it
671 * looks like there is an off by one error, and of course it caused an off
672 * by one error in several places.
674 cmpl $VM_MAXUSER_ADDRESS,%eax
677 #if defined(I386_CPU)
679 #if defined(I486_CPU) || defined(I586_CPU) || defined(I686_CPU)
680 cmpl $CPUCLASS_386,_cpu_class
684 * We have to check each PTE for user write permission.
685 * The checking may cause a page fault, so it is important to set
686 * up everything for return via copyout_fault before here.
688 /* compute number of pages */
693 shrl $IDXSHIFT+2,%ecx
696 /* compute PTE offset for start address */
702 /* check PTE for each page */
703 leal _PTmap(%edx),%eax
706 testb $PG_V,_PTmap(%eax) /* PTE page must be valid */
708 movb _PTmap(%edx),%al
709 andb $PG_V|PG_RW|PG_U,%al /* page must be valid and user writable */
710 cmpb $PG_V|PG_RW|PG_U,%al
714 /* simulate a trap */
719 call _trapwrite /* trapwrite(addr) */
724 testl %eax,%eax /* if not ok, return EFAULT */
730 jnz 1b /* check next page */
731 #endif /* I386_CPU */
733 /* bcopy(%esi, %edi, %ebx) */
737 #if defined(I586_CPU) && NNPX > 0
756 movl %eax,PCB_ONFAULT(%edx)
765 movl $0,PCB_ONFAULT(%edx)
769 #if defined(I586_CPU) && NNPX > 0
772 * Duplicated from generic_copyout. Could be done a bit better.
775 movl $copyout_fault,PCB_ONFAULT(%eax)
782 testl %ebx,%ebx /* anything to do? */
786 * Check explicitly for non-user addresses. If 486 write protection
787 * is being used, this check is essential because we are in kernel
788 * mode so the h/w does not provide any protection against writing
793 * First, prevent address wrapping.
799 * XXX STOP USING VM_MAXUSER_ADDRESS.
800 * It is an end address, not a max, so every time it is used correctly it
801 * looks like there is an off by one error, and of course it caused an off
802 * by one error in several places.
804 cmpl $VM_MAXUSER_ADDRESS,%eax
807 /* bcopy(%esi, %edi, %ebx) */
811 * End of duplicated code.
821 #endif /* I586_CPU && NNPX > 0 */
823 /* copyin(from_user, to_kernel, len) */
828 ENTRY(generic_copyin)
830 movl $copyin_fault,PCB_ONFAULT(%eax)
833 movl 12(%esp),%esi /* caddr_t from */
834 movl 16(%esp),%edi /* caddr_t to */
835 movl 20(%esp),%ecx /* size_t len */
838 * make sure address is valid
843 cmpl $VM_MAXUSER_ADDRESS,%edx
846 #if defined(I586_CPU) && NNPX > 0
851 shrl $2,%ecx /* copy longword-wise */
856 andb $3,%cl /* copy remaining bytes */
860 #if defined(I586_CPU) && NNPX > 0
868 movl %eax,PCB_ONFAULT(%edx)
876 movl $0,PCB_ONFAULT(%edx)
880 #if defined(I586_CPU) && NNPX > 0
883 * Duplicated from generic_copyin. Could be done a bit better.
886 movl $copyin_fault,PCB_ONFAULT(%eax)
889 movl 12(%esp),%esi /* caddr_t from */
890 movl 16(%esp),%edi /* caddr_t to */
891 movl 20(%esp),%ecx /* size_t len */
894 * make sure address is valid
899 cmpl $VM_MAXUSER_ADDRESS,%edx
902 * End of duplicated code.
908 pushl %ebx /* XXX prepare for fastmove_fault */
913 #endif /* I586_CPU && NNPX > 0 */
915 #if defined(I586_CPU) && NNPX > 0
916 /* fastmove(src, dst, len)
919 len in %ecx XXX changed to on stack for profiling
920 uses %eax and %edx for tmp. storage
922 /* XXX use ENTRY() to get profiling. fastmove() is actually a non-entry. */
926 subl $PCB_SAVEFPU_SIZE+3*4,%esp
932 testl $7,%esi /* check if src addr is multiple of 8 */
935 testl $7,%edi /* check if dst addr is multiple of 8 */
938 /* if (npxproc != NULL) { */
941 /* fnsave(&curpcb->pcb_savefpu); */
943 fnsave PCB_SAVEFPU(%eax)
944 /* npxproc = NULL; */
948 /* now we own the FPU. */
951 * The process' FP state is saved in the pcb, but if we get
952 * switched, the cpu_switch() will store our FP state in the
953 * pcb. It should be possible to avoid all the copying for
954 * this, e.g., by setting a flag to tell cpu_switch() to
955 * save the state somewhere else.
957 /* tmp = curpcb->pcb_savefpu; */
963 addl $PCB_SAVEFPU,%esi
965 movl $PCB_SAVEFPU_SIZE>>2,%ecx
971 /* stop_emulating(); */
973 /* npxproc = curproc; */
977 movl $fastmove_fault,PCB_ONFAULT(%eax)
1034 /* curpcb->pcb_savefpu = tmp; */
1039 addl $PCB_SAVEFPU,%edi
1042 movl $PCB_SAVEFPU_SIZE>>2,%ecx
1049 /* start_emulating(); */
1053 /* npxproc = NULL; */
1059 movl $fastmove_tail_fault,PCB_ONFAULT(%eax)
1062 shrl $2,%ecx /* copy longword-wise */
1067 andb $3,%cl /* copy remaining bytes */
1078 addl $PCB_SAVEFPU,%edi
1081 movl $PCB_SAVEFPU_SIZE>>2,%ecx
1090 fastmove_tail_fault:
1098 movl $0,PCB_ONFAULT(%edx)
1101 #endif /* I586_CPU && NNPX > 0 */
1104 * fu{byte,sword,word} : fetch a byte (sword, word) from user memory
1108 movl $fusufault,PCB_ONFAULT(%ecx)
1109 movl 4(%esp),%edx /* from */
1111 cmpl $VM_MAXUSER_ADDRESS-4,%edx /* verify address is valid */
1115 movl $0,PCB_ONFAULT(%ecx)
1119 * These two routines are called from the profiling code, potentially
1120 * at interrupt time. If they fail, that's okay, good things will
1121 * happen later. Fail all the time for now - until the trap code is
1122 * able to deal with this.
1131 movl $fusufault,PCB_ONFAULT(%ecx)
1134 cmpl $VM_MAXUSER_ADDRESS-2,%edx
1138 movl $0,PCB_ONFAULT(%ecx)
1143 movl $fusufault,PCB_ONFAULT(%ecx)
1146 cmpl $VM_MAXUSER_ADDRESS-1,%edx
1150 movl $0,PCB_ONFAULT(%ecx)
1157 movl %eax,PCB_ONFAULT(%ecx)
1162 * su{byte,sword,word}: write a byte (word, longword) to user memory
1166 movl $fusufault,PCB_ONFAULT(%ecx)
1169 #if defined(I386_CPU)
1171 #if defined(I486_CPU) || defined(I586_CPU) || defined(I686_CPU)
1172 cmpl $CPUCLASS_386,_cpu_class
1173 jne 2f /* we only have to set the right segment selector */
1174 #endif /* I486_CPU || I586_CPU || I686_CPU */
1176 /* XXX - page boundary crossing is still not handled */
1181 leal _PTmap(%edx),%ecx
1184 testb $PG_V,_PTmap(%ecx) /* PTE page must be valid */
1186 movb _PTmap(%edx),%dl
1187 andb $PG_V|PG_RW|PG_U,%dl /* page must be valid and user writable */
1188 cmpb $PG_V|PG_RW|PG_U,%dl
1192 /* simulate a trap */
1195 popl %edx /* remove junk parameter from stack */
1203 cmpl $VM_MAXUSER_ADDRESS-4,%edx /* verify address validity */
1210 movl %eax,PCB_ONFAULT(%ecx)
1215 movl $fusufault,PCB_ONFAULT(%ecx)
1218 #if defined(I386_CPU)
1220 #if defined(I486_CPU) || defined(I586_CPU) || defined(I686_CPU)
1221 cmpl $CPUCLASS_386,_cpu_class
1223 #endif /* I486_CPU || I586_CPU || I686_CPU */
1225 /* XXX - page boundary crossing is still not handled */
1230 leal _PTmap(%edx),%ecx
1233 testb $PG_V,_PTmap(%ecx) /* PTE page must be valid */
1235 movb _PTmap(%edx),%dl
1236 andb $PG_V|PG_RW|PG_U,%dl /* page must be valid and user writable */
1237 cmpb $PG_V|PG_RW|PG_U,%dl
1241 /* simulate a trap */
1244 popl %edx /* remove junk parameter from stack */
1252 cmpl $VM_MAXUSER_ADDRESS-2,%edx /* verify address validity */
1258 movl _curpcb,%ecx /* restore trashed register */
1259 movl %eax,PCB_ONFAULT(%ecx)
1265 movl $fusufault,PCB_ONFAULT(%ecx)
1268 #if defined(I386_CPU)
1270 #if defined(I486_CPU) || defined(I586_CPU) || defined(I686_CPU)
1271 cmpl $CPUCLASS_386,_cpu_class
1273 #endif /* I486_CPU || I586_CPU || I686_CPU */
1279 leal _PTmap(%edx),%ecx
1282 testb $PG_V,_PTmap(%ecx) /* PTE page must be valid */
1284 movb _PTmap(%edx),%dl
1285 andb $PG_V|PG_RW|PG_U,%dl /* page must be valid and user writable */
1286 cmpb $PG_V|PG_RW|PG_U,%dl
1290 /* simulate a trap */
1293 popl %edx /* remove junk parameter from stack */
1301 cmpl $VM_MAXUSER_ADDRESS-1,%edx /* verify address validity */
1307 movl _curpcb,%ecx /* restore trashed register */
1308 movl %eax,PCB_ONFAULT(%ecx)
1312 * copyinstr(from, to, maxlen, int *lencopied)
1313 * copy a string from from to to, stop when a 0 character is reached.
1314 * return ENAMETOOLONG if string is longer than maxlen, and
1315 * EFAULT on protection violations. If lencopied is non-zero,
1316 * return the actual length in *lencopied.
1322 movl $cpystrflt,PCB_ONFAULT(%ecx)
1324 movl 12(%esp),%esi /* %esi = from */
1325 movl 16(%esp),%edi /* %edi = to */
1326 movl 20(%esp),%edx /* %edx = maxlen */
1328 movl $VM_MAXUSER_ADDRESS,%eax
1330 /* make sure 'from' is within bounds */
1334 /* restrict maxlen to <= VM_MAXUSER_ADDRESS-from */
1352 /* Success -- 0 byte reached */
1357 /* edx is zero - return ENAMETOOLONG or EFAULT */
1358 cmpl $VM_MAXUSER_ADDRESS,%esi
1361 movl $ENAMETOOLONG,%eax
1368 /* set *lencopied and return %eax */
1370 movl $0,PCB_ONFAULT(%ecx)
1384 * copystr(from, to, maxlen, int *lencopied)
1390 movl 12(%esp),%esi /* %esi = from */
1391 movl 16(%esp),%edi /* %edi = to */
1392 movl 20(%esp),%edx /* %edx = maxlen */
1403 /* Success -- 0 byte reached */
1408 /* edx is zero -- return ENAMETOOLONG */
1409 movl $ENAMETOOLONG,%eax
1412 /* set *lencopied and return %eax */
1434 cld /* compare forwards */
1453 * Handling of special 386 registers and descriptor tables etc
1455 /* void lgdt(struct region_descriptor *rdp); */
1457 /* reload the descriptor table */
1461 /* flush the prefetch q */
1465 /* reload "stale" selectors */
1471 /* reload code selector by turning return into intersegmental return */
1474 # movl $KCSEL,4(%esp)
1479 * void lidt(struct region_descriptor *rdp);
1487 * void lldt(u_short sel)
1494 * void ltr(u_short sel)
1500 /* ssdtosd(*ssdp,*sdp) */
1537 /* void load_cr3(caddr_t cr3) */
1544 /*****************************************************************************/
1545 /* setjump, longjump */
1546 /*****************************************************************************/
1550 movl %ebx,(%eax) /* save ebx */
1551 movl %esp,4(%eax) /* save esp */
1552 movl %ebp,8(%eax) /* save ebp */
1553 movl %esi,12(%eax) /* save esi */
1554 movl %edi,16(%eax) /* save edi */
1555 movl (%esp),%edx /* get rta */
1556 movl %edx,20(%eax) /* save eip */
1557 xorl %eax,%eax /* return(0); */
1562 movl (%eax),%ebx /* restore ebx */
1563 movl 4(%eax),%esp /* restore esp */
1564 movl 8(%eax),%ebp /* restore ebp */
1565 movl 12(%eax),%esi /* restore esi */
1566 movl 16(%eax),%edi /* restore edi */
1567 movl 20(%eax),%edx /* get rta */
1568 movl %edx,(%esp) /* put in return frame */
1569 xorl %eax,%eax /* return(1); */
1574 * Here for doing BB-profiling (gcc -a).
1575 * We rely on the "bbset" instead, but need a dummy function.
1577 NON_GPROF_ENTRY(__bb_init_func)
1580 .byte 0xc3 /* avoid macro for `ret' */