/*
 *  Copyright (C) 1995  Linus Torvalds
 *
 *  Pentium III FXSR, SSE support
 *	Gareth Hughes <gareth@valinux.com>, May 2000
 *
 *	CPU hotplug support - ashok.raj@intel.com
 */

/*
 * This file handles the architecture-dependent parts of process handling.
 */
#include <linux/stackprotector.h>
#include <linux/cpu.h>
#include <linux/errno.h>
#include <linux/sched.h>
#include <linux/fs.h>
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/elfcore.h>
#include <linux/smp.h>
#include <linux/slab.h>
#include <linux/user.h>
#include <linux/interrupt.h>
#include <linux/utsname.h>
#include <linux/delay.h>
#include <linux/module.h>
#include <linux/ptrace.h>
#include <linux/notifier.h>
#include <linux/kprobes.h>
#include <linux/kdebug.h>
#include <linux/tick.h>
#include <linux/prctl.h>
#include <linux/uaccess.h>
#include <linux/io.h>
#include <linux/ftrace.h>
#include <linux/dmi.h>

#include <asm/pgtable.h>
#include <asm/system.h>
#include <asm/processor.h>
#include <asm/i387.h>
#include <asm/mmu_context.h>
#include <asm/prctl.h>
#include <asm/desc.h>
#include <asm/proto.h>
#include <asm/ia32.h>
#include <asm/idle.h>
#include <asm/syscalls.h>
#include <asm/ds.h>
#include <asm/debugreg.h>
asmlinkage extern void ret_from_fork(void);

DEFINE_PER_CPU(unsigned long, old_rsp);
static DEFINE_PER_CPU(unsigned char, is_idle);

unsigned long kernel_thread_flags = CLONE_VM | CLONE_UNTRACED;

static ATOMIC_NOTIFIER_HEAD(idle_notifier);
void idle_notifier_register(struct notifier_block *n)
{
	atomic_notifier_chain_register(&idle_notifier, n);
}
EXPORT_SYMBOL_GPL(idle_notifier_register);

void idle_notifier_unregister(struct notifier_block *n)
{
	atomic_notifier_chain_unregister(&idle_notifier, n);
}
EXPORT_SYMBOL_GPL(idle_notifier_unregister);
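
/*
 * Illustrative sketch only (not part of the original file): a driver
 * could subscribe to idle transitions through the chain above. The
 * callback and notifier_block names below are hypothetical.
 */
#if 0
static int example_idle_notify(struct notifier_block *nb,
			       unsigned long action, void *data)
{
	if (action == IDLE_START)
		;	/* CPU is entering idle, e.g. quiesce a device */
	else if (action == IDLE_END)
		;	/* CPU has left idle, resume normal operation */
	return NOTIFY_OK;
}

static struct notifier_block example_idle_nb = {
	.notifier_call = example_idle_notify,
};

/* registered somewhere in driver init: */
/* idle_notifier_register(&example_idle_nb); */
#endif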

void enter_idle(void)
{
	percpu_write(is_idle, 1);
	atomic_notifier_call_chain(&idle_notifier, IDLE_START, NULL);
}

static void __exit_idle(void)
{
	if (x86_test_and_clear_bit_percpu(0, is_idle) == 0)
		return;
	atomic_notifier_call_chain(&idle_notifier, IDLE_END, NULL);
}

/* Called from interrupts to signify idle end */
void exit_idle(void)
{
	/* idle loop has pid 0 */
	if (current->pid)
		return;
	__exit_idle();
}

#ifndef CONFIG_SMP
static inline void play_dead(void)
{
	BUG();
}
#endif

/*
 * The idle thread. There's no useful work to be
 * done, so just try to conserve power and have a
 * low exit latency (i.e. sit in a loop waiting for
 * somebody to say that they'd like to reschedule).
 */
void cpu_idle(void)
{
	current_thread_info()->status |= TS_POLLING;

	/*
	 * If we're the non-boot CPU, nothing set the stack canary up
	 * for us. CPU0 already has it initialized, but there is no harm
	 * in doing it again. This is a good place for updating it, as
	 * we won't ever return from this function (so the invalid
	 * canaries already on the stack won't ever trigger).
	 */
	boot_init_stack_canary();

	/* endless idle loop with no priority at all */
	while (1) {
		tick_nohz_stop_sched_tick(1);
		while (!need_resched()) {

			rmb();

			if (cpu_is_offline(smp_processor_id()))
				play_dead();
			/*
			 * Idle routines should keep interrupts disabled
			 * from here on, until they go to idle.
			 * Otherwise, idle callbacks can misfire.
			 */
			local_irq_disable();
			enter_idle();
			/* Don't trace irqs off for idle */
			stop_critical_timings();
			pm_idle();
			start_critical_timings();
			/*
			 * In many cases the interrupt that ended idle
			 * has already called exit_idle. But some idle
			 * loops can be woken up without interrupt.
			 */
			__exit_idle();
		}

		tick_nohz_restart_sched_tick();
		preempt_enable_no_resched();
		schedule();
		preempt_disable();
	}
}

/* Also prints some state that isn't saved in the pt_regs */
void __show_regs(struct pt_regs *regs, int all)
{
	unsigned long cr0 = 0L, cr2 = 0L, cr3 = 0L, cr4 = 0L, fs, gs, shadowgs;
	unsigned long d0, d1, d2, d3, d6, d7;
	unsigned int fsindex, gsindex;
	unsigned int ds, cs, es;
	const char *board;

	printk("\n");
	print_modules();
	board = dmi_get_system_info(DMI_PRODUCT_NAME);
	if (!board)
		board = "";
	printk(KERN_INFO "Pid: %d, comm: %.20s %s %s %.*s %s\n",
		current->pid, current->comm, print_tainted(),
		init_utsname()->release,
		(int)strcspn(init_utsname()->version, " "),
		init_utsname()->version, board);
	printk(KERN_INFO "RIP: %04lx:[<%016lx>] ", regs->cs & 0xffff, regs->ip);
	printk_address(regs->ip, 1);
	printk(KERN_INFO "RSP: %04lx:%016lx  EFLAGS: %08lx\n", regs->ss,
			regs->sp, regs->flags);
	printk(KERN_INFO "RAX: %016lx RBX: %016lx RCX: %016lx\n",
	       regs->ax, regs->bx, regs->cx);
	printk(KERN_INFO "RDX: %016lx RSI: %016lx RDI: %016lx\n",
	       regs->dx, regs->si, regs->di);
	printk(KERN_INFO "RBP: %016lx R08: %016lx R09: %016lx\n",
	       regs->bp, regs->r8, regs->r9);
	printk(KERN_INFO "R10: %016lx R11: %016lx R12: %016lx\n",
	       regs->r10, regs->r11, regs->r12);
	printk(KERN_INFO "R13: %016lx R14: %016lx R15: %016lx\n",
	       regs->r13, regs->r14, regs->r15);

	asm("movl %%ds,%0" : "=r" (ds));
	asm("movl %%cs,%0" : "=r" (cs));
	asm("movl %%es,%0" : "=r" (es));
	asm("movl %%fs,%0" : "=r" (fsindex));
	asm("movl %%gs,%0" : "=r" (gsindex));

	rdmsrl(MSR_FS_BASE, fs);
	rdmsrl(MSR_GS_BASE, gs);
	rdmsrl(MSR_KERNEL_GS_BASE, shadowgs);

	if (!all)
		return;

	cr0 = read_cr0();
	cr2 = read_cr2();
	cr3 = read_cr3();
	cr4 = read_cr4();

	printk(KERN_INFO "FS:  %016lx(%04x) GS:%016lx(%04x) knlGS:%016lx\n",
	       fs, fsindex, gs, gsindex, shadowgs);
	printk(KERN_INFO "CS:  %04x DS: %04x ES: %04x CR0: %016lx\n", cs, ds,
			es, cr0);
	printk(KERN_INFO "CR2: %016lx CR3: %016lx CR4: %016lx\n", cr2, cr3,
			cr4);

	get_debugreg(d0, 0);
	get_debugreg(d1, 1);
	get_debugreg(d2, 2);
	printk(KERN_INFO "DR0: %016lx DR1: %016lx DR2: %016lx\n", d0, d1, d2);
	get_debugreg(d3, 3);
	get_debugreg(d6, 6);
	get_debugreg(d7, 7);
	printk(KERN_INFO "DR3: %016lx DR6: %016lx DR7: %016lx\n", d3, d6, d7);
}

void show_regs(struct pt_regs *regs)
{
	show_registers(regs);
	show_trace(NULL, regs, (void *)(regs + 1), regs->bp);
}

void release_thread(struct task_struct *dead_task)
{
	if (dead_task->mm) {
		if (dead_task->mm->context.size) {
			printk("WARNING: dead process %8s still has LDT? <%p/%d>\n",
					dead_task->comm,
					dead_task->mm->context.ldt,
					dead_task->mm->context.size);
			BUG();
		}
	}
}

static inline void set_32bit_tls(struct task_struct *t, int tls, u32 addr)
{
	struct user_desc ud = {
		.base_addr = addr,
		.limit = 0xfffff,
		.seg_32bit = 1,
		.limit_in_pages = 1,
		.useable = 1,
	};
	struct desc_struct *desc = t->thread.tls_array;
	desc += tls;
	fill_ldt(desc, &ud);
}

static inline u32 read_32bit_tls(struct task_struct *t, int tls)
{
	return get_desc_base(&t->thread.tls_array[tls]);
}

/*
 * This gets called before we allocate a new thread and copy
 * the current task into it.
 */
void prepare_to_copy(struct task_struct *tsk)
{
	unlazy_fpu(tsk);
}

int copy_thread(unsigned long clone_flags, unsigned long sp,
		unsigned long unused,
	struct task_struct *p, struct pt_regs *regs)
{
	int err;
	struct pt_regs *childregs;
	struct task_struct *me = current;

	childregs = ((struct pt_regs *)
			(THREAD_SIZE + task_stack_page(p))) - 1;
	*childregs = *regs;

	childregs->ax = 0;
	childregs->sp = sp;
	if (sp == ~0UL)
		childregs->sp = (unsigned long)childregs;

	p->thread.sp = (unsigned long) childregs;
	p->thread.sp0 = (unsigned long) (childregs+1);
	p->thread.usersp = me->thread.usersp;

	set_tsk_thread_flag(p, TIF_FORK);

	p->thread.fs = me->thread.fs;
	p->thread.gs = me->thread.gs;
	p->thread.io_bitmap_ptr = NULL;

	savesegment(gs, p->thread.gsindex);
	savesegment(fs, p->thread.fsindex);
	savesegment(es, p->thread.es);
	savesegment(ds, p->thread.ds);

	err = -ENOMEM;
	memset(p->thread.ptrace_bps, 0, sizeof(p->thread.ptrace_bps));

	if (unlikely(test_tsk_thread_flag(me, TIF_IO_BITMAP))) {
		p->thread.io_bitmap_ptr = kmalloc(IO_BITMAP_BYTES, GFP_KERNEL);
		if (!p->thread.io_bitmap_ptr) {
			p->thread.io_bitmap_max = 0;
			return -ENOMEM;
		}
		memcpy(p->thread.io_bitmap_ptr, me->thread.io_bitmap_ptr,
				IO_BITMAP_BYTES);
		set_tsk_thread_flag(p, TIF_IO_BITMAP);
	}

	/*
	 * Set a new TLS for the child thread?
	 */
	if (clone_flags & CLONE_SETTLS) {
#ifdef CONFIG_IA32_EMULATION
		if (test_thread_flag(TIF_IA32))
			err = do_set_thread_area(p, -1,
				(struct user_desc __user *)childregs->si, 0);
		else
#endif
			err = do_arch_prctl(p, ARCH_SET_FS, childregs->r8);
		if (err)
			goto out;
	}

	clear_tsk_thread_flag(p, TIF_DS_AREA_MSR);
	p->thread.ds_ctx = NULL;

	clear_tsk_thread_flag(p, TIF_DEBUGCTLMSR);
	p->thread.debugctlmsr = 0;

	err = 0;
out:
	if (err && p->thread.io_bitmap_ptr) {
		kfree(p->thread.io_bitmap_ptr);
		p->thread.io_bitmap_max = 0;
	}

	return err;
}

static void
start_thread_common(struct pt_regs *regs, unsigned long new_ip,
		    unsigned long new_sp,
		    unsigned int _cs, unsigned int _ss, unsigned int _ds)
{
	loadsegment(fs, 0);
	loadsegment(es, _ds);
	loadsegment(ds, _ds);
	load_gs_index(0);
	regs->ip		= new_ip;
	regs->sp		= new_sp;
	percpu_write(old_rsp, new_sp);
	regs->cs		= _cs;
	regs->ss		= _ss;
	regs->flags		= X86_EFLAGS_IF;
	set_fs(USER_DS);
	/*
	 * Free the old FP and other extended state
	 */
	free_thread_xstate(current);
}

void
start_thread(struct pt_regs *regs, unsigned long new_ip, unsigned long new_sp)
{
	start_thread_common(regs, new_ip, new_sp,
			    __USER_CS, __USER_DS, 0);
}

#ifdef CONFIG_IA32_EMULATION
void start_thread_ia32(struct pt_regs *regs, u32 new_ip, u32 new_sp)
{
	start_thread_common(regs, new_ip, new_sp,
			    __USER32_CS, __USER32_DS, __USER32_DS);
}
#endif

/*
 *	switch_to(x,y) should switch tasks from x to y.
 *
 * This could still be optimized:
 * - fold all the options into a flag word and test it with a single test.
 * - could test fs/gs bitsliced
 *
 * Kprobes not supported here. Set the probe on schedule instead.
 * The function graph tracer is not supported here either.
 */
__notrace_funcgraph struct task_struct *
__switch_to(struct task_struct *prev_p, struct task_struct *next_p)
{
	struct thread_struct *prev = &prev_p->thread;
	struct thread_struct *next = &next_p->thread;
	int cpu = smp_processor_id();
	struct tss_struct *tss = &per_cpu(init_tss, cpu);
	unsigned fsindex, gsindex;
	bool preload_fpu;

	/*
	 * If the task has used the FPU in the last 5 timeslices, just do a
	 * full restore of the math state immediately to avoid the trap; the
	 * chances of needing the FPU soon are obviously high now.
	 */
	preload_fpu = tsk_used_math(next_p) && next_p->fpu_counter > 5;

	/* we're going to use this soon, after a few expensive things */
	if (preload_fpu)
		prefetch(next->xstate);

	/*
	 * Reload esp0, LDT and the page table pointer:
	 */
	load_sp0(tss, next);

	/*
	 * Switch DS and ES.
	 * This won't pick up thread selector changes, but I guess that is ok.
	 */
	savesegment(es, prev->es);
	if (unlikely(next->es | prev->es))
		loadsegment(es, next->es);

	savesegment(ds, prev->ds);
	if (unlikely(next->ds | prev->ds))
		loadsegment(ds, next->ds);

	/*
	 * We must save %fs and %gs before load_TLS() because
	 * %fs and %gs may be cleared by load_TLS().
	 *
	 * (e.g. xen_load_tls())
	 */
	savesegment(fs, fsindex);
	savesegment(gs, gsindex);

	load_TLS(next, cpu);

	/* Must be after DS reload */
	unlazy_fpu(prev_p);

	/* Make sure cpu is ready for new context */
	if (preload_fpu)
		clts();

	/*
	 * Leave lazy mode, flushing any hypercalls made here.
	 * This must be done before restoring TLS segments so
	 * the GDT and LDT are properly updated, and must be
	 * done before math_state_restore, so the TS bit is up
	 * to date.
	 */
	arch_end_context_switch(next_p);

	/*
	 * Switch FS and GS.
	 *
	 * A segment register != 0 always requires a reload. Also
	 * reload when it has changed. When the prev process used a
	 * 64bit base, always reload to avoid an information leak.
	 */
	if (unlikely(fsindex | next->fsindex | prev->fs)) {
		loadsegment(fs, next->fsindex);
		/*
		 * Check if the user used a selector != 0; if yes
		 * clear the 64bit base, since an overloaded base is
		 * always mapped to the Null selector.
		 */
		if (fsindex)
			prev->fs = 0;
	}
	/* when next process has a 64bit base use it */
	if (next->fs)
		wrmsrl(MSR_FS_BASE, next->fs);
	prev->fsindex = fsindex;

	if (unlikely(gsindex | next->gsindex | prev->gs)) {
		load_gs_index(next->gsindex);
		if (gsindex)
			prev->gs = 0;
	}
	if (next->gs)
		wrmsrl(MSR_KERNEL_GS_BASE, next->gs);
	prev->gsindex = gsindex;

	/*
	 * Switch the PDA and FPU contexts.
	 */
	prev->usersp = percpu_read(old_rsp);
	percpu_write(old_rsp, next->usersp);
	percpu_write(current_task, next_p);

	percpu_write(kernel_stack,
		  (unsigned long)task_stack_page(next_p) +
		  THREAD_SIZE - KERNEL_STACK_OFFSET);

	/*
	 * Now maybe reload the debug registers and handle I/O bitmaps
	 */
	if (unlikely(task_thread_info(next_p)->flags & _TIF_WORK_CTXSW_NEXT ||
		     task_thread_info(prev_p)->flags & _TIF_WORK_CTXSW_PREV))
		__switch_to_xtra(prev_p, next_p, tss);

	/*
	 * Preload the FPU context, now that we've determined that the
	 * task is likely to be using it.
	 */
	if (preload_fpu)
		__math_state_restore();

	return prev_p;
}

void set_personality_64bit(void)
{
	/* inherit personality from parent */

	/* Make sure to be in 64bit mode */
	clear_thread_flag(TIF_IA32);

	/* TBD: This overwrites the user's setup. We should have two bits,
	   but 64bit processes have always behaved this way, so it's not
	   too bad. The main problem is just that 32bit children are
	   affected again. */
	current->personality &= ~READ_IMPLIES_EXEC;
}

unsigned long get_wchan(struct task_struct *p)
{
	unsigned long stack;
	u64 fp, ip;
	int count = 0;

	if (!p || p == current || p->state == TASK_RUNNING)
		return 0;
	stack = (unsigned long)task_stack_page(p);
	if (p->thread.sp < stack || p->thread.sp >= stack+THREAD_SIZE)
		return 0;
	fp = *(u64 *)(p->thread.sp);
	do {
		if (fp < (unsigned long)stack ||
		    fp >= (unsigned long)stack+THREAD_SIZE)
			return 0;
		ip = *(u64 *)(fp+8);
		if (!in_sched_functions(ip))
			return ip;
		fp = *(u64 *)fp;
	} while (count++ < 16);
	return 0;
}

long do_arch_prctl(struct task_struct *task, int code, unsigned long addr)
{
	int ret = 0;
	int doit = task == current;
	int cpu;

	switch (code) {
	case ARCH_SET_GS:
		if (addr >= TASK_SIZE_OF(task))
			return -EPERM;
		cpu = get_cpu();
		/* handle small bases via the GDT because that's faster to
		   switch. */
		if (addr <= 0xffffffff) {
			set_32bit_tls(task, GS_TLS, addr);
			if (doit) {
				load_TLS(&task->thread, cpu);
				load_gs_index(GS_TLS_SEL);
			}
			task->thread.gsindex = GS_TLS_SEL;
			task->thread.gs = 0;
		} else {
			task->thread.gsindex = 0;
			task->thread.gs = addr;
			if (doit) {
				load_gs_index(0);
				ret = checking_wrmsrl(MSR_KERNEL_GS_BASE, addr);
			}
		}
		put_cpu();
		break;
	case ARCH_SET_FS:
		/* Not strictly needed for fs, but do it for symmetry
		   with gs */
		if (addr >= TASK_SIZE_OF(task))
			return -EPERM;
		cpu = get_cpu();
		/* handle small bases via the GDT because that's faster to
		   switch. */
		if (addr <= 0xffffffff) {
			set_32bit_tls(task, FS_TLS, addr);
			if (doit) {
				load_TLS(&task->thread, cpu);
				loadsegment(fs, FS_TLS_SEL);
			}
			task->thread.fsindex = FS_TLS_SEL;
			task->thread.fs = 0;
		} else {
			task->thread.fsindex = 0;
			task->thread.fs = addr;
			if (doit) {
				/* set the selector to 0 to not confuse
				   __switch_to */
				loadsegment(fs, 0);
				ret = checking_wrmsrl(MSR_FS_BASE, addr);
			}
		}
		put_cpu();
		break;
	case ARCH_GET_FS: {
		unsigned long base;
		if (task->thread.fsindex == FS_TLS_SEL)
			base = read_32bit_tls(task, FS_TLS);
		else if (doit)
			rdmsrl(MSR_FS_BASE, base);
		else
			base = task->thread.fs;
		ret = put_user(base, (unsigned long __user *)addr);
		break;
	}
	case ARCH_GET_GS: {
		unsigned long base;
		unsigned gsindex;
		if (task->thread.gsindex == GS_TLS_SEL)
			base = read_32bit_tls(task, GS_TLS);
		else if (doit) {
			savesegment(gs, gsindex);
			if (gsindex)
				rdmsrl(MSR_KERNEL_GS_BASE, base);
			else
				base = task->thread.gs;
		} else
			base = task->thread.gs;
		ret = put_user(base, (unsigned long __user *)addr);
		break;
	}
	default:
		ret = -EINVAL;
		break;
	}

	return ret;
}

long sys_arch_prctl(int code, unsigned long addr)
{
	return do_arch_prctl(current, code, addr);
}
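
/*
 * Userspace view (illustrative sketch only, not part of this file):
 * arch_prctl() is typically reached through the raw syscall() wrapper,
 * since glibc does not export it directly. The request constants come
 * from <asm/prctl.h> (ARCH_GET_FS is 0x1003 on x86-64).
 */
#if 0
#include <stdio.h>
#include <unistd.h>
#include <sys/syscall.h>
#include <asm/prctl.h>

int main(void)
{
	unsigned long fsbase;

	/* Ask the kernel for this thread's %fs base (the TLS pointer). */
	if (syscall(SYS_arch_prctl, ARCH_GET_FS, &fsbase) == 0)
		printf("fs base: %#lx\n", fsbase);
	return 0;
}
#endif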

unsigned long KSTK_ESP(struct task_struct *task)
{
	return (test_tsk_thread_flag(task, TIF_IA32)) ?
			(task_pt_regs(task)->sp) : ((task)->thread.usersp);
}