From e1367daf3eed5cd619ee88c9907e1e6ddaa58406 Mon Sep 17 00:00:00 2001 From: Li Shaohua Date: Sat, 25 Jun 2005 14:54:56 -0700 Subject: [PATCH] [PATCH] cpu state clean after hot remove Clean CPU states in order to reuse smp boot code for CPU hotplug. Signed-off-by: Li Shaohua Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/i386/kernel/cpu/common.c | 12 +++ arch/i386/kernel/irq.c | 5 + arch/i386/kernel/process.c | 20 ++-- arch/i386/kernel/smpboot.c | 175 ++++++++++++++++++++++++++++------ drivers/base/cpu.c | 8 +- include/asm-i386/irq.h | 2 + include/asm-i386/smp.h | 8 ++ 7 files changed, 187 insertions(+), 43 deletions(-) diff --git a/arch/i386/kernel/cpu/common.c b/arch/i386/kernel/cpu/common.c index aac74758caf4..2203a9d20212 100644 --- a/arch/i386/kernel/cpu/common.c +++ b/arch/i386/kernel/cpu/common.c @@ -651,3 +651,15 @@ void __devinit cpu_init(void) clear_used_math(); mxcsr_feature_mask_init(); } + +#ifdef CONFIG_HOTPLUG_CPU +void __devinit cpu_uninit(void) +{ + int cpu = raw_smp_processor_id(); + cpu_clear(cpu, cpu_initialized); + + /* lazy TLB state */ + per_cpu(cpu_tlbstate, cpu).state = 0; + per_cpu(cpu_tlbstate, cpu).active_mm = &init_mm; +} +#endif diff --git a/arch/i386/kernel/irq.c b/arch/i386/kernel/irq.c index af115004aec5..ce66dcc26d90 100644 --- a/arch/i386/kernel/irq.c +++ b/arch/i386/kernel/irq.c @@ -156,6 +156,11 @@ void irq_ctx_init(int cpu) cpu,hardirq_ctx[cpu],softirq_ctx[cpu]); } +void irq_ctx_exit(int cpu) +{ + hardirq_ctx[cpu] = NULL; +} + extern asmlinkage void __do_softirq(void); asmlinkage void do_softirq(void) diff --git a/arch/i386/kernel/process.c b/arch/i386/kernel/process.c index e06f2dc7123d..5f8cfa6b7940 100644 --- a/arch/i386/kernel/process.c +++ b/arch/i386/kernel/process.c @@ -152,21 +152,19 @@ static void poll_idle (void) /* We don't actually take CPU down, just spin without interrupts. */ static inline void play_dead(void) { + /* This must be done before dead CPU ack */ + cpu_exit_clear(); + wbinvd(); + mb(); /* Ack it */ __get_cpu_var(cpu_state) = CPU_DEAD; - /* We shouldn't have to disable interrupts while dead, but - * some interrupts just don't seem to go away, and this makes - * it "work" for testing purposes. */ - /* Death loop */ - while (__get_cpu_var(cpu_state) != CPU_UP_PREPARE) - cpu_relax(); - + /* + * With physical CPU hotplug, we should halt the cpu + */ local_irq_disable(); - __flush_tlb_all(); - cpu_set(smp_processor_id(), cpu_online_map); - enable_APIC_timer(); - local_irq_enable(); + while (1) + __asm__ __volatile__("hlt":::"memory"); } #else static inline void play_dead(void) diff --git a/arch/i386/kernel/smpboot.c b/arch/i386/kernel/smpboot.c index fb0b200d1d85..d66bf489a2e9 100644 --- a/arch/i386/kernel/smpboot.c +++ b/arch/i386/kernel/smpboot.c @@ -90,6 +90,12 @@ cpumask_t cpu_callout_map; EXPORT_SYMBOL(cpu_callout_map); static cpumask_t smp_commenced_mask; +/* TSC's upper 32 bits can't be written in eariler CPU (before prescott), there + * is no way to resync one AP against BP. TBD: for prescott and above, we + * should use IA64's algorithm + */ +static int __devinitdata tsc_sync_disabled; + /* Per CPU bogomips and other parameters */ struct cpuinfo_x86 cpu_data[NR_CPUS] __cacheline_aligned; EXPORT_SYMBOL(cpu_data); @@ -427,7 +433,7 @@ static void __devinit smp_callin(void) /* * Synchronize the TSC with the BP */ - if (cpu_has_tsc && cpu_khz) + if (cpu_has_tsc && cpu_khz && !tsc_sync_disabled) synchronize_tsc_ap(); } @@ -507,6 +513,7 @@ static void __devinit start_secondary(void *unused) lock_ipi_call_lock(); cpu_set(smp_processor_id(), cpu_online_map); unlock_ipi_call_lock(); + per_cpu(cpu_state, smp_processor_id()) = CPU_ONLINE; /* We can take interrupts now: we're officially "up". */ local_irq_enable(); @@ -816,8 +823,43 @@ wakeup_secondary_cpu(int phys_apicid, unsigned long start_eip) #endif /* WAKE_SECONDARY_VIA_INIT */ extern cpumask_t cpu_initialized; +static inline int alloc_cpu_id(void) +{ + cpumask_t tmp_map; + int cpu; + cpus_complement(tmp_map, cpu_present_map); + cpu = first_cpu(tmp_map); + if (cpu >= NR_CPUS) + return -ENODEV; + return cpu; +} + +#ifdef CONFIG_HOTPLUG_CPU +static struct task_struct * __devinitdata cpu_idle_tasks[NR_CPUS]; +static inline struct task_struct * alloc_idle_task(int cpu) +{ + struct task_struct *idle; + + if ((idle = cpu_idle_tasks[cpu]) != NULL) { + /* initialize thread_struct. we really want to avoid destroy + * idle tread + */ + idle->thread.esp = (unsigned long)(((struct pt_regs *) + (THREAD_SIZE + (unsigned long) idle->thread_info)) - 1); + init_idle(idle, cpu); + return idle; + } + idle = fork_idle(cpu); + + if (!IS_ERR(idle)) + cpu_idle_tasks[cpu] = idle; + return idle; +} +#else +#define alloc_idle_task(cpu) fork_idle(cpu) +#endif -static int __devinit do_boot_cpu(int apicid) +static int __devinit do_boot_cpu(int apicid, int cpu) /* * NOTE - on most systems this is a PHYSICAL apic ID, but on multiquad * (ie clustered apic addressing mode), this is a LOGICAL apic ID. @@ -826,16 +868,17 @@ static int __devinit do_boot_cpu(int apicid) { struct task_struct *idle; unsigned long boot_error; - int timeout, cpu; + int timeout; unsigned long start_eip; unsigned short nmi_high = 0, nmi_low = 0; - cpu = ++cpucount; + ++cpucount; + /* * We can't use kernel_thread since we must avoid to * reschedule the child. */ - idle = fork_idle(cpu); + idle = alloc_idle_task(cpu); if (IS_ERR(idle)) panic("failed fork for CPU %d", cpu); idle->thread.eip = (unsigned long) start_secondary; @@ -902,13 +945,16 @@ static int __devinit do_boot_cpu(int apicid) inquire_remote_apic(apicid); } } - x86_cpu_to_apicid[cpu] = apicid; + if (boot_error) { /* Try to put things back the way they were before ... */ unmap_cpu_to_logical_apicid(cpu); cpu_clear(cpu, cpu_callout_map); /* was set here (do_boot_cpu()) */ cpu_clear(cpu, cpu_initialized); /* was set by cpu_init() */ cpucount--; + } else { + x86_cpu_to_apicid[cpu] = apicid; + cpu_set(cpu, cpu_present_map); } /* mark "stuck" area as not stuck */ @@ -917,6 +963,75 @@ static int __devinit do_boot_cpu(int apicid) return boot_error; } +#ifdef CONFIG_HOTPLUG_CPU +void cpu_exit_clear(void) +{ + int cpu = raw_smp_processor_id(); + + idle_task_exit(); + + cpucount --; + cpu_uninit(); + irq_ctx_exit(cpu); + + cpu_clear(cpu, cpu_callout_map); + cpu_clear(cpu, cpu_callin_map); + cpu_clear(cpu, cpu_present_map); + + cpu_clear(cpu, smp_commenced_mask); + unmap_cpu_to_logical_apicid(cpu); +} + +struct warm_boot_cpu_info { + struct completion *complete; + int apicid; + int cpu; +}; + +static void __devinit do_warm_boot_cpu(void *p) +{ + struct warm_boot_cpu_info *info = p; + do_boot_cpu(info->apicid, info->cpu); + complete(info->complete); +} + +int __devinit smp_prepare_cpu(int cpu) +{ + DECLARE_COMPLETION(done); + struct warm_boot_cpu_info info; + struct work_struct task; + int apicid, ret; + + lock_cpu_hotplug(); + apicid = x86_cpu_to_apicid[cpu]; + if (apicid == BAD_APICID) { + ret = -ENODEV; + goto exit; + } + + info.complete = &done; + info.apicid = apicid; + info.cpu = cpu; + INIT_WORK(&task, do_warm_boot_cpu, &info); + + tsc_sync_disabled = 1; + + /* init low mem mapping */ + memcpy(swapper_pg_dir, swapper_pg_dir + USER_PGD_PTRS, + sizeof(swapper_pg_dir[0]) * KERNEL_PGD_PTRS); + flush_tlb_all(); + schedule_work(&task); + wait_for_completion(&done); + + tsc_sync_disabled = 0; + zap_low_mappings(); + ret = 0; +exit: + unlock_cpu_hotplug(); + return ret; +} +#endif + static void smp_tune_scheduling (void) { unsigned long cachesize; /* kB */ @@ -1069,7 +1184,7 @@ static void __init smp_boot_cpus(unsigned int max_cpus) if (max_cpus <= cpucount+1) continue; - if (do_boot_cpu(apicid)) + if (((cpu = alloc_cpu_id()) <= 0) || do_boot_cpu(apicid, cpu)) printk("CPU #%d not responding - cannot use it.\n", apicid); else @@ -1149,25 +1264,24 @@ void __devinit smp_prepare_boot_cpu(void) { cpu_set(smp_processor_id(), cpu_online_map); cpu_set(smp_processor_id(), cpu_callout_map); + cpu_set(smp_processor_id(), cpu_present_map); + per_cpu(cpu_state, smp_processor_id()) = CPU_ONLINE; } #ifdef CONFIG_HOTPLUG_CPU - -/* must be called with the cpucontrol mutex held */ -static int __devinit cpu_enable(unsigned int cpu) +static void +remove_siblinginfo(int cpu) { - /* get the target out of its holding state */ - per_cpu(cpu_state, cpu) = CPU_UP_PREPARE; - wmb(); - - /* wait for the processor to ack it. timeout? */ - while (!cpu_online(cpu)) - cpu_relax(); - - fixup_irqs(cpu_online_map); - /* counter the disable in fixup_irqs() */ - local_irq_enable(); - return 0; + int sibling; + + for_each_cpu_mask(sibling, cpu_sibling_map[cpu]) + cpu_clear(cpu, cpu_sibling_map[sibling]); + for_each_cpu_mask(sibling, cpu_core_map[cpu]) + cpu_clear(cpu, cpu_core_map[sibling]); + cpus_clear(cpu_sibling_map[cpu]); + cpus_clear(cpu_core_map[cpu]); + phys_proc_id[cpu] = BAD_APICID; + cpu_core_id[cpu] = BAD_APICID; } int __cpu_disable(void) @@ -1193,6 +1307,8 @@ int __cpu_disable(void) mdelay(1); local_irq_disable(); + remove_siblinginfo(cpu); + cpu_clear(cpu, map); fixup_irqs(map); /* It's now safe to remove this processor from the online map */ @@ -1207,8 +1323,10 @@ void __cpu_die(unsigned int cpu) for (i = 0; i < 10; i++) { /* They ack this in play_dead by setting CPU_DEAD */ - if (per_cpu(cpu_state, cpu) == CPU_DEAD) + if (per_cpu(cpu_state, cpu) == CPU_DEAD) { + printk ("CPU %d is now offline\n", cpu); return; + } current->state = TASK_UNINTERRUPTIBLE; schedule_timeout(HZ/10); } @@ -1236,15 +1354,8 @@ int __devinit __cpu_up(unsigned int cpu) return -EIO; } -#ifdef CONFIG_HOTPLUG_CPU - /* Already up, and in cpu_quiescent now? */ - if (cpu_isset(cpu, smp_commenced_mask)) { - cpu_enable(cpu); - return 0; - } -#endif - local_irq_enable(); + per_cpu(cpu_state, cpu) = CPU_UP_PREPARE; /* Unleash the CPU! */ cpu_set(cpu, smp_commenced_mask); while (!cpu_isset(cpu, cpu_online_map)) @@ -1258,10 +1369,12 @@ void __init smp_cpus_done(unsigned int max_cpus) setup_ioapic_dest(); #endif zap_low_mappings(); +#ifndef CONFIG_HOTPLUG_CPU /* * Disable executability of the SMP trampoline: */ set_kernel_exec((unsigned long)trampoline_base, trampoline_exec); +#endif } void __init smp_intr_init(void) diff --git a/drivers/base/cpu.c b/drivers/base/cpu.c index 6ef3069b5710..bdd7e9f55c81 100644 --- a/drivers/base/cpu.c +++ b/drivers/base/cpu.c @@ -16,6 +16,10 @@ struct sysdev_class cpu_sysdev_class = { EXPORT_SYMBOL(cpu_sysdev_class); #ifdef CONFIG_HOTPLUG_CPU +#ifndef __HAVE_ARCH_SMP_PREPARE_CPU +#define smp_prepare_cpu(cpu) (0) +#endif + static ssize_t show_online(struct sys_device *dev, char *buf) { struct cpu *cpu = container_of(dev, struct cpu, sysdev); @@ -36,7 +40,9 @@ static ssize_t store_online(struct sys_device *dev, const char *buf, kobject_hotplug(&dev->kobj, KOBJ_OFFLINE); break; case '1': - ret = cpu_up(cpu->sysdev.id); + ret = smp_prepare_cpu(cpu->sysdev.id); + if (ret == 0) + ret = cpu_up(cpu->sysdev.id); break; default: ret = -EINVAL; diff --git a/include/asm-i386/irq.h b/include/asm-i386/irq.h index e2d8bf23ad70..270f1986b19f 100644 --- a/include/asm-i386/irq.h +++ b/include/asm-i386/irq.h @@ -29,9 +29,11 @@ extern void release_vm86_irqs(struct task_struct *); #ifdef CONFIG_4KSTACKS extern void irq_ctx_init(int cpu); + extern void irq_ctx_exit(int cpu); # define __ARCH_HAS_DO_SOFTIRQ #else # define irq_ctx_init(cpu) do { } while (0) +# define irq_ctx_exit(cpu) do { } while (0) #endif #ifdef CONFIG_IRQBALANCE diff --git a/include/asm-i386/smp.h b/include/asm-i386/smp.h index 2451ead0ca5c..c9996eda5408 100644 --- a/include/asm-i386/smp.h +++ b/include/asm-i386/smp.h @@ -48,6 +48,14 @@ extern void unlock_ipi_call_lock(void); #define MAX_APICID 256 extern u8 x86_cpu_to_apicid[]; +#ifdef CONFIG_HOTPLUG_CPU +extern void cpu_exit_clear(void); +extern void cpu_uninit(void); + +#define __HAVE_ARCH_SMP_PREPARE_CPU +extern int smp_prepare_cpu(int cpu); +#endif + /* * This function is needed by all SMP systems. It must _always_ be valid * from the initial startup. We map APIC_BASE very early in page_setup(), -- 2.39.5