]> git.karo-electronics.de Git - mv-sheeva.git/blobdiff - kernel/sched.c
[PATCH] debug: add sysrq_always_enabled boot option
[mv-sheeva.git] / kernel / sched.c
index 0a4a26b21f69e275a3d1e0e074f2c0c10b346a38..8a0afb97af71c99f14a01f7974a88886fd7d8f08 100644 (file)
@@ -225,7 +225,8 @@ struct rq {
        unsigned long nr_uninterruptible;
 
        unsigned long expired_timestamp;
-       unsigned long long timestamp_last_tick;
+       /* Cached timestamp set by update_cpu_clock() */
+       unsigned long long most_recent_timestamp;
        struct task_struct *curr, *idle;
        unsigned long next_balance;
        struct mm_struct *prev_mm;
@@ -427,7 +428,7 @@ static inline void task_rq_unlock(struct rq *rq, unsigned long *flags)
  * bump this up when changing the output format or the meaning of an existing
  * format, so that tools can adapt (or abort)
  */
-#define SCHEDSTAT_VERSION 12
+#define SCHEDSTAT_VERSION 14
 
 static int show_schedstat(struct seq_file *seq, void *v)
 {
@@ -465,7 +466,8 @@ static int show_schedstat(struct seq_file *seq, void *v)
                        seq_printf(seq, "domain%d %s", dcnt++, mask_str);
                        for (itype = SCHED_IDLE; itype < MAX_IDLE_TYPES;
                                        itype++) {
-                               seq_printf(seq, " %lu %lu %lu %lu %lu %lu %lu %lu",
+                               seq_printf(seq, " %lu %lu %lu %lu %lu %lu %lu "
+                                               "%lu",
                                    sd->lb_cnt[itype],
                                    sd->lb_balanced[itype],
                                    sd->lb_failed[itype],
@@ -475,11 +477,13 @@ static int show_schedstat(struct seq_file *seq, void *v)
                                    sd->lb_nobusyq[itype],
                                    sd->lb_nobusyg[itype]);
                        }
-                       seq_printf(seq, " %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu\n",
+                       seq_printf(seq, " %lu %lu %lu %lu %lu %lu %lu %lu %lu"
+                           " %lu %lu %lu\n",
                            sd->alb_cnt, sd->alb_failed, sd->alb_pushed,
                            sd->sbe_cnt, sd->sbe_balanced, sd->sbe_pushed,
                            sd->sbf_cnt, sd->sbf_balanced, sd->sbf_pushed,
-                           sd->ttwu_wake_remote, sd->ttwu_move_affine, sd->ttwu_move_balance);
+                           sd->ttwu_wake_remote, sd->ttwu_move_affine,
+                           sd->ttwu_move_balance);
                }
                preempt_enable();
 #endif
@@ -939,13 +943,16 @@ static void activate_task(struct task_struct *p, struct rq *rq, int local)
 {
        unsigned long long now;
 
+       if (rt_task(p))
+               goto out;
+
        now = sched_clock();
 #ifdef CONFIG_SMP
        if (!local) {
                /* Compensate for drifting sched_clock */
                struct rq *this_rq = this_rq();
-               now = (now - this_rq->timestamp_last_tick)
-                       + rq->timestamp_last_tick;
+               now = (now - this_rq->most_recent_timestamp)
+                       + rq->most_recent_timestamp;
        }
 #endif
 
@@ -960,8 +967,7 @@ static void activate_task(struct task_struct *p, struct rq *rq, int local)
                                     (now - p->timestamp) >> 20);
        }
 
-       if (!rt_task(p))
-               p->prio = recalc_task_prio(p, now);
+       p->prio = recalc_task_prio(p, now);
 
        /*
         * This checks to make sure it's not an uninterruptible task
@@ -986,7 +992,7 @@ static void activate_task(struct task_struct *p, struct rq *rq, int local)
                }
        }
        p->timestamp = now;
-
+out:
        __activate_task(p, rq);
 }
 
@@ -1451,7 +1457,9 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync)
 
                if (this_sd->flags & SD_WAKE_AFFINE) {
                        unsigned long tl = this_load;
-                       unsigned long tl_per_task = cpu_avg_load_per_task(this_cpu);
+                       unsigned long tl_per_task;
+
+                       tl_per_task = cpu_avg_load_per_task(this_cpu);
 
                        /*
                         * If sync wakeup then subtract the (maximum possible)
@@ -1689,8 +1697,8 @@ void fastcall wake_up_new_task(struct task_struct *p, unsigned long clone_flags)
                 * Not the local CPU - must adjust timestamp. This should
                 * get optimised away in the !CONFIG_SMP case.
                 */
-               p->timestamp = (p->timestamp - this_rq->timestamp_last_tick)
-                                       + rq->timestamp_last_tick;
+               p->timestamp = (p->timestamp - this_rq->most_recent_timestamp)
+                                       + rq->most_recent_timestamp;
                __activate_task(p, rq);
                if (TASK_PREEMPTS_CURR(p, rq))
                        resched_task(rq->curr);
@@ -2068,8 +2076,8 @@ static void pull_task(struct rq *src_rq, struct prio_array *src_array,
        set_task_cpu(p, this_cpu);
        inc_nr_running(p, this_rq);
        enqueue_task(p, this_array);
-       p->timestamp = (p->timestamp - src_rq->timestamp_last_tick)
-                               + this_rq->timestamp_last_tick;
+       p->timestamp = (p->timestamp - src_rq->most_recent_timestamp)
+                               + this_rq->most_recent_timestamp;
        /*
         * Note that idle threads have a prio of MAX_PRIO, for this test
         * to be always true for them.
@@ -2105,10 +2113,15 @@ int can_migrate_task(struct task_struct *p, struct rq *rq, int this_cpu,
         * 2) too many balance attempts have failed.
         */
 
-       if (sd->nr_balance_failed > sd->cache_nice_tries)
+       if (sd->nr_balance_failed > sd->cache_nice_tries) {
+#ifdef CONFIG_SCHEDSTATS
+               if (task_hot(p, rq->most_recent_timestamp, sd))
+                       schedstat_inc(sd, lb_hot_gained[idle]);
+#endif
                return 1;
+       }
 
-       if (task_hot(p, rq->timestamp_last_tick, sd))
+       if (task_hot(p, rq->most_recent_timestamp, sd))
                return 0;
        return 1;
 }
@@ -2206,11 +2219,6 @@ skip_queue:
                goto skip_bitmap;
        }
 
-#ifdef CONFIG_SCHEDSTATS
-       if (task_hot(tmp, busiest->timestamp_last_tick, sd))
-               schedstat_inc(sd, lb_hot_gained[idle]);
-#endif
-
        pull_task(busiest, array, tmp, this_rq, dst_array, this_cpu);
        pulled++;
        rem_load_move -= tmp->load_weight;
@@ -2248,7 +2256,7 @@ out:
 static struct sched_group *
 find_busiest_group(struct sched_domain *sd, int this_cpu,
                   unsigned long *imbalance, enum idle_type idle, int *sd_idle,
-                  cpumask_t *cpus)
+                  cpumask_t *cpus, int *balance)
 {
        struct sched_group *busiest = NULL, *this = NULL, *group = sd->groups;
        unsigned long max_load, avg_load, total_load, this_load, total_pwr;
@@ -2277,10 +2285,14 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
                unsigned long load, group_capacity;
                int local_group;
                int i;
+               unsigned int balance_cpu = -1, first_idle_cpu = 0;
                unsigned long sum_nr_running, sum_weighted_load;
 
                local_group = cpu_isset(this_cpu, group->cpumask);
 
+               if (local_group)
+                       balance_cpu = first_cpu(group->cpumask);
+
                /* Tally up the load of all CPUs in the group */
                sum_weighted_load = sum_nr_running = avg_load = 0;
 
@@ -2296,9 +2308,14 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
                                *sd_idle = 0;
 
                        /* Bias balancing toward cpus of our domain */
-                       if (local_group)
+                       if (local_group) {
+                               if (idle_cpu(i) && !first_idle_cpu) {
+                                       first_idle_cpu = 1;
+                                       balance_cpu = i;
+                               }
+
                                load = target_load(i, load_idx);
-                       else
+                       else
                                load = source_load(i, load_idx);
 
                        avg_load += load;
@@ -2306,6 +2323,16 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
                        sum_weighted_load += rq->raw_weighted_load;
                }
 
+               /*
+                * First idle cpu or the first cpu(busiest) in this sched group
+                * is eligible for doing load balancing at this and above
+                * domains.
+                */
+               if (local_group && balance_cpu != this_cpu && balance) {
+                       *balance = 0;
+                       goto ret;
+               }
+
                total_load += avg_load;
                total_pwr += group->cpu_power;
 
@@ -2465,18 +2492,21 @@ small_imbalance:
                pwr_now /= SCHED_LOAD_SCALE;
 
                /* Amount of load we'd subtract */
-               tmp = busiest_load_per_task*SCHED_LOAD_SCALE/busiest->cpu_power;
+               tmp = busiest_load_per_task * SCHED_LOAD_SCALE /
+                       busiest->cpu_power;
                if (max_load > tmp)
                        pwr_move += busiest->cpu_power *
                                min(busiest_load_per_task, max_load - tmp);
 
                /* Amount of load we'd add */
-               if (max_load*busiest->cpu_power <
-                               busiest_load_per_task*SCHED_LOAD_SCALE)
-                       tmp = max_load*busiest->cpu_power/this->cpu_power;
+               if (max_load * busiest->cpu_power <
+                               busiest_load_per_task * SCHED_LOAD_SCALE)
+                       tmp = max_load * busiest->cpu_power / this->cpu_power;
                else
-                       tmp = busiest_load_per_task*SCHED_LOAD_SCALE/this->cpu_power;
-               pwr_move += this->cpu_power*min(this_load_per_task, this_load + tmp);
+                       tmp = busiest_load_per_task * SCHED_LOAD_SCALE /
+                               this->cpu_power;
+               pwr_move += this->cpu_power *
+                       min(this_load_per_task, this_load + tmp);
                pwr_move /= SCHED_LOAD_SCALE;
 
                /* Move if we gain throughput */
@@ -2497,8 +2527,8 @@ out_balanced:
                *imbalance = min_load_per_task;
                return group_min;
        }
-ret:
 #endif
+ret:
        *imbalance = 0;
        return NULL;
 }
@@ -2549,7 +2579,8 @@ static inline unsigned long minus_1_or_zero(unsigned long n)
  * tasks if there is an imbalance.
  */
 static int load_balance(int this_cpu, struct rq *this_rq,
-                       struct sched_domain *sd, enum idle_type idle)
+                       struct sched_domain *sd, enum idle_type idle,
+                       int *balance)
 {
        int nr_moved, all_pinned = 0, active_balance = 0, sd_idle = 0;
        struct sched_group *group;
@@ -2572,7 +2603,11 @@ static int load_balance(int this_cpu, struct rq *this_rq,
 
 redo:
        group = find_busiest_group(sd, this_cpu, &imbalance, idle, &sd_idle,
-                                                       &cpus);
+                                  &cpus, balance);
+
+       if (*balance == 0)
+               goto out_balanced;
+
        if (!group) {
                schedstat_inc(sd, lb_nobusyg[idle]);
                goto out_balanced;
@@ -2714,7 +2749,7 @@ load_balance_newidle(int this_cpu, struct rq *this_rq, struct sched_domain *sd)
        schedstat_inc(sd, lb_cnt[NEWLY_IDLE]);
 redo:
        group = find_busiest_group(sd, this_cpu, &imbalance, NEWLY_IDLE,
-                               &sd_idle, &cpus);
+                                  &sd_idle, &cpus, NULL);
        if (!group) {
                schedstat_inc(sd, lb_nobusyg[NEWLY_IDLE]);
                goto out_balanced;
@@ -2880,10 +2915,11 @@ static void update_load(struct rq *this_rq)
  *
  * Balancing parameters are set up in arch_init_sched_domains.
  */
+static DEFINE_SPINLOCK(balancing);
 
 static void run_rebalance_domains(struct softirq_action *h)
 {
-       int this_cpu = smp_processor_id();
+       int this_cpu = smp_processor_id(), balance = 1;
        struct rq *this_rq = cpu_rq(this_cpu);
        unsigned long interval;
        struct sched_domain *sd;
@@ -2909,8 +2945,13 @@ static void run_rebalance_domains(struct softirq_action *h)
                if (unlikely(!interval))
                        interval = 1;
 
+               if (sd->flags & SD_SERIALIZE) {
+                       if (!spin_trylock(&balancing))
+                               goto out;
+               }
+
                if (time_after_eq(jiffies, sd->last_balance + interval)) {
-                       if (load_balance(this_cpu, this_rq, sd, idle)) {
+                       if (load_balance(this_cpu, this_rq, sd, idle, &balance)) {
                                /*
                                 * We've pulled tasks over so either we're no
                                 * longer idle, or one of our SMT siblings is
@@ -2920,8 +2961,19 @@ static void run_rebalance_domains(struct softirq_action *h)
                        }
                        sd->last_balance = jiffies;
                }
+               if (sd->flags & SD_SERIALIZE)
+                       spin_unlock(&balancing);
+out:
                if (time_after(next_balance, sd->last_balance + interval))
                        next_balance = sd->last_balance + interval;
+
+               /*
+                * Stop the load balance at this level. There is another
+                * CPU in our sched group which is doing load balancing more
+                * actively.
+                */
+               if (!balance)
+                       break;
        }
        this_rq->next_balance = next_balance;
 }
@@ -2962,7 +3014,8 @@ EXPORT_PER_CPU_SYMBOL(kstat);
 static inline void
 update_cpu_clock(struct task_struct *p, struct rq *rq, unsigned long long now)
 {
-       p->sched_time += now - max(p->timestamp, rq->timestamp_last_tick);
+       p->sched_time += now - p->last_ran;
+       p->last_ran = rq->most_recent_timestamp = now;
 }
 
 /*
@@ -2975,8 +3028,7 @@ unsigned long long current_sched_time(const struct task_struct *p)
        unsigned long flags;
 
        local_irq_save(flags);
-       ns = max(p->timestamp, task_rq(p)->timestamp_last_tick);
-       ns = p->sched_time + sched_clock() - ns;
+       ns = p->sched_time + sched_clock() - p->last_ran;
        local_irq_restore(flags);
 
        return ns;
@@ -3167,8 +3219,6 @@ void scheduler_tick(void)
 
        update_cpu_clock(p, rq, now);
 
-       rq->timestamp_last_tick = now;
-
        if (p == rq->idle)
                /* Task on the idle queue */
                wake_priority_sleeper(rq);
@@ -3324,7 +3374,8 @@ void fastcall add_preempt_count(int val)
        /*
         * Spinlock count overflowing soon?
         */
-       DEBUG_LOCKS_WARN_ON((preempt_count() & PREEMPT_MASK) >= PREEMPT_MASK-10);
+       DEBUG_LOCKS_WARN_ON((preempt_count() & PREEMPT_MASK) >=
+                               PREEMPT_MASK - 10);
 }
 EXPORT_SYMBOL(add_preempt_count);
 
@@ -5023,8 +5074,8 @@ static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu)
                 * afterwards, and pretending it was a local activate.
                 * This way is cleaner and logically correct.
                 */
-               p->timestamp = p->timestamp - rq_src->timestamp_last_tick
-                               + rq_dest->timestamp_last_tick;
+               p->timestamp = p->timestamp - rq_src->most_recent_timestamp
+                               + rq_dest->most_recent_timestamp;
                deactivate_task(p, rq_src);
                __activate_task(p, rq_dest);
                if (TASK_PREEMPTS_CURR(p, rq_dest))
@@ -5397,16 +5448,19 @@ static void sched_domain_debug(struct sched_domain *sd, int cpu)
                if (!(sd->flags & SD_LOAD_BALANCE)) {
                        printk("does not load-balance\n");
                        if (sd->parent)
-                               printk(KERN_ERR "ERROR: !SD_LOAD_BALANCE domain has parent");
+                               printk(KERN_ERR "ERROR: !SD_LOAD_BALANCE domain"
+                                               " has parent");
                        break;
                }
 
                printk("span %s\n", str);
 
                if (!cpu_isset(cpu, sd->span))
-                       printk(KERN_ERR "ERROR: domain->span does not contain CPU%d\n", cpu);
+                       printk(KERN_ERR "ERROR: domain->span does not contain "
+                                       "CPU%d\n", cpu);
                if (!cpu_isset(cpu, group->cpumask))
-                       printk(KERN_ERR "ERROR: domain->groups does not contain CPU%d\n", cpu);
+                       printk(KERN_ERR "ERROR: domain->groups does not contain"
+                                       " CPU%d\n", cpu);
 
                printk(KERN_DEBUG);
                for (i = 0; i < level + 2; i++)
@@ -5421,7 +5475,8 @@ static void sched_domain_debug(struct sched_domain *sd, int cpu)
 
                        if (!group->cpu_power) {
                                printk("\n");
-                               printk(KERN_ERR "ERROR: domain->cpu_power not set\n");
+                               printk(KERN_ERR "ERROR: domain->cpu_power not "
+                                               "set\n");
                        }
 
                        if (!cpus_weight(group->cpumask)) {
@@ -5444,15 +5499,17 @@ static void sched_domain_debug(struct sched_domain *sd, int cpu)
                printk("\n");
 
                if (!cpus_equal(sd->span, groupmask))
-                       printk(KERN_ERR "ERROR: groups don't span domain->span\n");
+                       printk(KERN_ERR "ERROR: groups don't span "
+                                       "domain->span\n");
 
                level++;
                sd = sd->parent;
+               if (!sd)
+                       continue;
 
-               if (sd) {
-                       if (!cpus_subset(groupmask, sd->span))
-                               printk(KERN_ERR "ERROR: parent span is not a superset of domain->span\n");
-               }
+               if (!cpus_subset(groupmask, sd->span))
+                       printk(KERN_ERR "ERROR: parent span is not a superset "
+                               "of domain->span\n");
 
        } while (sd);
 }
@@ -5770,8 +5827,9 @@ __setup("max_cache_size=", setup_max_cache_size);
  */
 static void touch_cache(void *__cache, unsigned long __size)
 {
-       unsigned long size = __size/sizeof(long), chunk1 = size/3,
-                       chunk2 = 2*size/3;
+       unsigned long size = __size / sizeof(long);
+       unsigned long chunk1 = size / 3;
+       unsigned long chunk2 = 2 * size / 3;
        unsigned long *cache = __cache;
        int i;
 
@@ -5880,11 +5938,11 @@ measure_cost(int cpu1, int cpu2, void *cache, unsigned int size)
         */
        measure_one(cache, size, cpu1, cpu2);
        for (i = 0; i < ITERATIONS; i++)
-               cost1 += measure_one(cache, size - i*1024, cpu1, cpu2);
+               cost1 += measure_one(cache, size - i * 1024, cpu1, cpu2);
 
        measure_one(cache, size, cpu2, cpu1);
        for (i = 0; i < ITERATIONS; i++)
-               cost1 += measure_one(cache, size - i*1024, cpu2, cpu1);
+               cost1 += measure_one(cache, size - i * 1024, cpu2, cpu1);
 
        /*
         * (We measure the non-migrating [cached] cost on both
@@ -5894,17 +5952,17 @@ measure_cost(int cpu1, int cpu2, void *cache, unsigned int size)
 
        measure_one(cache, size, cpu1, cpu1);
        for (i = 0; i < ITERATIONS; i++)
-               cost2 += measure_one(cache, size - i*1024, cpu1, cpu1);
+               cost2 += measure_one(cache, size - i * 1024, cpu1, cpu1);
 
        measure_one(cache, size, cpu2, cpu2);
        for (i = 0; i < ITERATIONS; i++)
-               cost2 += measure_one(cache, size - i*1024, cpu2, cpu2);
+               cost2 += measure_one(cache, size - i * 1024, cpu2, cpu2);
 
        /*
         * Get the per-iteration migration cost:
         */
-       do_div(cost1, 2*ITERATIONS);
-       do_div(cost2, 2*ITERATIONS);
+       do_div(cost1, 2 * ITERATIONS);
+       do_div(cost2, 2 * ITERATIONS);
 
        return cost1 - cost2;
 }
@@ -5942,7 +6000,7 @@ static unsigned long long measure_migration_cost(int cpu1, int cpu2)
         */
        cache = vmalloc(max_size);
        if (!cache) {
-               printk("could not vmalloc %d bytes for cache!\n", 2*max_size);
+               printk("could not vmalloc %d bytes for cache!\n", 2 * max_size);
                return 1000000; /* return 1 msec on very small boxen */
        }
 
@@ -5967,7 +6025,8 @@ static unsigned long long measure_migration_cost(int cpu1, int cpu2)
                avg_fluct = (avg_fluct + fluct)/2;
 
                if (migration_debug)
-                       printk("-> [%d][%d][%7d] %3ld.%ld [%3ld.%ld] (%ld): (%8Ld %8Ld)\n",
+                       printk("-> [%d][%d][%7d] %3ld.%ld [%3ld.%ld] (%ld): "
+                               "(%8Ld %8Ld)\n",
                                cpu1, cpu2, size,
                                (long)cost / 1000000,
                                ((long)cost / 100000) % 10,
@@ -6062,20 +6121,18 @@ static void calibrate_migration_costs(const cpumask_t *cpu_map)
                        -1
 #endif
                );
-       if (system_state == SYSTEM_BOOTING) {
-               if (num_online_cpus() > 1) {
-                       printk("migration_cost=");
-                       for (distance = 0; distance <= max_distance; distance++) {
-                               if (distance)
-                                       printk(",");
-                               printk("%ld", (long)migration_cost[distance] / 1000);
-                       }
-                       printk("\n");
+       if (system_state == SYSTEM_BOOTING && num_online_cpus() > 1) {
+               printk("migration_cost=");
+               for (distance = 0; distance <= max_distance; distance++) {
+                       if (distance)
+                               printk(",");
+                       printk("%ld", (long)migration_cost[distance] / 1000);
                }
+               printk("\n");
        }
        j1 = jiffies;
        if (migration_debug)
-               printk("migration: %ld seconds\n", (j1-j0)/HZ);
+               printk("migration: %ld seconds\n", (j1-j0) / HZ);
 
        /*
         * Move back to the original CPU. NUMA-Q gets confused