kernel/sched/cputime.c

   1 #include <linux/export.h>
   2 #include <linux/sched.h>
   3 #include <linux/tsacct_kern.h>
   4 #include <linux/kernel_stat.h>
   5 #include <linux/static_key.h>
   6 #include "sched.h"
   7
   8
   9 #ifdef CONFIG_IRQ_TIME_ACCOUNTING
  10
  11 /*
  12  * There are no locks covering percpu hardirq/softirq time.
  13  * They are only modified in vtime_account, on corresponding CPU
  14  * with interrupts disabled. So, writes are safe.
  15  * They are read and saved off onto struct rq in update_rq_clock().
  16  * This may result in other CPU reading this CPU's irq time and can
  17  * race with irq/vtime_account on this CPU. We would either get old
  18  * or new value with a side effect of accounting a slice of irq time to wrong
  19  * task when irq is in progress while we read rq->clock. That is a worthy
  20  * compromise in place of having locks on each irq in account_system_time.
  21  */
  22 DEFINE_PER_CPU(u64, cpu_hardirq_time);
  23 DEFINE_PER_CPU(u64, cpu_softirq_time);
  24
  25 static DEFINE_PER_CPU(u64, irq_start_time);
  26 static int sched_clock_irqtime;
  27
  28 void enable_sched_clock_irqtime(void)
  29 {
  30         sched_clock_irqtime = 1;
  31 }
  32
  33 void disable_sched_clock_irqtime(void)
  34 {
  35         sched_clock_irqtime = 0;
  36 }
  37
  38 #ifndef CONFIG_64BIT
  39 DEFINE_PER_CPU(seqcount_t, irq_time_seq);
  40 #endif /* CONFIG_64BIT */
  41
  42 /*
  43  * Called before incrementing preempt_count on {soft,}irq_enter
  44  * and before decrementing preempt_count on {soft,}irq_exit.
  45  */
  46 void irqtime_account_irq(struct task_struct *curr)
  47 {
  48         unsigned long flags;
  49         s64 delta;
  50         int cpu;
  51
  52         if (!sched_clock_irqtime)
  53                 return;
  54
  55         local_irq_save(flags);
  56
  57         cpu = smp_processor_id();
  58         delta = sched_clock_cpu(cpu) - __this_cpu_read(irq_start_time);
  59         __this_cpu_add(irq_start_time, delta);
  60
  61         irq_time_write_begin();
  62         /*
  63          * We do not account for softirq time from ksoftirqd here.
  64          * We want to continue accounting softirq time to ksoftirqd thread
  65          * in that case, so as not to confuse scheduler with a special task
  66          * that do not consume any time, but still wants to run.
  67          */
  68         if (hardirq_count())
  69                 __this_cpu_add(cpu_hardirq_time, delta);
  70         else if (in_serving_softirq() && curr != this_cpu_ksoftirqd())
  71                 __this_cpu_add(cpu_softirq_time, delta);
  72
  73         irq_time_write_end();
  74         local_irq_restore(flags);
  75 }
  76 EXPORT_SYMBOL_GPL(irqtime_account_irq);
  77
  78 static int irqtime_account_hi_update(void)
  79 {
  80         u64 *cpustat = kcpustat_this_cpu->cpustat;
  81         unsigned long flags;
  82         u64 latest_ns;
  83         int ret = 0;
  84
  85         local_irq_save(flags);
  86         latest_ns = this_cpu_read(cpu_hardirq_time);
  87         if (nsecs_to_cputime64(latest_ns) > cpustat[CPUTIME_IRQ])
  88                 ret = 1;
  89         local_irq_restore(flags);
  90         return ret;
  91 }
  92
  93 static int irqtime_account_si_update(void)
  94 {
  95         u64 *cpustat = kcpustat_this_cpu->cpustat;
  96         unsigned long flags;
  97         u64 latest_ns;
  98         int ret = 0;
  99
 100         local_irq_save(flags);
 101         latest_ns = this_cpu_read(cpu_softirq_time);
 102         if (nsecs_to_cputime64(latest_ns) > cpustat[CPUTIME_SOFTIRQ])
 103                 ret = 1;
 104         local_irq_restore(flags);
 105         return ret;
 106 }
 107
 108 #else /* CONFIG_IRQ_TIME_ACCOUNTING */
 109
 110 #define sched_clock_irqtime     (0)
 111
 112 #endif /* !CONFIG_IRQ_TIME_ACCOUNTING */
 113
 114 static inline void task_group_account_field(struct task_struct *p, int index,
 115                                             u64 tmp)
 116 {
 117 #ifdef CONFIG_CGROUP_CPUACCT
 118         struct kernel_cpustat *kcpustat;
 119         struct cpuacct *ca;
 120 #endif
 121         /*
 122          * Since all updates are sure to touch the root cgroup, we
 123          * get ourselves ahead and touch it first. If the root cgroup
 124          * is the only cgroup, then nothing else should be necessary.
 125          *
 126          */
 127         __get_cpu_var(kernel_cpustat).cpustat[index] += tmp;
 128
 129 #ifdef CONFIG_CGROUP_CPUACCT
 130         if (unlikely(!cpuacct_subsys.active))
 131                 return;
 132
 133         rcu_read_lock();
 134         ca = task_ca(p);
 135         while (ca && (ca != &root_cpuacct)) {
 136                 kcpustat = this_cpu_ptr(ca->cpustat);
 137                 kcpustat->cpustat[index] += tmp;
 138                 ca = parent_ca(ca);
 139         }
 140         rcu_read_unlock();
 141 #endif
 142 }
 143
 144 /*
 145  * Account user cpu time to a process.
 146  * @p: the process that the cpu time gets accounted to
 147  * @cputime: the cpu time spent in user space since the last update
 148  * @cputime_scaled: cputime scaled by cpu frequency
 149  */
 150 void account_user_time(struct task_struct *p, cputime_t cputime,
 151                        cputime_t cputime_scaled)
 152 {
 153         int index;
 154
 155         /* Add user time to process. */
 156         p->utime += cputime;
 157         p->utimescaled += cputime_scaled;
 158         account_group_user_time(p, cputime);
 159
 160         index = (TASK_NICE(p) > 0) ? CPUTIME_NICE : CPUTIME_USER;
 161
 162         /* Add user time to cpustat. */
 163         task_group_account_field(p, index, (__force u64) cputime);
 164
 165         /* Account for user time used */
 166         acct_update_integrals(p);
 167 }
 168
 169 /*
 170  * Account guest cpu time to a process.
 171  * @p: the process that the cpu time gets accounted to
 172  * @cputime: the cpu time spent in virtual machine since the last update
 173  * @cputime_scaled: cputime scaled by cpu frequency
 174  */
 175 static void account_guest_time(struct task_struct *p, cputime_t cputime,
 176                                cputime_t cputime_scaled)
 177 {
 178         u64 *cpustat = kcpustat_this_cpu->cpustat;
 179
 180         /* Add guest time to process. */
 181         p->utime += cputime;
 182         p->utimescaled += cputime_scaled;
 183         account_group_user_time(p, cputime);
 184         p->gtime += cputime;
 185
 186         /* Add guest time to cpustat. */
 187         if (TASK_NICE(p) > 0) {
 188                 cpustat[CPUTIME_NICE] += (__force u64) cputime;
 189                 cpustat[CPUTIME_GUEST_NICE] += (__force u64) cputime;
 190         } else {
 191                 cpustat[CPUTIME_USER] += (__force u64) cputime;
 192                 cpustat[CPUTIME_GUEST] += (__force u64) cputime;
 193         }
 194 }
 195
 196 /*
 197  * Account system cpu time to a process and desired cpustat field
 198  * @p: the process that the cpu time gets accounted to
 199  * @cputime: the cpu time spent in kernel space since the last update
 200  * @cputime_scaled: cputime scaled by cpu frequency
 201  * @target_cputime64: pointer to cpustat field that has to be updated
 202  */
 203 static inline
 204 void __account_system_time(struct task_struct *p, cputime_t cputime,
 205                         cputime_t cputime_scaled, int index)
 206 {
 207         /* Add system time to process. */
 208         p->stime += cputime;
 209         p->stimescaled += cputime_scaled;
 210         account_group_system_time(p, cputime);
 211
 212         /* Add system time to cpustat. */
 213         task_group_account_field(p, index, (__force u64) cputime);
 214
 215         /* Account for system time used */
 216         acct_update_integrals(p);
 217 }
 218
 219 /*
 220  * Account system cpu time to a process.
 221  * @p: the process that the cpu time gets accounted to
 222  * @hardirq_offset: the offset to subtract from hardirq_count()
 223  * @cputime: the cpu time spent in kernel space since the last update
 224  * @cputime_scaled: cputime scaled by cpu frequency
 225  */
 226 void account_system_time(struct task_struct *p, int hardirq_offset,
 227                          cputime_t cputime, cputime_t cputime_scaled)
 228 {
 229         int index;
 230
 231         if ((p->flags & PF_VCPU) && (irq_count() - hardirq_offset == 0)) {
 232                 account_guest_time(p, cputime, cputime_scaled);
 233                 return;
 234         }
 235
 236         if (hardirq_count() - hardirq_offset)
 237                 index = CPUTIME_IRQ;
 238         else if (in_serving_softirq())
 239                 index = CPUTIME_SOFTIRQ;
 240         else
 241                 index = CPUTIME_SYSTEM;
 242
 243         __account_system_time(p, cputime, cputime_scaled, index);
 244 }
 245
 246 /*
 247  * Account for involuntary wait time.
 248  * @cputime: the cpu time spent in involuntary wait
 249  */
 250 void account_steal_time(cputime_t cputime)
 251 {
 252         u64 *cpustat = kcpustat_this_cpu->cpustat;
 253
 254         cpustat[CPUTIME_STEAL] += (__force u64) cputime;
 255 }
 256
 257 /*
 258  * Account for idle time.
 259  * @cputime: the cpu time spent in idle wait
 260  */
 261 void account_idle_time(cputime_t cputime)
 262 {
 263         u64 *cpustat = kcpustat_this_cpu->cpustat;
 264         struct rq *rq = this_rq();
 265
 266         if (atomic_read(&rq->nr_iowait) > 0)
 267                 cpustat[CPUTIME_IOWAIT] += (__force u64) cputime;
 268         else
 269                 cpustat[CPUTIME_IDLE] += (__force u64) cputime;
 270 }
 271
 272 static __always_inline bool steal_account_process_tick(void)
 273 {
 274 #ifdef CONFIG_PARAVIRT
 275         if (static_key_false(&paravirt_steal_enabled)) {
 276                 u64 steal, st = 0;
 277
 278                 steal = paravirt_steal_clock(smp_processor_id());
 279                 steal -= this_rq()->prev_steal_time;
 280
 281                 st = steal_ticks(steal);
 282                 this_rq()->prev_steal_time += st * TICK_NSEC;
 283
 284                 account_steal_time(st);
 285                 return st;
 286         }
 287 #endif
 288         return false;
 289 }
 290
 291 /*
 292  * Accumulate raw cputime values of dead tasks (sig->[us]time) and live
 293  * tasks (sum on group iteration) belonging to @tsk's group.
 294  */
 295 void thread_group_cputime(struct task_struct *tsk, struct task_cputime *times)
 296 {
 297         struct signal_struct *sig = tsk->signal;
 298         struct task_struct *t;
 299
 300         times->utime = sig->utime;
 301         times->stime = sig->stime;
 302         times->sum_exec_runtime = sig->sum_sched_runtime;
 303
 304         rcu_read_lock();
 305         /* make sure we can trust tsk->thread_group list */
 306         if (!likely(pid_alive(tsk)))
 307                 goto out;
 308
 309         t = tsk;
 310         do {
 311                 times->utime += t->utime;
 312                 times->stime += t->stime;
 313                 times->sum_exec_runtime += task_sched_runtime(t);
 314         } while_each_thread(tsk, t);
 315 out:
 316         rcu_read_unlock();
 317 }
 318
 319 #ifndef CONFIG_VIRT_CPU_ACCOUNTING
 320
 321 #ifdef CONFIG_IRQ_TIME_ACCOUNTING
 322 /*
 323  * Account a tick to a process and cpustat
 324  * @p: the process that the cpu time gets accounted to
 325  * @user_tick: is the tick from userspace
 326  * @rq: the pointer to rq
 327  *
 328  * Tick demultiplexing follows the order
 329  * - pending hardirq update
 330  * - pending softirq update
 331  * - user_time
 332  * - idle_time
 333  * - system time
 334  *   - check for guest_time
 335  *   - else account as system_time
 336  *
 337  * Check for hardirq is done both for system and user time as there is
 338  * no timer going off while we are on hardirq and hence we may never get an
 339  * opportunity to update it solely in system time.
 340  * p->stime and friends are only updated on system time and not on irq
 341  * softirq as those do not count in task exec_runtime any more.
 342  */
 343 static void irqtime_account_process_tick(struct task_struct *p, int user_tick,
 344                                                 struct rq *rq)
 345 {
 346         cputime_t one_jiffy_scaled = cputime_to_scaled(cputime_one_jiffy);
 347         u64 *cpustat = kcpustat_this_cpu->cpustat;
 348
 349         if (steal_account_process_tick())
 350                 return;
 351
 352         if (irqtime_account_hi_update()) {
 353                 cpustat[CPUTIME_IRQ] += (__force u64) cputime_one_jiffy;
 354         } else if (irqtime_account_si_update()) {
 355                 cpustat[CPUTIME_SOFTIRQ] += (__force u64) cputime_one_jiffy;
 356         } else if (this_cpu_ksoftirqd() == p) {
 357                 /*
 358                  * ksoftirqd time do not get accounted in cpu_softirq_time.
 359                  * So, we have to handle it separately here.
 360                  * Also, p->stime needs to be updated for ksoftirqd.
 361                  */
 362                 __account_system_time(p, cputime_one_jiffy, one_jiffy_scaled,
 363                                         CPUTIME_SOFTIRQ);
 364         } else if (user_tick) {
 365                 account_user_time(p, cputime_one_jiffy, one_jiffy_scaled);
 366         } else if (p == rq->idle) {
 367                 account_idle_time(cputime_one_jiffy);
 368         } else if (p->flags & PF_VCPU) { /* System time or guest time */
 369                 account_guest_time(p, cputime_one_jiffy, one_jiffy_scaled);
 370         } else {
 371                 __account_system_time(p, cputime_one_jiffy, one_jiffy_scaled,
 372                                         CPUTIME_SYSTEM);
 373         }
 374 }
 375
 376 static void irqtime_account_idle_ticks(int ticks)
 377 {
 378         int i;
 379         struct rq *rq = this_rq();
 380
 381         for (i = 0; i < ticks; i++)
 382                 irqtime_account_process_tick(current, 0, rq);
 383 }
 384 #else /* CONFIG_IRQ_TIME_ACCOUNTING */
 385 static void irqtime_account_idle_ticks(int ticks) {}
 386 static void irqtime_account_process_tick(struct task_struct *p, int user_tick,
 387                                                 struct rq *rq) {}
 388 #endif /* CONFIG_IRQ_TIME_ACCOUNTING */
 389
 390 /*
 391  * Account a single tick of cpu time.
 392  * @p: the process that the cpu time gets accounted to
 393  * @user_tick: indicates if the tick is a user or a system tick
 394  */
 395 void account_process_tick(struct task_struct *p, int user_tick)
 396 {
 397         cputime_t one_jiffy_scaled = cputime_to_scaled(cputime_one_jiffy);
 398         struct rq *rq = this_rq();
 399
 400         if (sched_clock_irqtime) {
 401                 irqtime_account_process_tick(p, user_tick, rq);
 402                 return;
 403         }
 404
 405         if (steal_account_process_tick())
 406                 return;
 407
 408         if (user_tick)
 409                 account_user_time(p, cputime_one_jiffy, one_jiffy_scaled);
 410         else if ((p != rq->idle) || (irq_count() != HARDIRQ_OFFSET))
 411                 account_system_time(p, HARDIRQ_OFFSET, cputime_one_jiffy,
 412                                     one_jiffy_scaled);
 413         else
 414                 account_idle_time(cputime_one_jiffy);
 415 }
 416
 417 /*
 418  * Account multiple ticks of steal time.
 419  * @p: the process from which the cpu time has been stolen
 420  * @ticks: number of stolen ticks
 421  */
 422 void account_steal_ticks(unsigned long ticks)
 423 {
 424         account_steal_time(jiffies_to_cputime(ticks));
 425 }
 426
 427 /*
 428  * Account multiple ticks of idle time.
 429  * @ticks: number of stolen ticks
 430  */
 431 void account_idle_ticks(unsigned long ticks)
 432 {
 433
 434         if (sched_clock_irqtime) {
 435                 irqtime_account_idle_ticks(ticks);
 436                 return;
 437         }
 438
 439         account_idle_time(jiffies_to_cputime(ticks));
 440 }
 441
 442 #endif
 443
 444 /*
 445  * Use precise platform statistics if available:
 446  */
 447 #ifdef CONFIG_VIRT_CPU_ACCOUNTING
 448 void task_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st)
 449 {
 450         *ut = p->utime;
 451         *st = p->stime;
 452 }
 453
 454 void thread_group_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st)
 455 {
 456         struct task_cputime cputime;
 457
 458         thread_group_cputime(p, &cputime);
 459
 460         *ut = cputime.utime;
 461         *st = cputime.stime;
 462 }
 463
 464 void vtime_account_system(struct task_struct *tsk)
 465 {
 466         unsigned long flags;
 467
 468         local_irq_save(flags);
 469         __vtime_account_system(tsk);
 470         local_irq_restore(flags);
 471 }
 472 EXPORT_SYMBOL_GPL(vtime_account_system);
 473
 474 /*
 475  * Archs that account the whole time spent in the idle task
 476  * (outside irq) as idle time can rely on this and just implement
 477  * __vtime_account_system() and __vtime_account_idle(). Archs that
 478  * have other meaning of the idle time (s390 only includes the
 479  * time spent by the CPU when it's in low power mode) must override
 480  * vtime_account().
 481  */
 482 #ifndef __ARCH_HAS_VTIME_ACCOUNT
 483 void vtime_account(struct task_struct *tsk)
 484 {
 485         unsigned long flags;
 486
 487         local_irq_save(flags);
 488
 489         if (in_interrupt() || !is_idle_task(tsk))
 490                 __vtime_account_system(tsk);
 491         else
 492                 __vtime_account_idle(tsk);
 493
 494         local_irq_restore(flags);
 495 }
 496 EXPORT_SYMBOL_GPL(vtime_account);
 497 #endif /* __ARCH_HAS_VTIME_ACCOUNT */
 498
 499 #else
 500
 501 #ifndef nsecs_to_cputime
 502 # define nsecs_to_cputime(__nsecs)      nsecs_to_jiffies(__nsecs)
 503 #endif
 504
 505 static cputime_t scale_utime(cputime_t utime, cputime_t rtime, cputime_t total)
 506 {
 507         u64 temp = (__force u64) rtime;
 508
 509         temp *= (__force u64) utime;
 510
 511         if (sizeof(cputime_t) == 4)
 512                 temp = div_u64(temp, (__force u32) total);
 513         else
 514                 temp = div64_u64(temp, (__force u64) total);
 515
 516         return (__force cputime_t) temp;
 517 }
 518
 519 /*
 520  * Adjust tick based cputime random precision against scheduler
 521  * runtime accounting.
 522  */
 523 static void cputime_adjust(struct task_cputime *curr,
 524                            struct cputime *prev,
 525                            cputime_t *ut, cputime_t *st)
 526 {
 527         cputime_t rtime, utime, total;
 528
 529         utime = curr->utime;
 530         total = utime + curr->stime;
 531
 532         /*
 533          * Tick based cputime accounting depend on random scheduling
 534          * timeslices of a task to be interrupted or not by the timer.
 535          * Depending on these circumstances, the number of these interrupts
 536          * may be over or under-optimistic, matching the real user and system
 537          * cputime with a variable precision.
 538          *
 539          * Fix this by scaling these tick based values against the total
 540          * runtime accounted by the CFS scheduler.
 541          */
 542         rtime = nsecs_to_cputime(curr->sum_exec_runtime);
 543
 544         if (total)
 545                 utime = scale_utime(utime, rtime, total);
 546         else
 547                 utime = rtime;
 548
 549         /*
 550          * If the tick based count grows faster than the scheduler one,
 551          * the result of the scaling may go backward.
 552          * Let's enforce monotonicity.
 553          */
 554         prev->utime = max(prev->utime, utime);
 555         prev->stime = max(prev->stime, rtime - prev->utime);
 556
 557         *ut = prev->utime;
 558         *st = prev->stime;
 559 }
 560
 561 void task_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st)
 562 {
 563         struct task_cputime cputime = {
 564                 .utime = p->utime,
 565                 .stime = p->stime,
 566                 .sum_exec_runtime = p->se.sum_exec_runtime,
 567         };
 568
 569         cputime_adjust(&cputime, &p->prev_cputime, ut, st);
 570 }
 571
 572 /*
 573  * Must be called with siglock held.
 574  */
 575 void thread_group_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st)
 576 {
 577         struct task_cputime cputime;
 578
 579         thread_group_cputime(p, &cputime);
 580         cputime_adjust(&cputime, &p->signal->prev_cputime, ut, st);
 581 }
 582 #endif