#include <linux/export.h>
#include <linux/sched.h>
#include <linux/tsacct_kern.h>
#include <linux/kernel_stat.h>
#include <linux/static_key.h>
#include <linux/context_tracking.h>
#include <linux/sched/cputime.h>
#include "sched.h"

#ifdef CONFIG_IRQ_TIME_ACCOUNTING

/*
 * There are no locks covering percpu hardirq/softirq time.
 * They are only modified in vtime_account, on corresponding CPU
 * with interrupts disabled. So, writes are safe.
 * They are read and saved off onto struct rq in update_rq_clock().
 * This may result in another CPU reading this CPU's irq time and can
 * race with irq/vtime_account on this CPU. We would either get old
 * or new value with a side effect of accounting a slice of irq time to the
 * wrong task when an irq is in progress while we read rq->clock. That is a
 * worthy compromise in place of having locks on each irq in account_system_time.
 */
DEFINE_PER_CPU(struct irqtime, cpu_irqtime);

static int sched_clock_irqtime;

void enable_sched_clock_irqtime(void)
{
	sched_clock_irqtime = 1;
}

void disable_sched_clock_irqtime(void)
{
	sched_clock_irqtime = 0;
}

/*
 * Called before incrementing preempt_count on {soft,}irq_enter
 * and before decrementing preempt_count on {soft,}irq_exit.
 */
void irqtime_account_irq(struct task_struct *curr)
{
	struct irqtime *irqtime = this_cpu_ptr(&cpu_irqtime);
	u64 *cpustat = kcpustat_this_cpu->cpustat;
	s64 delta;
	int cpu;

	if (!sched_clock_irqtime)
		return;

	cpu = smp_processor_id();
	delta = sched_clock_cpu(cpu) - irqtime->irq_start_time;
	irqtime->irq_start_time += delta;

	u64_stats_update_begin(&irqtime->sync);
	/*
	 * We do not account for softirq time from ksoftirqd here.
	 * We want to continue accounting softirq time to the ksoftirqd thread
	 * in that case, so as not to confuse the scheduler with a special task
	 * that does not consume any time, but still wants to run.
	 */
	if (hardirq_count()) {
		cpustat[CPUTIME_IRQ] += delta;
		irqtime->tick_delta += delta;
	} else if (in_serving_softirq() && curr != this_cpu_ksoftirqd()) {
		cpustat[CPUTIME_SOFTIRQ] += delta;
		irqtime->tick_delta += delta;
	}

	u64_stats_update_end(&irqtime->sync);
}
EXPORT_SYMBOL_GPL(irqtime_account_irq);
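
/*
 * Reader-side sketch (illustrative, not part of the original file): the
 * comment above irqtime_account_irq() describes how another CPU may sample
 * this CPU's irq time. Guarded by the same u64_stats seqcount, such a read
 * could look roughly like the in-tree helper in kernel/sched/sched.h:
 *
 *	struct irqtime *irqtime = &per_cpu(cpu_irqtime, cpu);
 *	u64 *cpustat = kcpustat_cpu(cpu).cpustat;
 *	unsigned int seq;
 *	u64 total;
 *
 *	do {
 *		seq = u64_stats_fetch_begin(&irqtime->sync);
 *		total = cpustat[CPUTIME_IRQ] + cpustat[CPUTIME_SOFTIRQ];
 *	} while (u64_stats_fetch_retry(&irqtime->sync, seq));
 */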

static u64 irqtime_tick_accounted(u64 maxtime)
{
	struct irqtime *irqtime = this_cpu_ptr(&cpu_irqtime);
	u64 delta;

	delta = min(irqtime->tick_delta, maxtime);
	irqtime->tick_delta -= delta;

	return delta;
}

#else /* CONFIG_IRQ_TIME_ACCOUNTING */

#define sched_clock_irqtime	(0)

static u64 irqtime_tick_accounted(u64 dummy)
{
	return 0;
}

#endif /* !CONFIG_IRQ_TIME_ACCOUNTING */

static inline void task_group_account_field(struct task_struct *p, int index,
					    u64 tmp)
{
	/*
	 * Since all updates are sure to touch the root cgroup, we
	 * get ourselves ahead and touch it first. If the root cgroup
	 * is the only cgroup, then nothing else should be necessary.
	 */
	__this_cpu_add(kernel_cpustat.cpustat[index], tmp);

	cpuacct_account_field(p, index, tmp);
}

/*
 * Account user cpu time to a process.
 * @p: the process that the cpu time gets accounted to
 * @cputime: the cpu time spent in user space since the last update
 */
void account_user_time(struct task_struct *p, u64 cputime)
{
	int index;

	/* Add user time to process. */
	p->utime += cputime;
	account_group_user_time(p, cputime);

	index = (task_nice(p) > 0) ? CPUTIME_NICE : CPUTIME_USER;

	/* Add user time to cpustat. */
	task_group_account_field(p, index, cputime);

	/* Account for user time used */
	acct_account_cputime(p);
}

/*
 * Account guest cpu time to a process.
 * @p: the process that the cpu time gets accounted to
 * @cputime: the cpu time spent in virtual machine since the last update
 */
void account_guest_time(struct task_struct *p, u64 cputime)
{
	u64 *cpustat = kcpustat_this_cpu->cpustat;

	/* Add guest time to process. */
	p->utime += cputime;
	account_group_user_time(p, cputime);
	p->gtime += cputime;

	/* Add guest time to cpustat. */
	if (task_nice(p) > 0) {
		cpustat[CPUTIME_NICE] += cputime;
		cpustat[CPUTIME_GUEST_NICE] += cputime;
	} else {
		cpustat[CPUTIME_USER] += cputime;
		cpustat[CPUTIME_GUEST] += cputime;
	}
}

/*
 * Account system cpu time to a process and desired cpustat field
 * @p: the process that the cpu time gets accounted to
 * @cputime: the cpu time spent in kernel space since the last update
 * @index: the cpustat field that has to be updated
 */
void account_system_index_time(struct task_struct *p,
			       u64 cputime, enum cpu_usage_stat index)
{
	/* Add system time to process. */
	p->stime += cputime;
	account_group_system_time(p, cputime);

	/* Add system time to cpustat. */
	task_group_account_field(p, index, cputime);

	/* Account for system time used */
	acct_account_cputime(p);
}

/*
 * Account system cpu time to a process.
 * @p: the process that the cpu time gets accounted to
 * @hardirq_offset: the offset to subtract from hardirq_count()
 * @cputime: the cpu time spent in kernel space since the last update
 */
void account_system_time(struct task_struct *p, int hardirq_offset, u64 cputime)
{
	int index;

	if ((p->flags & PF_VCPU) && (irq_count() - hardirq_offset == 0)) {
		account_guest_time(p, cputime);
		return;
	}

	if (hardirq_count() - hardirq_offset)
		index = CPUTIME_IRQ;
	else if (in_serving_softirq())
		index = CPUTIME_SOFTIRQ;
	else
		index = CPUTIME_SYSTEM;

	account_system_index_time(p, cputime, index);
}
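
/*
 * Example (illustrative, not part of the original file): the periodic tick
 * accounts a steal-adjusted system tick roughly as
 *
 *	account_system_time(p, HARDIRQ_OFFSET, TICK_NSEC);
 *
 * hardirq_count() already contains HARDIRQ_OFFSET for the tick handler
 * itself, so after subtracting the offset only a nested hardirq selects
 * CPUTIME_IRQ; a tick that interrupted ordinary kernel code falls through
 * to CPUTIME_SOFTIRQ (if a softirq was being served) or CPUTIME_SYSTEM.
 */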

/*
 * Account for involuntary wait time.
 * @cputime: the cpu time spent in involuntary wait
 */
void account_steal_time(u64 cputime)
{
	u64 *cpustat = kcpustat_this_cpu->cpustat;

	cpustat[CPUTIME_STEAL] += cputime;
}

/*
 * Account for idle time.
 * @cputime: the cpu time spent in idle wait
 */
void account_idle_time(u64 cputime)
{
	u64 *cpustat = kcpustat_this_cpu->cpustat;
	struct rq *rq = this_rq();

	if (atomic_read(&rq->nr_iowait) > 0)
		cpustat[CPUTIME_IOWAIT] += cputime;
	else
		cpustat[CPUTIME_IDLE] += cputime;
}

/*
 * When a guest is interrupted for a longer amount of time, missed clock
 * ticks are not redelivered later. Due to that, this function may on
 * occasion account more time than the calling functions think elapsed.
 */
static __always_inline u64 steal_account_process_time(u64 maxtime)
{
#ifdef CONFIG_PARAVIRT
	if (static_key_false(&paravirt_steal_enabled)) {
		u64 steal;

		steal = paravirt_steal_clock(smp_processor_id());
		steal -= this_rq()->prev_steal_time;
		steal = min(steal, maxtime);
		account_steal_time(steal);
		this_rq()->prev_steal_time += steal;

		return steal;
	}
#endif
	return 0;
}

/*
 * Account how much elapsed time was spent in steal, irq, or softirq time.
 */
static inline u64 account_other_time(u64 max)
{
	u64 accounted;

	/* Shall be converted to a lockdep-enabled lightweight check */
	WARN_ON_ONCE(!irqs_disabled());

	accounted = steal_account_process_time(max);

	if (accounted < max)
		accounted += irqtime_tick_accounted(max - accounted);

	return accounted;
}
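
/*
 * Worked example (illustrative, not part of the original file): with
 * max == 4 * TICK_NSEC, if steal_account_process_time() reports one tick
 * of steal time, irqtime_tick_accounted() is offered at most the remaining
 * 3 * TICK_NSEC, so the combined "other" time handed back to the caller
 * never exceeds the elapsed time it asked about.
 */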

#ifdef CONFIG_64BIT
static inline u64 read_sum_exec_runtime(struct task_struct *t)
{
	return t->se.sum_exec_runtime;
}
#else
static u64 read_sum_exec_runtime(struct task_struct *t)
{
	u64 ns;
	struct rq_flags rf;
	struct rq *rq;

	rq = task_rq_lock(t, &rf);
	ns = t->se.sum_exec_runtime;
	task_rq_unlock(rq, t, &rf);

	return ns;
}
#endif

/*
 * Accumulate raw cputime values of dead tasks (sig->[us]time) and live
 * tasks (sum on group iteration) belonging to @tsk's group.
 */
void thread_group_cputime(struct task_struct *tsk, struct task_cputime *times)
{
	struct signal_struct *sig = tsk->signal;
	u64 utime, stime;
	struct task_struct *t;
	unsigned int seq, nextseq;
	unsigned long flags;

	/*
	 * Update current task runtime to account pending time since last
	 * scheduler action or thread_group_cputime() call. This thread group
	 * might have other running tasks on different CPUs, but updating
	 * their runtime can affect syscall performance, so we skip accounting
	 * those pending times and rely only on values updated on tick or
	 * other scheduler action.
	 */
	if (same_thread_group(current, tsk))
		(void) task_sched_runtime(current);

	rcu_read_lock();
	/* Attempt a lockless read on the first round. */
	nextseq = 0;
	do {
		seq = nextseq;
		flags = read_seqbegin_or_lock_irqsave(&sig->stats_lock, &seq);
		times->utime = sig->utime;
		times->stime = sig->stime;
		times->sum_exec_runtime = sig->sum_sched_runtime;

		for_each_thread(tsk, t) {
			task_cputime(t, &utime, &stime);
			times->utime += utime;
			times->stime += stime;
			times->sum_exec_runtime += read_sum_exec_runtime(t);
		}
		/* If lockless access failed, take the lock. */
		nextseq = 1;
	} while (need_seqretry(&sig->stats_lock, seq));
	done_seqretry_irqrestore(&sig->stats_lock, seq, flags);
	rcu_read_unlock();
}
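
/*
 * Usage sketch (illustrative, not part of the original file): a caller that
 * wants the group totals fills a task_cputime on the stack, e.g.
 *
 *	struct task_cputime ct;
 *
 *	thread_group_cputime(tsk, &ct);
 *	// ct.utime, ct.stime and ct.sum_exec_runtime now hold the sums
 *
 * thread_group_cputime_adjusted() below is one such caller.
 */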

#ifdef CONFIG_IRQ_TIME_ACCOUNTING
/*
 * Account a tick to a process and cpustat
 * @p: the process that the cpu time gets accounted to
 * @user_tick: is the tick from userspace
 * @rq: the pointer to rq
 *
 * Tick demultiplexing follows the order
 * - pending hardirq update
 * - pending softirq update
 * - user_time
 * - idle_time
 * - system time
 *   - check for guest_time
 *   - else account as system_time
 *
 * Check for hardirq is done both for system and user time as there is
 * no timer going off while we are on hardirq and hence we may never get an
 * opportunity to update it solely in system time.
 * p->stime and friends are only updated on system time and not on irq/
 * softirq, as those do not count in task exec_runtime any more.
 */
static void irqtime_account_process_tick(struct task_struct *p, int user_tick,
					 struct rq *rq, int ticks)
{
	u64 other, cputime = TICK_NSEC * ticks;

	/*
	 * When returning from idle, many ticks can get accounted at
	 * once, including some ticks of steal, irq, and softirq time.
	 * Subtract those ticks from the amount of time accounted to
	 * idle, or potentially user or system time. Due to rounding,
	 * other time can exceed ticks occasionally.
	 */
	other = account_other_time(ULONG_MAX);
	if (other >= cputime)
		return;

	cputime -= other;

	if (this_cpu_ksoftirqd() == p) {
		/*
		 * ksoftirqd time does not get accounted in cpu_softirq_time.
		 * So, we have to handle it separately here.
		 * Also, p->stime needs to be updated for ksoftirqd.
		 */
		account_system_index_time(p, cputime, CPUTIME_SOFTIRQ);
	} else if (user_tick) {
		account_user_time(p, cputime);
	} else if (p == rq->idle) {
		account_idle_time(cputime);
	} else if (p->flags & PF_VCPU) { /* System time or guest time */
		account_guest_time(p, cputime);
	} else {
		account_system_index_time(p, cputime, CPUTIME_SYSTEM);
	}
}
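
/*
 * Worked example (illustrative, not part of the original file): a CPU wakes
 * from a long nohz idle period and 5 ticks are accounted at once
 * (cputime == 5 * TICK_NSEC). If account_other_time() reports 2 ticks worth
 * of steal and irq time, only the remaining 3 ticks are charged to the idle
 * (or user/system) bucket; if rounding makes "other" reach 5 ticks or more,
 * nothing further is accounted for this batch.
 */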

static void irqtime_account_idle_ticks(int ticks)
{
	struct rq *rq = this_rq();

	irqtime_account_process_tick(current, 0, rq, ticks);
}
#else /* CONFIG_IRQ_TIME_ACCOUNTING */
static inline void irqtime_account_idle_ticks(int ticks) {}
static inline void irqtime_account_process_tick(struct task_struct *p, int user_tick,
						struct rq *rq, int nr_ticks) {}
#endif /* CONFIG_IRQ_TIME_ACCOUNTING */

/*
 * Use precise platform statistics if available:
 */
#ifdef CONFIG_VIRT_CPU_ACCOUNTING

#ifndef __ARCH_HAS_VTIME_TASK_SWITCH
void vtime_common_task_switch(struct task_struct *prev)
{
	if (is_idle_task(prev))
		vtime_account_idle(prev);
	else
		vtime_account_system(prev);

	arch_vtime_task_switch(prev);
}
#endif /* __ARCH_HAS_VTIME_TASK_SWITCH */

#endif /* CONFIG_VIRT_CPU_ACCOUNTING */

#ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
/*
 * Archs that account the whole time spent in the idle task
 * (outside irq) as idle time can rely on this and just implement
 * vtime_account_system() and vtime_account_idle(). Archs that
 * have other meaning of the idle time (s390 only includes the
 * time spent by the CPU when it's in low power mode) must override
 * vtime_account().
 */
#ifndef __ARCH_HAS_VTIME_ACCOUNT
void vtime_account_irq_enter(struct task_struct *tsk)
{
	if (!in_interrupt() && is_idle_task(tsk))
		vtime_account_idle(tsk);
	else
		vtime_account_system(tsk);
}
EXPORT_SYMBOL_GPL(vtime_account_irq_enter);
#endif /* __ARCH_HAS_VTIME_ACCOUNT */

void task_cputime_adjusted(struct task_struct *p, u64 *ut, u64 *st)
{
	*ut = p->utime;
	*st = p->stime;
}
EXPORT_SYMBOL_GPL(task_cputime_adjusted);

void thread_group_cputime_adjusted(struct task_struct *p, u64 *ut, u64 *st)
{
	struct task_cputime cputime;

	thread_group_cputime(p, &cputime);

	*ut = cputime.utime;
	*st = cputime.stime;
}

#else /* !CONFIG_VIRT_CPU_ACCOUNTING_NATIVE */

/*
 * Account a single tick of cpu time.
 * @p: the process that the cpu time gets accounted to
 * @user_tick: indicates if the tick is a user or a system tick
 */
void account_process_tick(struct task_struct *p, int user_tick)
{
	u64 cputime, steal;
	struct rq *rq = this_rq();

	if (vtime_accounting_cpu_enabled())
		return;

	if (sched_clock_irqtime) {
		irqtime_account_process_tick(p, user_tick, rq, 1);
		return;
	}

	cputime = TICK_NSEC;
	steal = steal_account_process_time(ULONG_MAX);

	if (steal >= cputime)
		return;

	cputime -= steal;

	if (user_tick)
		account_user_time(p, cputime);
	else if ((p != rq->idle) || (irq_count() != HARDIRQ_OFFSET))
		account_system_time(p, HARDIRQ_OFFSET, cputime);
	else
		account_idle_time(cputime);
}

/*
 * Account multiple ticks of idle time.
 * @ticks: number of stolen ticks
 */
void account_idle_ticks(unsigned long ticks)
{
	u64 cputime, steal;

	if (sched_clock_irqtime) {
		irqtime_account_idle_ticks(ticks);
		return;
	}

	cputime = ticks * TICK_NSEC;
	steal = steal_account_process_time(ULONG_MAX);

	if (steal >= cputime)
		return;

	cputime -= steal;
	account_idle_time(cputime);
}

/*
 * Perform (stime * rtime) / total, but avoid multiplication overflow by
 * losing precision when the numbers are big.
 */
static u64 scale_stime(u64 stime, u64 rtime, u64 total)
{
	u64 scaled;

	for (;;) {
		/* Make sure "rtime" is the bigger of stime/rtime */
		if (stime > rtime)
			swap(rtime, stime);

		/* Make sure 'total' fits in 32 bits */
		if (total >> 32)
			goto drop_precision;

		/* Does rtime (and thus stime) fit in 32 bits? */
		if (!(rtime >> 32))
			break;

		/* Can we just balance rtime/stime rather than dropping bits? */
		if (stime >> 31)
			goto drop_precision;

		/* We can grow stime and shrink rtime and try to make them both fit */
		stime <<= 1;
		rtime >>= 1;
		continue;

drop_precision:
		/* We drop from rtime, it has more bits than stime */
		rtime >>= 1;
		total >>= 1;
	}

	/*
	 * Make sure gcc understands that this is a 32x32->64 multiply,
	 * followed by a 64/32->64 divide.
	 */
	scaled = div_u64((u64) (u32) stime * (u64) (u32) rtime, (u32)total);
	return scaled;
}
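
/*
 * Worked example (illustrative, not part of the original file): with
 * stime == 25 and utime == 75 ticks worth of time but rtime == 200, the call
 * scale_stime(25, 200, 100) returns 25 * 200 / 100 == 50, so the adjusted
 * split becomes stime == 50 and utime == rtime - stime == 150, preserving
 * the observed 1:3 ratio while summing to the scheduler-accounted runtime.
 */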

/*
 * Adjust tick based cputime random precision against scheduler runtime
 * statistics.
 *
 * Tick based cputime accounting depends on the random scheduling timeslices
 * of a task being interrupted or not by the timer. Depending on these
 * circumstances, the number of these interrupts may be over- or
 * under-estimated, matching the real user and system cputime with a variable
 * precision.
 *
 * Fix this by scaling these tick based values against the total runtime
 * accounted by the CFS scheduler.
 *
 * This code provides the following guarantees:
 *
 *   stime + utime == rtime
 *   stime_i+1 >= stime_i, utime_i+1 >= utime_i
 *
 * Assuming that rtime_i+1 >= rtime_i.
 */
static void cputime_adjust(struct task_cputime *curr,
			   struct prev_cputime *prev,
			   u64 *ut, u64 *st)
{
	u64 rtime, stime, utime;
	unsigned long flags;

	/* Serialize concurrent callers such that we can honour our guarantees */
	raw_spin_lock_irqsave(&prev->lock, flags);
	rtime = curr->sum_exec_runtime;

	/*
	 * This is possible under two circumstances:
	 *  - rtime isn't monotonic after all (a bug);
	 *  - we got reordered by the lock.
	 *
	 * In both cases this acts as a filter such that the rest of the code
	 * can assume it is monotonic regardless of anything else.
	 */
	if (prev->stime + prev->utime >= rtime)
		goto out;

	stime = curr->stime;
	utime = curr->utime;

	/*
	 * If either stime or both stime and utime are 0, assume all runtime is
	 * userspace. Once a task gets some ticks, the monotonicity code at
	 * 'update' will ensure things converge to the observed ratio.
	 */
	if (stime == 0) {
		utime = rtime;
		goto update;
	}

	if (utime == 0) {
		stime = rtime;
		goto update;
	}

	stime = scale_stime(stime, rtime, stime + utime);

update:
	/*
	 * Make sure stime doesn't go backwards; this preserves monotonicity
	 * for utime because rtime is monotonic.
	 *
	 *  utime_i+1 = rtime_i+1 - stime_i
	 *            = rtime_i+1 - (rtime_i - utime_i)
	 *            = (rtime_i+1 - rtime_i) + utime_i
	 *            >= utime_i
	 */
	if (stime < prev->stime)
		stime = prev->stime;
	utime = rtime - stime;

	/*
	 * Make sure utime doesn't go backwards; this still preserves
	 * monotonicity for stime, analogous argument to above.
	 */
	if (utime < prev->utime) {
		utime = prev->utime;
		stime = rtime - utime;
	}

	prev->stime = stime;
	prev->utime = utime;
out:
	*ut = prev->utime;
	*st = prev->stime;
	raw_spin_unlock_irqrestore(&prev->lock, flags);
}
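
/*
 * Worked example (illustrative, not part of the original file): suppose a
 * previous call settled on prev->stime == 60 and prev->utime == 40 with
 * rtime == 100. If the next sample scales to stime == 55 while rtime has
 * grown to 110, the clamp keeps stime at 60 and derives utime == 50, so both
 * values stay monotonic and still satisfy stime + utime == rtime.
 */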

void task_cputime_adjusted(struct task_struct *p, u64 *ut, u64 *st)
{
	struct task_cputime cputime = {
		.sum_exec_runtime = p->se.sum_exec_runtime,
	};

	task_cputime(p, &cputime.utime, &cputime.stime);
	cputime_adjust(&cputime, &p->prev_cputime, ut, st);
}
EXPORT_SYMBOL_GPL(task_cputime_adjusted);

void thread_group_cputime_adjusted(struct task_struct *p, u64 *ut, u64 *st)
{
	struct task_cputime cputime;

	thread_group_cputime(p, &cputime);
	cputime_adjust(&cputime, &p->signal->prev_cputime, ut, st);
}
#endif /* !CONFIG_VIRT_CPU_ACCOUNTING_NATIVE */

#ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN
static u64 vtime_delta(struct task_struct *tsk)
{
	unsigned long now = READ_ONCE(jiffies);

	if (time_before(now, (unsigned long)tsk->vtime_snap))
		return 0;

	return jiffies_to_nsecs(now - tsk->vtime_snap);
}

static u64 get_vtime_delta(struct task_struct *tsk)
{
	unsigned long now = READ_ONCE(jiffies);
	u64 delta, other;

	/*
	 * Unlike tick based timing, vtime based timing never has lost
	 * ticks, and no need for steal time accounting to make up for
	 * lost ticks. Vtime accounts a rounded version of actual
	 * elapsed time. Limit account_other_time to prevent rounding
	 * errors from causing elapsed vtime to go negative.
	 */
	delta = jiffies_to_nsecs(now - tsk->vtime_snap);
	other = account_other_time(delta);
	WARN_ON_ONCE(tsk->vtime_snap_whence == VTIME_INACTIVE);
	tsk->vtime_snap = now;

	return delta - other;
}
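
/*
 * Illustrative note (not from the original source): because
 * account_other_time() is capped at the jiffies-granular delta computed
 * above, the steal and irq time it reports can never exceed delta, so the
 * "delta - other" value returned here stays >= 0.
 */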

static void __vtime_account_system(struct task_struct *tsk)
{
	account_system_time(tsk, irq_count(), get_vtime_delta(tsk));
}

void vtime_account_system(struct task_struct *tsk)
{
	if (!vtime_delta(tsk))
		return;

	write_seqcount_begin(&tsk->vtime_seqcount);
	__vtime_account_system(tsk);
	write_seqcount_end(&tsk->vtime_seqcount);
}

void vtime_account_user(struct task_struct *tsk)
{
	write_seqcount_begin(&tsk->vtime_seqcount);
	tsk->vtime_snap_whence = VTIME_SYS;
	if (vtime_delta(tsk))
		account_user_time(tsk, get_vtime_delta(tsk));
	write_seqcount_end(&tsk->vtime_seqcount);
}

void vtime_user_enter(struct task_struct *tsk)
{
	write_seqcount_begin(&tsk->vtime_seqcount);
	if (vtime_delta(tsk))
		__vtime_account_system(tsk);
	tsk->vtime_snap_whence = VTIME_USER;
	write_seqcount_end(&tsk->vtime_seqcount);
}

void vtime_guest_enter(struct task_struct *tsk)
{
	/*
	 * The flags must be updated under the lock with
	 * the vtime_snap flush and update.
	 * That enforces the right ordering and update-sequence
	 * synchronization against the reader (task_gtime()),
	 * which can thus safely catch up with a tickless delta.
	 */
	write_seqcount_begin(&tsk->vtime_seqcount);
	if (vtime_delta(tsk))
		__vtime_account_system(tsk);
	current->flags |= PF_VCPU;
	write_seqcount_end(&tsk->vtime_seqcount);
}
EXPORT_SYMBOL_GPL(vtime_guest_enter);

void vtime_guest_exit(struct task_struct *tsk)
{
	write_seqcount_begin(&tsk->vtime_seqcount);
	__vtime_account_system(tsk);
	current->flags &= ~PF_VCPU;
	write_seqcount_end(&tsk->vtime_seqcount);
}
EXPORT_SYMBOL_GPL(vtime_guest_exit);

void vtime_account_idle(struct task_struct *tsk)
{
	account_idle_time(get_vtime_delta(tsk));
}

void arch_vtime_task_switch(struct task_struct *prev)
{
	write_seqcount_begin(&prev->vtime_seqcount);
	prev->vtime_snap_whence = VTIME_INACTIVE;
	write_seqcount_end(&prev->vtime_seqcount);

	write_seqcount_begin(&current->vtime_seqcount);
	current->vtime_snap_whence = VTIME_SYS;
	current->vtime_snap = jiffies;
	write_seqcount_end(&current->vtime_seqcount);
}

void vtime_init_idle(struct task_struct *t, int cpu)
{
	unsigned long flags;

	local_irq_save(flags);
	write_seqcount_begin(&t->vtime_seqcount);
	t->vtime_snap_whence = VTIME_SYS;
	t->vtime_snap = jiffies;
	write_seqcount_end(&t->vtime_seqcount);
	local_irq_restore(flags);
}

u64 task_gtime(struct task_struct *t)
{
	unsigned int seq;
	u64 gtime;

	if (!vtime_accounting_enabled())
		return t->gtime;

	do {
		seq = read_seqcount_begin(&t->vtime_seqcount);

		gtime = t->gtime;
		if (t->vtime_snap_whence == VTIME_SYS && t->flags & PF_VCPU)
			gtime += vtime_delta(t);

	} while (read_seqcount_retry(&t->vtime_seqcount, seq));

	return gtime;
}

/*
 * Fetch cputime raw values from fields of task_struct and
 * add up the pending nohz execution time since the last
 * accounting.
 */
void task_cputime(struct task_struct *t, u64 *utime, u64 *stime)
{
	u64 delta;
	unsigned int seq;

	if (!vtime_accounting_enabled()) {
		*utime = t->utime;
		*stime = t->stime;
		return;
	}

	do {
		seq = read_seqcount_begin(&t->vtime_seqcount);

		*utime = t->utime;
		*stime = t->stime;

		/* Task is sleeping, nothing to add */
		if (t->vtime_snap_whence == VTIME_INACTIVE || is_idle_task(t))
			continue;

		delta = vtime_delta(t);

		/*
		 * Task runs either in user or kernel space, add pending nohz
		 * time to the right place.
		 */
		if (t->vtime_snap_whence == VTIME_USER || t->flags & PF_VCPU)
			*utime += delta;
		else if (t->vtime_snap_whence == VTIME_SYS)
			*stime += delta;
	} while (read_seqcount_retry(&t->vtime_seqcount, seq));
}
#endif /* CONFIG_VIRT_CPU_ACCOUNTING_GEN */