]> git.karo-electronics.de Git - karo-tx-linux.git/blob - drivers/staging/lustre/lustre/libcfs/tracefile.c
bc5d0eec70ebb5a996fb32c9775284fbd3913800
[karo-tx-linux.git] / drivers / staging / lustre / lustre / libcfs / tracefile.c
1 /*
2  * GPL HEADER START
3  *
4  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
5  *
6  * This program is free software; you can redistribute it and/or modify
7  * it under the terms of the GNU General Public License version 2 only,
8  * as published by the Free Software Foundation.
9  *
10  * This program is distributed in the hope that it will be useful, but
11  * WITHOUT ANY WARRANTY; without even the implied warranty of
12  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
13  * General Public License version 2 for more details (a copy is included
14  * in the LICENSE file that accompanied this code).
15  *
16  * You should have received a copy of the GNU General Public License
17  * version 2 along with this program; If not, see
18  * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
19  *
20  * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
21  * CA 95054 USA or visit www.sun.com if you need additional information or
22  * have any questions.
23  *
24  * GPL HEADER END
25  */
26 /*
27  * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
28  * Use is subject to license terms.
29  *
30  * Copyright (c) 2012, Intel Corporation.
31  */
32 /*
33  * This file is part of Lustre, http://www.lustre.org/
34  * Lustre is a trademark of Sun Microsystems, Inc.
35  *
36  * libcfs/libcfs/tracefile.c
37  *
38  * Author: Zach Brown <zab@clusterfs.com>
39  * Author: Phil Schwan <phil@clusterfs.com>
40  */
41
42 #define DEBUG_SUBSYSTEM S_LNET
43 #define LUSTRE_TRACEFILE_PRIVATE
44 #include "tracefile.h"
45
46 #include "../../include/linux/libcfs/libcfs.h"
47
/* XXX move things up to the top, comment */
/*
 * Per-type, per-CPU trace data: an array of TCD_MAX_TYPES pointers, each
 * pointing at an array of NR_CPUS cfs_trace_data_union entries.
 * NOTE(review): the slots are filled in outside this part of the file and the
 * meaning of each type index comes from tracefile.h -- confirm there.
 */
union cfs_trace_data_union (*cfs_trace_data[TCD_MAX_TYPES])[NR_CPUS] __cacheline_aligned;

/*
 * Path of the debug daemon's output file.  cfs_trace_daemon_command() copies
 * the user-supplied path in here and zeroes it again on "stop".
 */
char cfs_tracefile[TRACEFILE_NAME_SIZE];
/*
 * Size limit for the trace file.  The "size=" daemon command stores the
 * requested value shifted left by 20 (i.e. megabytes converted to bytes).
 */
long long cfs_tracefile_size = CFS_TRACEFILE_SIZE;
/* Control block of the tracefiled thread; its tctl_waitq is woken from
 * cfs_trace_get_tage_try() once a tcd holds more than a few pages. */
static struct tracefiled_ctl trace_tctl;
/* NOTE(review): presumably serialises starting/stopping the tracefiled
 * thread; its users are not in this part of the file -- confirm. */
static DEFINE_MUTEX(cfs_trace_thread_mutex);
/* Tested before waking tracefiled and before shrinking a tcd.
 * NOTE(review): presumably non-zero while tracefiled runs; the code that sets
 * it is not in this part of the file. */
static int thread_running;

/* Count of live trace pages: incremented in cfs_tage_alloc(), decremented in
 * cfs_tage_free(). */
static atomic_t cfs_tage_allocated = ATOMIC_INIT(0);

/* Forward declaration: cfs_tcd_shrink() calls this before its definition. */
static void put_pages_on_tcd_daemon_list(struct page_collection *pc,
                                         struct cfs_trace_cpu_data *tcd);
61
62 static inline struct cfs_trace_page *
63 cfs_tage_from_list(struct list_head *list)
64 {
65         return list_entry(list, struct cfs_trace_page, linkage);
66 }
67
68 static struct cfs_trace_page *cfs_tage_alloc(gfp_t gfp)
69 {
70         struct page         *page;
71         struct cfs_trace_page *tage;
72
73         /* My caller is trying to free memory */
74         if (!in_interrupt() && memory_pressure_get())
75                 return NULL;
76
77         /*
78          * Don't spam console with allocation failures: they will be reported
79          * by upper layer anyway.
80          */
81         gfp |= __GFP_NOWARN;
82         page = alloc_page(gfp);
83         if (page == NULL)
84                 return NULL;
85
86         tage = kmalloc(sizeof(*tage), gfp);
87         if (tage == NULL) {
88                 __free_page(page);
89                 return NULL;
90         }
91
92         tage->page = page;
93         atomic_inc(&cfs_tage_allocated);
94         return tage;
95 }
96
97 static void cfs_tage_free(struct cfs_trace_page *tage)
98 {
99         __LASSERT(tage != NULL);
100         __LASSERT(tage->page != NULL);
101
102         __free_page(tage->page);
103         kfree(tage);
104         atomic_dec(&cfs_tage_allocated);
105 }
106
107 static void cfs_tage_to_tail(struct cfs_trace_page *tage,
108                              struct list_head *queue)
109 {
110         __LASSERT(tage != NULL);
111         __LASSERT(queue != NULL);
112
113         list_move_tail(&tage->linkage, queue);
114 }
115
116 int cfs_trace_refill_stock(struct cfs_trace_cpu_data *tcd, gfp_t gfp,
117                            struct list_head *stock)
118 {
119         int i;
120
121         /*
122          * XXX nikita: do NOT call portals_debug_msg() (CDEBUG/ENTRY/EXIT)
123          * from here: this will lead to infinite recursion.
124          */
125
126         for (i = 0; i + tcd->tcd_cur_stock_pages < TCD_STOCK_PAGES ; ++i) {
127                 struct cfs_trace_page *tage;
128
129                 tage = cfs_tage_alloc(gfp);
130                 if (tage == NULL)
131                         break;
132                 list_add_tail(&tage->linkage, stock);
133         }
134         return i;
135 }
136
137 /* return a page that has 'len' bytes left at the end */
138 static struct cfs_trace_page *
139 cfs_trace_get_tage_try(struct cfs_trace_cpu_data *tcd, unsigned long len)
140 {
141         struct cfs_trace_page *tage;
142
143         if (tcd->tcd_cur_pages > 0) {
144                 __LASSERT(!list_empty(&tcd->tcd_pages));
145                 tage = cfs_tage_from_list(tcd->tcd_pages.prev);
146                 if (tage->used + len <= PAGE_CACHE_SIZE)
147                         return tage;
148         }
149
150         if (tcd->tcd_cur_pages < tcd->tcd_max_pages) {
151                 if (tcd->tcd_cur_stock_pages > 0) {
152                         tage = cfs_tage_from_list(tcd->tcd_stock_pages.prev);
153                         --tcd->tcd_cur_stock_pages;
154                         list_del_init(&tage->linkage);
155                 } else {
156                         tage = cfs_tage_alloc(GFP_ATOMIC);
157                         if (unlikely(tage == NULL)) {
158                                 if ((!memory_pressure_get() ||
159                                      in_interrupt()) && printk_ratelimit())
160                                         printk(KERN_WARNING
161                                                "cannot allocate a tage (%ld)\n",
162                                                tcd->tcd_cur_pages);
163                                 return NULL;
164                         }
165                 }
166
167                 tage->used = 0;
168                 tage->cpu = smp_processor_id();
169                 tage->type = tcd->tcd_type;
170                 list_add_tail(&tage->linkage, &tcd->tcd_pages);
171                 tcd->tcd_cur_pages++;
172
173                 if (tcd->tcd_cur_pages > 8 && thread_running) {
174                         struct tracefiled_ctl *tctl = &trace_tctl;
175                         /*
176                          * wake up tracefiled to process some pages.
177                          */
178                         wake_up(&tctl->tctl_waitq);
179                 }
180                 return tage;
181         }
182         return NULL;
183 }
184
185 static void cfs_tcd_shrink(struct cfs_trace_cpu_data *tcd)
186 {
187         int pgcount = tcd->tcd_cur_pages / 10;
188         struct page_collection pc;
189         struct cfs_trace_page *tage;
190         struct cfs_trace_page *tmp;
191
192         /*
193          * XXX nikita: do NOT call portals_debug_msg() (CDEBUG/ENTRY/EXIT)
194          * from here: this will lead to infinite recursion.
195          */
196
197         if (printk_ratelimit())
198                 printk(KERN_WARNING "debug daemon buffer overflowed; discarding 10%% of pages (%d of %ld)\n",
199                        pgcount + 1, tcd->tcd_cur_pages);
200
201         INIT_LIST_HEAD(&pc.pc_pages);
202
203         list_for_each_entry_safe(tage, tmp, &tcd->tcd_pages, linkage) {
204                 if (pgcount-- == 0)
205                         break;
206
207                 list_move_tail(&tage->linkage, &pc.pc_pages);
208                 tcd->tcd_cur_pages--;
209         }
210         put_pages_on_tcd_daemon_list(&pc, tcd);
211 }
212
213 /* return a page that has 'len' bytes left at the end */
214 static struct cfs_trace_page *cfs_trace_get_tage(struct cfs_trace_cpu_data *tcd,
215                                                  unsigned long len)
216 {
217         struct cfs_trace_page *tage;
218
219         /*
220          * XXX nikita: do NOT call portals_debug_msg() (CDEBUG/ENTRY/EXIT)
221          * from here: this will lead to infinite recursion.
222          */
223
224         if (len > PAGE_CACHE_SIZE) {
225                 pr_err("cowardly refusing to write %lu bytes in a page\n", len);
226                 return NULL;
227         }
228
229         tage = cfs_trace_get_tage_try(tcd, len);
230         if (tage != NULL)
231                 return tage;
232         if (thread_running)
233                 cfs_tcd_shrink(tcd);
234         if (tcd->tcd_cur_pages > 0) {
235                 tage = cfs_tage_from_list(tcd->tcd_pages.next);
236                 tage->used = 0;
237                 cfs_tage_to_tail(tage, &tcd->tcd_pages);
238         }
239         return tage;
240 }
241
242 int libcfs_debug_msg(struct libcfs_debug_msg_data *msgdata,
243                      const char *format, ...)
244 {
245         va_list args;
246         int     rc;
247
248         va_start(args, format);
249         rc = libcfs_debug_vmsg2(msgdata, format, args, NULL);
250         va_end(args);
251
252         return rc;
253 }
254 EXPORT_SYMBOL(libcfs_debug_msg);
255
256 int libcfs_debug_vmsg2(struct libcfs_debug_msg_data *msgdata,
257                        const char *format1, va_list args,
258                        const char *format2, ...)
259 {
260         struct cfs_trace_cpu_data *tcd = NULL;
261         struct ptldebug_header     header = {0};
262         struct cfs_trace_page     *tage;
263         /* string_buf is used only if tcd != NULL, and is always set then */
264         char                  *string_buf = NULL;
265         char                  *debug_buf;
266         int                     known_size;
267         int                     needed = 85; /* average message length */
268         int                     max_nob;
269         va_list             ap;
270         int                     depth;
271         int                     i;
272         int                     remain;
273         int                     mask = msgdata->msg_mask;
274         const char              *file = kbasename(msgdata->msg_file);
275         struct cfs_debug_limit_state   *cdls = msgdata->msg_cdls;
276
277         tcd = cfs_trace_get_tcd();
278
279         /* cfs_trace_get_tcd() grabs a lock, which disables preemption and
280          * pins us to a particular CPU.  This avoids an smp_processor_id()
281          * warning on Linux when debugging is enabled. */
282         cfs_set_ptldebug_header(&header, msgdata, CDEBUG_STACK());
283
284         if (tcd == NULL)                /* arch may not log in IRQ context */
285                 goto console;
286
287         if (tcd->tcd_cur_pages == 0)
288                 header.ph_flags |= PH_FLAG_FIRST_RECORD;
289
290         if (tcd->tcd_shutting_down) {
291                 cfs_trace_put_tcd(tcd);
292                 tcd = NULL;
293                 goto console;
294         }
295
296         depth = __current_nesting_level();
297         known_size = strlen(file) + 1 + depth;
298         if (msgdata->msg_fn)
299                 known_size += strlen(msgdata->msg_fn) + 1;
300
301         if (libcfs_debug_binary)
302                 known_size += sizeof(header);
303
304         /*/
305          * '2' used because vsnprintf return real size required for output
306          * _without_ terminating NULL.
307          * if needed is to small for this format.
308          */
309         for (i = 0; i < 2; i++) {
310                 tage = cfs_trace_get_tage(tcd, needed + known_size + 1);
311                 if (tage == NULL) {
312                         if (needed + known_size > PAGE_CACHE_SIZE)
313                                 mask |= D_ERROR;
314
315                         cfs_trace_put_tcd(tcd);
316                         tcd = NULL;
317                         goto console;
318                 }
319
320                 string_buf = (char *)page_address(tage->page) +
321                                         tage->used + known_size;
322
323                 max_nob = PAGE_CACHE_SIZE - tage->used - known_size;
324                 if (max_nob <= 0) {
325                         printk(KERN_EMERG "negative max_nob: %d\n",
326                                max_nob);
327                         mask |= D_ERROR;
328                         cfs_trace_put_tcd(tcd);
329                         tcd = NULL;
330                         goto console;
331                 }
332
333                 needed = 0;
334                 if (format1) {
335                         va_copy(ap, args);
336                         needed = vsnprintf(string_buf, max_nob, format1, ap);
337                         va_end(ap);
338                 }
339
340                 if (format2) {
341                         remain = max_nob - needed;
342                         if (remain < 0)
343                                 remain = 0;
344
345                         va_start(ap, format2);
346                         needed += vsnprintf(string_buf + needed, remain,
347                                             format2, ap);
348                         va_end(ap);
349                 }
350
351                 if (needed < max_nob) /* well. printing ok.. */
352                         break;
353         }
354
355         if (*(string_buf+needed-1) != '\n')
356                 printk(KERN_INFO "format at %s:%d:%s doesn't end in newline\n",
357                        file, msgdata->msg_line, msgdata->msg_fn);
358
359         header.ph_len = known_size + needed;
360         debug_buf = (char *)page_address(tage->page) + tage->used;
361
362         if (libcfs_debug_binary) {
363                 memcpy(debug_buf, &header, sizeof(header));
364                 tage->used += sizeof(header);
365                 debug_buf += sizeof(header);
366         }
367
368         /* indent message according to the nesting level */
369         while (depth-- > 0) {
370                 *(debug_buf++) = '.';
371                 ++tage->used;
372         }
373
374         strcpy(debug_buf, file);
375         tage->used += strlen(file) + 1;
376         debug_buf += strlen(file) + 1;
377
378         if (msgdata->msg_fn) {
379                 strcpy(debug_buf, msgdata->msg_fn);
380                 tage->used += strlen(msgdata->msg_fn) + 1;
381                 debug_buf += strlen(msgdata->msg_fn) + 1;
382         }
383
384         __LASSERT(debug_buf == string_buf);
385
386         tage->used += needed;
387         __LASSERT (tage->used <= PAGE_CACHE_SIZE);
388
389 console:
390         if ((mask & libcfs_printk) == 0) {
391                 /* no console output requested */
392                 if (tcd != NULL)
393                         cfs_trace_put_tcd(tcd);
394                 return 1;
395         }
396
397         if (cdls != NULL) {
398                 if (libcfs_console_ratelimit &&
399                     cdls->cdls_next != 0 &&     /* not first time ever */
400                     !cfs_time_after(cfs_time_current(), cdls->cdls_next)) {
401                         /* skipping a console message */
402                         cdls->cdls_count++;
403                         if (tcd != NULL)
404                                 cfs_trace_put_tcd(tcd);
405                         return 1;
406                 }
407
408                 if (cfs_time_after(cfs_time_current(), cdls->cdls_next +
409                                                        libcfs_console_max_delay
410                                                        + cfs_time_seconds(10))) {
411                         /* last timeout was a long time ago */
412                         cdls->cdls_delay /= libcfs_console_backoff * 4;
413                 } else {
414                         cdls->cdls_delay *= libcfs_console_backoff;
415                 }
416
417                 if (cdls->cdls_delay < libcfs_console_min_delay)
418                         cdls->cdls_delay = libcfs_console_min_delay;
419                 else if (cdls->cdls_delay > libcfs_console_max_delay)
420                         cdls->cdls_delay = libcfs_console_max_delay;
421
422                 /* ensure cdls_next is never zero after it's been seen */
423                 cdls->cdls_next = (cfs_time_current() + cdls->cdls_delay) | 1;
424         }
425
426         if (tcd != NULL) {
427                 cfs_print_to_console(&header, mask, string_buf, needed, file,
428                                      msgdata->msg_fn);
429                 cfs_trace_put_tcd(tcd);
430         } else {
431                 string_buf = cfs_trace_get_console_buffer();
432
433                 needed = 0;
434                 if (format1 != NULL) {
435                         va_copy(ap, args);
436                         needed = vsnprintf(string_buf,
437                                            CFS_TRACE_CONSOLE_BUFFER_SIZE,
438                                            format1, ap);
439                         va_end(ap);
440                 }
441                 if (format2 != NULL) {
442                         remain = CFS_TRACE_CONSOLE_BUFFER_SIZE - needed;
443                         if (remain > 0) {
444                                 va_start(ap, format2);
445                                 needed += vsnprintf(string_buf+needed, remain,
446                                                     format2, ap);
447                                 va_end(ap);
448                         }
449                 }
450                 cfs_print_to_console(&header, mask,
451                                      string_buf, needed, file, msgdata->msg_fn);
452
453                 put_cpu();
454         }
455
456         if (cdls != NULL && cdls->cdls_count != 0) {
457                 string_buf = cfs_trace_get_console_buffer();
458
459                 needed = snprintf(string_buf, CFS_TRACE_CONSOLE_BUFFER_SIZE,
460                                   "Skipped %d previous similar message%s\n",
461                                   cdls->cdls_count,
462                                   (cdls->cdls_count > 1) ? "s" : "");
463
464                 cfs_print_to_console(&header, mask,
465                                      string_buf, needed, file, msgdata->msg_fn);
466
467                 put_cpu();
468                 cdls->cdls_count = 0;
469         }
470
471         return 0;
472 }
473 EXPORT_SYMBOL(libcfs_debug_vmsg2);
474
475 void
476 cfs_trace_assertion_failed(const char *str,
477                            struct libcfs_debug_msg_data *msgdata)
478 {
479         struct ptldebug_header hdr;
480
481         libcfs_panic_in_progress = 1;
482         libcfs_catastrophe = 1;
483         mb();
484
485         cfs_set_ptldebug_header(&hdr, msgdata, CDEBUG_STACK());
486
487         cfs_print_to_console(&hdr, D_EMERG, str, strlen(str),
488                              msgdata->msg_file, msgdata->msg_fn);
489
490         panic("Lustre debug assertion failure\n");
491
492         /* not reached */
493 }
494
495 static void
496 panic_collect_pages(struct page_collection *pc)
497 {
498         /* Do the collect_pages job on a single CPU: assumes that all other
499          * CPUs have been stopped during a panic.  If this isn't true for some
500          * arch, this will have to be implemented separately in each arch.  */
501         int                     i;
502         int                     j;
503         struct cfs_trace_cpu_data *tcd;
504
505         INIT_LIST_HEAD(&pc->pc_pages);
506
507         cfs_tcd_for_each(tcd, i, j) {
508                 list_splice_init(&tcd->tcd_pages, &pc->pc_pages);
509                 tcd->tcd_cur_pages = 0;
510
511                 if (pc->pc_want_daemon_pages) {
512                         list_splice_init(&tcd->tcd_daemon_pages,
513                                              &pc->pc_pages);
514                         tcd->tcd_cur_daemon_pages = 0;
515                 }
516         }
517 }
518
519 static void collect_pages_on_all_cpus(struct page_collection *pc)
520 {
521         struct cfs_trace_cpu_data *tcd;
522         int i, cpu;
523
524         for_each_possible_cpu(cpu) {
525                 cfs_tcd_for_each_type_lock(tcd, i, cpu) {
526                         list_splice_init(&tcd->tcd_pages, &pc->pc_pages);
527                         tcd->tcd_cur_pages = 0;
528                         if (pc->pc_want_daemon_pages) {
529                                 list_splice_init(&tcd->tcd_daemon_pages,
530                                                      &pc->pc_pages);
531                                 tcd->tcd_cur_daemon_pages = 0;
532                         }
533                 }
534         }
535 }
536
537 static void collect_pages(struct page_collection *pc)
538 {
539         INIT_LIST_HEAD(&pc->pc_pages);
540
541         if (libcfs_panic_in_progress)
542                 panic_collect_pages(pc);
543         else
544                 collect_pages_on_all_cpus(pc);
545 }
546
547 static void put_pages_back_on_all_cpus(struct page_collection *pc)
548 {
549         struct cfs_trace_cpu_data *tcd;
550         struct list_head *cur_head;
551         struct cfs_trace_page *tage;
552         struct cfs_trace_page *tmp;
553         int i, cpu;
554
555         for_each_possible_cpu(cpu) {
556                 cfs_tcd_for_each_type_lock(tcd, i, cpu) {
557                         cur_head = tcd->tcd_pages.next;
558
559                         list_for_each_entry_safe(tage, tmp, &pc->pc_pages,
560                                                  linkage) {
561
562                                 __LASSERT_TAGE_INVARIANT(tage);
563
564                                 if (tage->cpu != cpu || tage->type != i)
565                                         continue;
566
567                                 cfs_tage_to_tail(tage, cur_head);
568                                 tcd->tcd_cur_pages++;
569                         }
570                 }
571         }
572 }
573
574 static void put_pages_back(struct page_collection *pc)
575 {
576         if (!libcfs_panic_in_progress)
577                 put_pages_back_on_all_cpus(pc);
578 }
579
580 /* Add pages to a per-cpu debug daemon ringbuffer.  This buffer makes sure that
581  * we have a good amount of data at all times for dumping during an LBUG, even
582  * if we have been steadily writing (and otherwise discarding) pages via the
583  * debug daemon. */
584 static void put_pages_on_tcd_daemon_list(struct page_collection *pc,
585                                          struct cfs_trace_cpu_data *tcd)
586 {
587         struct cfs_trace_page *tage;
588         struct cfs_trace_page *tmp;
589
590         list_for_each_entry_safe(tage, tmp, &pc->pc_pages, linkage) {
591
592                 __LASSERT_TAGE_INVARIANT(tage);
593
594                 if (tage->cpu != tcd->tcd_cpu || tage->type != tcd->tcd_type)
595                         continue;
596
597                 cfs_tage_to_tail(tage, &tcd->tcd_daemon_pages);
598                 tcd->tcd_cur_daemon_pages++;
599
600                 if (tcd->tcd_cur_daemon_pages > tcd->tcd_max_pages) {
601                         struct cfs_trace_page *victim;
602
603                         __LASSERT(!list_empty(&tcd->tcd_daemon_pages));
604                         victim = cfs_tage_from_list(tcd->tcd_daemon_pages.next);
605
606                         __LASSERT_TAGE_INVARIANT(victim);
607
608                         list_del(&victim->linkage);
609                         cfs_tage_free(victim);
610                         tcd->tcd_cur_daemon_pages--;
611                 }
612         }
613 }
614
615 static void put_pages_on_daemon_list(struct page_collection *pc)
616 {
617         struct cfs_trace_cpu_data *tcd;
618         int i, cpu;
619
620         for_each_possible_cpu(cpu) {
621                 cfs_tcd_for_each_type_lock(tcd, i, cpu)
622                         put_pages_on_tcd_daemon_list(pc, tcd);
623         }
624 }
625
626 void cfs_trace_debug_print(void)
627 {
628         struct page_collection pc;
629         struct cfs_trace_page *tage;
630         struct cfs_trace_page *tmp;
631
632         pc.pc_want_daemon_pages = 1;
633         collect_pages(&pc);
634         list_for_each_entry_safe(tage, tmp, &pc.pc_pages, linkage) {
635                 char *p, *file, *fn;
636                 struct page *page;
637
638                 __LASSERT_TAGE_INVARIANT(tage);
639
640                 page = tage->page;
641                 p = page_address(page);
642                 while (p < ((char *)page_address(page) + tage->used)) {
643                         struct ptldebug_header *hdr;
644                         int len;
645
646                         hdr = (void *)p;
647                         p += sizeof(*hdr);
648                         file = p;
649                         p += strlen(file) + 1;
650                         fn = p;
651                         p += strlen(fn) + 1;
652                         len = hdr->ph_len - (int)(p - (char *)hdr);
653
654                         cfs_print_to_console(hdr, D_EMERG, p, len, file, fn);
655
656                         p += len;
657                 }
658
659                 list_del(&tage->linkage);
660                 cfs_tage_free(tage);
661         }
662 }
663
664 int cfs_tracefile_dump_all_pages(char *filename)
665 {
666         struct page_collection  pc;
667         struct file             *filp;
668         struct cfs_trace_page   *tage;
669         struct cfs_trace_page   *tmp;
670         char                    *buf;
671         int rc;
672
673         DECL_MMSPACE;
674
675         cfs_tracefile_write_lock();
676
677         filp = filp_open(filename, O_CREAT|O_EXCL|O_WRONLY|O_LARGEFILE, 0600);
678         if (IS_ERR(filp)) {
679                 rc = PTR_ERR(filp);
680                 filp = NULL;
681                 pr_err("LustreError: can't open %s for dump: rc %d\n",
682                         filename, rc);
683                 goto out;
684         }
685
686         pc.pc_want_daemon_pages = 1;
687         collect_pages(&pc);
688         if (list_empty(&pc.pc_pages)) {
689                 rc = 0;
690                 goto close;
691         }
692
693         /* ok, for now, just write the pages.  in the future we'll be building
694          * iobufs with the pages and calling generic_direct_IO */
695         MMSPACE_OPEN;
696         list_for_each_entry_safe(tage, tmp, &pc.pc_pages, linkage) {
697
698                 __LASSERT_TAGE_INVARIANT(tage);
699
700                 buf = kmap(tage->page);
701                 rc = vfs_write(filp, (__force const char __user *)buf,
702                                tage->used, &filp->f_pos);
703                 kunmap(tage->page);
704
705                 if (rc != (int)tage->used) {
706                         printk(KERN_WARNING "wanted to write %u but wrote %d\n",
707                                tage->used, rc);
708                         put_pages_back(&pc);
709                         __LASSERT(list_empty(&pc.pc_pages));
710                         break;
711                 }
712                 list_del(&tage->linkage);
713                 cfs_tage_free(tage);
714         }
715         MMSPACE_CLOSE;
716         rc = vfs_fsync(filp, 1);
717         if (rc)
718                 pr_err("sync returns %d\n", rc);
719 close:
720         filp_close(filp, NULL);
721 out:
722         cfs_tracefile_write_unlock();
723         return rc;
724 }
725
726 void cfs_trace_flush_pages(void)
727 {
728         struct page_collection pc;
729         struct cfs_trace_page *tage;
730         struct cfs_trace_page *tmp;
731
732         pc.pc_want_daemon_pages = 1;
733         collect_pages(&pc);
734         list_for_each_entry_safe(tage, tmp, &pc.pc_pages, linkage) {
735
736                 __LASSERT_TAGE_INVARIANT(tage);
737
738                 list_del(&tage->linkage);
739                 cfs_tage_free(tage);
740         }
741 }
742
743 int cfs_trace_copyin_string(char *knl_buffer, int knl_buffer_nob,
744                             const char __user *usr_buffer, int usr_buffer_nob)
745 {
746         int    nob;
747
748         if (usr_buffer_nob > knl_buffer_nob)
749                 return -EOVERFLOW;
750
751         if (copy_from_user((void *)knl_buffer,
752                            usr_buffer, usr_buffer_nob))
753                 return -EFAULT;
754
755         nob = strnlen(knl_buffer, usr_buffer_nob);
756         while (nob-- >= 0)                    /* strip trailing whitespace */
757                 if (!isspace(knl_buffer[nob]))
758                         break;
759
760         if (nob < 0)                        /* empty string */
761                 return -EINVAL;
762
763         if (nob == knl_buffer_nob)            /* no space to terminate */
764                 return -EOVERFLOW;
765
766         knl_buffer[nob + 1] = 0;                /* terminate */
767         return 0;
768 }
769 EXPORT_SYMBOL(cfs_trace_copyin_string);
770
771 int cfs_trace_copyout_string(char __user *usr_buffer, int usr_buffer_nob,
772                              const char *knl_buffer, char *append)
773 {
774         /* NB if 'append' != NULL, it's a single character to append to the
775          * copied out string - usually "\n", for /proc entries and "" (i.e. a
776          * terminating zero byte) for sysctl entries */
777         int   nob = strlen(knl_buffer);
778
779         if (nob > usr_buffer_nob)
780                 nob = usr_buffer_nob;
781
782         if (copy_to_user(usr_buffer, knl_buffer, nob))
783                 return -EFAULT;
784
785         if (append != NULL && nob < usr_buffer_nob) {
786                 if (copy_to_user(usr_buffer + nob, append, 1))
787                         return -EFAULT;
788
789                 nob++;
790         }
791
792         return nob;
793 }
794 EXPORT_SYMBOL(cfs_trace_copyout_string);
795
796 int cfs_trace_allocate_string_buffer(char **str, int nob)
797 {
798         if (nob > 2 * PAGE_CACHE_SIZE)      /* string must be "sensible" */
799                 return -EINVAL;
800
801         *str = kmalloc(nob, GFP_KERNEL | __GFP_ZERO);
802         if (*str == NULL)
803                 return -ENOMEM;
804
805         return 0;
806 }
807
808 int cfs_trace_dump_debug_buffer_usrstr(void __user *usr_str, int usr_str_nob)
809 {
810         char     *str;
811         int        rc;
812
813         rc = cfs_trace_allocate_string_buffer(&str, usr_str_nob + 1);
814         if (rc != 0)
815                 return rc;
816
817         rc = cfs_trace_copyin_string(str, usr_str_nob + 1,
818                                      usr_str, usr_str_nob);
819         if (rc != 0)
820                 goto out;
821
822         if (str[0] != '/') {
823                 rc = -EINVAL;
824                 goto out;
825         }
826         rc = cfs_tracefile_dump_all_pages(str);
827 out:
828         kfree(str);
829         return rc;
830 }
831
832 int cfs_trace_daemon_command(char *str)
833 {
834         int       rc = 0;
835
836         cfs_tracefile_write_lock();
837
838         if (strcmp(str, "stop") == 0) {
839                 cfs_tracefile_write_unlock();
840                 cfs_trace_stop_thread();
841                 cfs_tracefile_write_lock();
842                 memset(cfs_tracefile, 0, sizeof(cfs_tracefile));
843
844         } else if (strncmp(str, "size=", 5) == 0) {
845                 cfs_tracefile_size = simple_strtoul(str + 5, NULL, 0);
846                 if (cfs_tracefile_size < 10 || cfs_tracefile_size > 20480)
847                         cfs_tracefile_size = CFS_TRACEFILE_SIZE;
848                 else
849                         cfs_tracefile_size <<= 20;
850
851         } else if (strlen(str) >= sizeof(cfs_tracefile)) {
852                 rc = -ENAMETOOLONG;
853         } else if (str[0] != '/') {
854                 rc = -EINVAL;
855         } else {
856                 strcpy(cfs_tracefile, str);
857
858                 printk(KERN_INFO
859                        "Lustre: debug daemon will attempt to start writing to %s (%lukB max)\n",
860                        cfs_tracefile,
861                        (long)(cfs_tracefile_size >> 10));
862
863                 cfs_trace_start_thread();
864         }
865
866         cfs_tracefile_write_unlock();
867         return rc;
868 }
869
870 int cfs_trace_daemon_command_usrstr(void __user *usr_str, int usr_str_nob)
871 {
872         char *str;
873         int   rc;
874
875         rc = cfs_trace_allocate_string_buffer(&str, usr_str_nob + 1);
876         if (rc != 0)
877                 return rc;
878
879         rc = cfs_trace_copyin_string(str, usr_str_nob + 1,
880                                  usr_str, usr_str_nob);
881         if (rc == 0)
882                 rc = cfs_trace_daemon_command(str);
883
884         kfree(str);
885         return rc;
886 }
887
888 int cfs_trace_set_debug_mb(int mb)
889 {
890         int i;
891         int j;
892         int pages;
893         int limit = cfs_trace_max_debug_mb();
894         struct cfs_trace_cpu_data *tcd;
895
896         if (mb < num_possible_cpus()) {
897                 printk(KERN_WARNING
898                        "Lustre: %d MB is too small for debug buffer size, setting it to %d MB.\n",
899                        mb, num_possible_cpus());
900                 mb = num_possible_cpus();
901         }
902
903         if (mb > limit) {
904                 printk(KERN_WARNING
905                        "Lustre: %d MB is too large for debug buffer size, setting it to %d MB.\n",
906                        mb, limit);
907                 mb = limit;
908         }
909
910         mb /= num_possible_cpus();
911         pages = mb << (20 - PAGE_CACHE_SHIFT);
912
913         cfs_tracefile_write_lock();
914
915         cfs_tcd_for_each(tcd, i, j)
916                 tcd->tcd_max_pages = (pages * tcd->tcd_pages_factor) / 100;
917
918         cfs_tracefile_write_unlock();
919
920         return 0;
921 }
922
923 int cfs_trace_get_debug_mb(void)
924 {
925         int i;
926         int j;
927         struct cfs_trace_cpu_data *tcd;
928         int total_pages = 0;
929
930         cfs_tracefile_read_lock();
931
932         cfs_tcd_for_each(tcd, i, j)
933                 total_pages += tcd->tcd_max_pages;
934
935         cfs_tracefile_read_unlock();
936
937         return (total_pages >> (20 - PAGE_CACHE_SHIFT)) + 1;
938 }
939
/*
 * Body of the trace daemon thread.
 *
 * @arg: the struct tracefiled_ctl used to synchronise with the code that
 *       starts and stops the thread.
 *
 * Each pass collects trace pages with collect_pages() and, if a trace file
 * name is set in cfs_tracefile, writes the used part of every page to that
 * file.  Between passes the thread sleeps on tctl_waitq for up to
 * cfs_time_seconds(1).  Once tctl_shutdown is set, one more pass is made
 * and the thread then completes tctl_stop and returns 0.
 */
static int tracefiled(void *arg)
{
        struct page_collection pc;
        struct tracefiled_ctl *tctl = arg;
        struct cfs_trace_page *tage;
        struct cfs_trace_page *tmp;
        struct file *filp;
        char *buf;
        int last_loop = 0;
        int rc;

        DECL_MMSPACE;

        /* we're started late enough that we pick up init's fs context */
        /* this is so broken in uml?  what on earth is going on? */

        /* tell the starter that the thread is up */
        complete(&tctl->tctl_start);

        while (1) {
                wait_queue_t __wait;

                pc.pc_want_daemon_pages = 0;
                collect_pages(&pc);
                if (list_empty(&pc.pc_pages))
                        goto end_loop;

                /* open the trace file, if one is configured */
                filp = NULL;
                cfs_tracefile_read_lock();
                if (cfs_tracefile[0] != 0) {
                        filp = filp_open(cfs_tracefile,
                                         O_CREAT | O_RDWR | O_LARGEFILE,
                                         0600);
                        if (IS_ERR(filp)) {
                                rc = PTR_ERR(filp);
                                filp = NULL;
                                printk(KERN_WARNING "couldn't open %s: %d\n",
                                       cfs_tracefile, rc);
                        }
                }
                cfs_tracefile_read_unlock();
                if (filp == NULL) {
                        /* nothing to write to: hand the pages over unwritten */
                        put_pages_on_daemon_list(&pc);
                        __LASSERT(list_empty(&pc.pc_pages));
                        goto end_loop;
                }

                /*
                 * NOTE(review): buf below is a kernel address passed to
                 * vfs_write() as a __user pointer; MMSPACE_OPEN/CLOSE
                 * presumably adjust the address limit to permit that -
                 * confirm against their definition.
                 */
                MMSPACE_OPEN;

                list_for_each_entry_safe(tage, tmp, &pc.pc_pages,
                                                   linkage) {
                        /*
                         * static: the write offset is kept across pages,
                         * passes and re-opens of the file.
                         */
                        static loff_t f_pos;

                        __LASSERT_TAGE_INVARIANT(tage);

                        /*
                         * Wrap to the start once the size limit is reached;
                         * otherwise never write beyond the current end of
                         * the file.
                         */
                        if (f_pos >= (off_t)cfs_tracefile_size)
                                f_pos = 0;
                        else if (f_pos > i_size_read(file_inode(filp)))
                                f_pos = i_size_read(file_inode(filp));

                        buf = kmap(tage->page);
                        rc = vfs_write(filp, (__force const char __user *)buf,
                                       tage->used, &f_pos);
                        kunmap(tage->page);

                        if (rc != (int)tage->used) {
                                /* short or failed write: give up on this batch */
                                printk(KERN_WARNING "wanted to write %u but wrote %d\n",
                                       tage->used, rc);
                                put_pages_back(&pc);
                                __LASSERT(list_empty(&pc.pc_pages));
                                break;
                        }
                }
                MMSPACE_CLOSE;

                filp_close(filp, NULL);
                put_pages_on_daemon_list(&pc);
                if (!list_empty(&pc.pc_pages)) {
                        int i;

                        /*
                         * Pages are still left in the collection: log the
                         * CPU states and the CPU of every leftover page
                         * before the assertion below is reached.
                         */
                        printk(KERN_ALERT "Lustre: trace pages aren't empty\n");
                        pr_err("total cpus(%d): ",
                                num_possible_cpus());
                        for (i = 0; i < num_possible_cpus(); i++)
                                if (cpu_online(i))
                                        pr_cont("%d(on) ", i);
                                else
                                        pr_cont("%d(off) ", i);
                        pr_cont("\n");

                        i = 0;
                        list_for_each_entry_safe(tage, tmp, &pc.pc_pages,
                                                     linkage)
                                pr_err("page %d belongs to cpu %d\n",
                                        ++i, tage->cpu);
                        pr_err("There are %d pages unwritten\n", i);
                }
                __LASSERT(list_empty(&pc.pc_pages));
end_loop:
                if (atomic_read(&tctl->tctl_shutdown)) {
                        /* run one more pass after shutdown is requested */
                        if (last_loop == 0) {
                                last_loop = 1;
                                continue;
                        } else {
                                break;
                        }
                }
                /* sleep until woken on tctl_waitq or the timeout expires */
                init_waitqueue_entry(&__wait, current);
                add_wait_queue(&tctl->tctl_waitq, &__wait);
                set_current_state(TASK_INTERRUPTIBLE);
                schedule_timeout(cfs_time_seconds(1));
                remove_wait_queue(&tctl->tctl_waitq, &__wait);
        }
        /* tell cfs_trace_stop_thread() that the thread is done */
        complete(&tctl->tctl_stop);
        return 0;
}
1055
1056 int cfs_trace_start_thread(void)
1057 {
1058         struct tracefiled_ctl *tctl = &trace_tctl;
1059         struct task_struct *task;
1060         int rc = 0;
1061
1062         mutex_lock(&cfs_trace_thread_mutex);
1063         if (thread_running)
1064                 goto out;
1065
1066         init_completion(&tctl->tctl_start);
1067         init_completion(&tctl->tctl_stop);
1068         init_waitqueue_head(&tctl->tctl_waitq);
1069         atomic_set(&tctl->tctl_shutdown, 0);
1070
1071         task = kthread_run(tracefiled, tctl, "ktracefiled");
1072         if (IS_ERR(task)) {
1073                 rc = PTR_ERR(task);
1074                 goto out;
1075         }
1076
1077         wait_for_completion(&tctl->tctl_start);
1078         thread_running = 1;
1079 out:
1080         mutex_unlock(&cfs_trace_thread_mutex);
1081         return rc;
1082 }
1083
1084 void cfs_trace_stop_thread(void)
1085 {
1086         struct tracefiled_ctl *tctl = &trace_tctl;
1087
1088         mutex_lock(&cfs_trace_thread_mutex);
1089         if (thread_running) {
1090                 printk(KERN_INFO
1091                        "Lustre: shutting down debug daemon thread...\n");
1092                 atomic_set(&tctl->tctl_shutdown, 1);
1093                 wait_for_completion(&tctl->tctl_stop);
1094                 thread_running = 0;
1095         }
1096         mutex_unlock(&cfs_trace_thread_mutex);
1097 }
1098
1099 int cfs_tracefile_init(int max_pages)
1100 {
1101         struct cfs_trace_cpu_data *tcd;
1102         int                 i;
1103         int                 j;
1104         int                 rc;
1105         int                 factor;
1106
1107         rc = cfs_tracefile_init_arch();
1108         if (rc != 0)
1109                 return rc;
1110
1111         cfs_tcd_for_each(tcd, i, j) {
1112                 /* tcd_pages_factor is initialized int tracefile_init_arch. */
1113                 factor = tcd->tcd_pages_factor;
1114                 INIT_LIST_HEAD(&tcd->tcd_pages);
1115                 INIT_LIST_HEAD(&tcd->tcd_stock_pages);
1116                 INIT_LIST_HEAD(&tcd->tcd_daemon_pages);
1117                 tcd->tcd_cur_pages = 0;
1118                 tcd->tcd_cur_stock_pages = 0;
1119                 tcd->tcd_cur_daemon_pages = 0;
1120                 tcd->tcd_max_pages = (max_pages * factor) / 100;
1121                 LASSERT(tcd->tcd_max_pages > 0);
1122                 tcd->tcd_shutting_down = 0;
1123         }
1124
1125         return 0;
1126 }
1127
1128 static void trace_cleanup_on_all_cpus(void)
1129 {
1130         struct cfs_trace_cpu_data *tcd;
1131         struct cfs_trace_page *tage;
1132         struct cfs_trace_page *tmp;
1133         int i, cpu;
1134
1135         for_each_possible_cpu(cpu) {
1136                 cfs_tcd_for_each_type_lock(tcd, i, cpu) {
1137                         tcd->tcd_shutting_down = 1;
1138
1139                         list_for_each_entry_safe(tage, tmp, &tcd->tcd_pages,
1140                                                            linkage) {
1141                                 __LASSERT_TAGE_INVARIANT(tage);
1142
1143                                 list_del(&tage->linkage);
1144                                 cfs_tage_free(tage);
1145                         }
1146
1147                         tcd->tcd_cur_pages = 0;
1148                 }
1149         }
1150 }
1151
/*
 * Release all tracing resources: free the trace pages of every CPU, then
 * run the architecture-specific teardown.
 */
static void cfs_trace_cleanup(void)
{
        trace_cleanup_on_all_cpus();

        cfs_tracefile_fini_arch();
}
1162
/*
 * Shut tracing down: stop the daemon thread first, then free the trace
 * buffers.
 */
void cfs_tracefile_exit(void)
{
        /* the thread must be stopped before the cleanup runs */
        cfs_trace_stop_thread();

        cfs_trace_cleanup();
}