--- /dev/null
+/*
+ * Blockconsole - write kernel console to a block device
+ *
+ * Copyright (C) 2012 Joern Engel <joern@logfs.org>
+ */
+#include <linux/bio.h>
+#include <linux/blockconsole.h>
+#include <linux/console.h>
+#include <linux/fs.h>
+#include <linux/kref.h>
+#include <linux/kthread.h>
+#include <linux/mm.h>
+#include <linux/mount.h>
+#include <linux/random.h>
+#include <linux/slab.h>
+#include <linux/string.h>
+#include <linux/workqueue.h>
+#include <linux/sched.h>
+#include <linux/ctype.h>
+
+/* Magic strings that open a header sector; both are 32 bytes long. */
+#define BLOCKCONSOLE_MAGIC_OLD "\nLinux blockconsole version 1.0\n"
+#define BLOCKCONSOLE_MAGIC "\nLinux blockconsole version 1.1\n"
+/*
+ * Header layout as written by bcon_init_first_page(): the magic is followed
+ * by three fields (uuid, round, tile), each 8 hex digits plus a newline.
+ * BCON_HEADERSIZE covers magic + uuid + round; BCON_LONG_HEADERSIZE also
+ * covers the tile field.
+ */
+#define BCON_UUID_OFS (32)
+#define BCON_ROUND_OFS (41)
+#define BCON_TILE_OFS (50)
+#define BCON_HEADERSIZE (50)
+#define BCON_LONG_HEADERSIZE (59) /* with tile index */
+
+/*
+ * Geometry of the in-memory cache: PAGE_COUNT pages, split into 512-byte
+ * sectors. The *_MASK values are used to wrap byte and sector positions
+ * into the cache.
+ */
+#define PAGE_COUNT (256)
+#define SECTOR_COUNT (PAGE_COUNT * (PAGE_SIZE >> 9))
+#define CACHE_PAGE_MASK (PAGE_COUNT - 1)
+#define CACHE_SECTOR_MASK (SECTOR_COUNT - 1)
+#define CACHE_SIZE (PAGE_COUNT << PAGE_SHIFT)
+#define CACHE_MASK (CACHE_SIZE - 1)
+#define SECTOR_SHIFT (9)
+#define SECTOR_SIZE (1 << SECTOR_SHIFT)
+/* Note: SECTOR_MASK keeps the sector-aligned part; ~SECTOR_MASK is the offset */
+#define SECTOR_MASK (~(SECTOR_SIZE-1))
+/* Index of a sector within its page */
+#define PG_SECTOR_MASK ((PAGE_SIZE >> 9) - 1)
+
+/* Prefix all pr_*() messages from this file with the module name */
+#undef pr_fmt
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+/*
+ * A preallocated bio bundled with its single bio_vec, so that writes can be
+ * issued without allocating memory.
+ */
+struct bcon_bio {
+ struct bio bio;
+ struct bio_vec bvec;
+ /* address of the cache sector (set by bcon_init_bios(); unset for zero_bios) */
+ void *sector;
+ /* set to 1 right before submit_bio(); cleared again in bcon_end_io() */
+ int in_flight;
+};
+
+/*
+ * State of one block device that is used as a console.
+ */
+struct blockconsole {
+ /* device name as passed to bcon_create(), used in log messages */
+ char devname[32];
+ /* serializes the error accounting and sector reset in bcon_end_io() */
+ spinlock_t end_io_lock;
+ /* fires bcon_pad() one second after the last bcon_write() */
+ struct timer_list pad_timer;
+ /* write errors seen by bcon_end_io(); see BCON_MAX_ERRORS */
+ int error_count;
+ struct kref kref;
+ /* byte position up to which console output has been copied into the cache */
+ u64 console_bytes;
+ /* byte position up to which cache sectors have been submitted for writing */
+ u64 write_bytes;
+ /* usable device size: i_size rounded down to a multiple of CACHE_SIZE */
+ u64 max_bytes;
+ /* incremented each time write_bytes wraps around to 0 */
+ u32 round;
+ /* identifier stored in the header; random for converted devices */
+ u32 uuid;
+ /* one bio per 512-byte cache sector */
+ struct bcon_bio bio_array[SECTOR_COUNT];
+ /* order-8 page allocation backing the cache */
+ struct page *pages;
+ /* one bio per page, all pointing at zero_page; used by bcon_erase_segment() */
+ struct bcon_bio zero_bios[PAGE_COUNT];
+ struct page *zero_page;
+ struct block_device *bdev;
+ struct console console;
+ /* scheduled from bcon_end_io() once too many errors accumulated */
+ struct work_struct unregister_work;
+ /* kthread running bcon_writeback() */
+ struct task_struct *writeback_thread;
+ struct notifier_block panic_block;
+};
+
+/* Take an additional reference on @bc. */
+static void bcon_get(struct blockconsole *bc)
+{
+ kref_get(&bc->kref);
+}
+
+/*
+ * kref release callback, invoked through bcon_put() when the last reference
+ * goes away. Frees the zero page and the cache pages, invalidates the whole
+ * page-cache mapping of the block device, releases the device and finally
+ * frees the blockconsole itself.
+ */
+static void bcon_release(struct kref *kref)
+{
+ struct blockconsole *bc = container_of(kref, struct blockconsole, kref);
+
+ __free_pages(bc->zero_page, 0);
+ /* order 8 matches the alloc_pages() call in bcon_create() */
+ __free_pages(bc->pages, 8);
+ invalidate_mapping_pages(bc->bdev->bd_inode->i_mapping, 0, -1);
+ blkdev_put(bc->bdev, FMODE_READ|FMODE_WRITE);
+ kfree(bc);
+}
+
+/* Drop a reference on @bc; the last put tears it down via bcon_release(). */
+static void bcon_put(struct blockconsole *bc)
+{
+ kref_put(&bc->kref, bcon_release);
+}
+
+/* Return the byte offset of position @console_bytes inside its sector. */
+static int __bcon_console_ofs(u64 console_bytes)
+{
+	const u64 in_sector_mask = SECTOR_SIZE - 1;
+
+	return console_bytes & in_sector_mask;
+}
+
+/* Byte offset of the current console position within its sector. */
+static int bcon_console_ofs(struct blockconsole *bc)
+{
+ return __bcon_console_ofs(bc->console_bytes);
+}
+
+/* Return the index of the cache sector that holds position @console_bytes. */
+static int __bcon_console_sector(u64 console_bytes)
+{
+	u64 sector_nr = console_bytes >> SECTOR_SHIFT;
+
+	/* the cache is a ring of SECTOR_COUNT sectors */
+	return sector_nr & CACHE_SECTOR_MASK;
+}
+
+/* Index of the cache sector the console is currently filling. */
+static int bcon_console_sector(struct blockconsole *bc)
+{
+ return __bcon_console_sector(bc->console_bytes);
+}
+
+/* Index of the cache sector that is next in line to be written out. */
+static int bcon_write_sector(struct blockconsole *bc)
+{
+	/* same position-to-sector mapping as for the console position */
+	return __bcon_console_sector(bc->write_bytes);
+}
+
+/*
+ * Reset one 512-byte sector to padding: spaces everywhere, with a newline
+ * in the last byte.
+ */
+static void clear_sector(void *sector)
+{
+	char *bytes = sector;
+
+	memset(bytes, ' ', SECTOR_SIZE - 1);
+	bytes[SECTOR_SIZE - 1] = '\n';
+}
+
+/*
+ * Write a fresh header into the first sector of the cache: the magic string
+ * followed by uuid, round and tile index, each as 8 hex digits terminated by
+ * a newline. The rest of the sector is padding (see clear_sector()).
+ *
+ * NOTE(review): the tile index is computed from bc->console_bytes as it is
+ * at call time. bcon_advance_console_bytes() calls this before it has stored
+ * the new position, so the value may refer to the previous position - confirm
+ * that this is what readers of the log expect.
+ */
+static void bcon_init_first_page(struct blockconsole *bc)
+{
+ char *buf = page_address(bc->pages);
+ size_t len = strlen(BLOCKCONSOLE_MAGIC);
+ u32 tile = bc->console_bytes >> 20; /* We overflow after 4TB - fine */
+
+ clear_sector(buf);
+ memcpy(buf, BLOCKCONSOLE_MAGIC, len);
+ /* each sprintf() writes 8 hex digits plus a NUL at offset + 8 */
+ sprintf(buf + BCON_UUID_OFS, "%08x", bc->uuid);
+ sprintf(buf + BCON_ROUND_OFS, "%08x", bc->round);
+ sprintf(buf + BCON_TILE_OFS, "%08x", tile);
+ /* replace NUL with newline */
+ buf[BCON_UUID_OFS + 8] = 10;
+ buf[BCON_ROUND_OFS + 8] = 10;
+ buf[BCON_TILE_OFS + 8] = 10;
+}
+
+/*
+ * Move the console position forward by @bytes without taking a lock.
+ *
+ * The new position wraps to 0 once it reaches max_bytes. Whenever it lands
+ * exactly on a CACHE_SIZE boundary, a header is written into the first cache
+ * sector and the position skips past it (BCON_LONG_HEADERSIZE bytes).
+ * The update is published with cmpxchg64() and retried if console_bytes
+ * changed in the meantime; bcon_init_first_page() may therefore run more
+ * than once per call.
+ */
+static void bcon_advance_console_bytes(struct blockconsole *bc, int bytes)
+{
+ u64 old, new;
+
+ do {
+ old = bc->console_bytes;
+ new = old + bytes;
+ if (new >= bc->max_bytes)
+ new = 0;
+ if ((new & CACHE_MASK) == 0) {
+ bcon_init_first_page(bc);
+ new += BCON_LONG_HEADERSIZE;
+ }
+ } while (cmpxchg64(&bc->console_bytes, old, new) != old);
+}
+
+/*
+ * bio completion handler for sync_read(): signals the completion that was
+ * stored in bio->bi_private. The error code is not looked at here.
+ */
+static void request_complete(struct bio *bio, int err)
+{
+ complete((struct completion *)bio->bi_private);
+}
+
+/*
+ * Synchronously read the sector at byte offset @ofs of the device into the
+ * first sector of the cache (the start of bc->pages).
+ *
+ * Returns 0 on success or -EIO if the bio did not complete up to date.
+ * Sleeps until the read has finished, so it must not be called from atomic
+ * context.
+ */
+static int sync_read(struct blockconsole *bc, u64 ofs)
+{
+	struct bio bio;
+	struct bio_vec bio_vec;
+	/*
+	 * Completions living on the stack have to be declared with the
+	 * _ONSTACK variant so that lockdep gets a proper key for them. The
+	 * variable is also no longer called "complete", which used to shadow
+	 * the complete() function.
+	 */
+	DECLARE_COMPLETION_ONSTACK(done);
+
+	bio_init(&bio);
+	bio.bi_io_vec = &bio_vec;
+	bio_vec.bv_page = bc->pages;
+	bio_vec.bv_len = SECTOR_SIZE;
+	bio_vec.bv_offset = 0;
+	bio.bi_vcnt = 1;
+	bio.bi_idx = 0;
+	bio.bi_size = SECTOR_SIZE;
+	bio.bi_bdev = bc->bdev;
+	bio.bi_sector = ofs >> SECTOR_SHIFT;
+	bio.bi_private = &done;
+	bio.bi_end_io = request_complete;
+
+	submit_bio(READ, &bio);
+	wait_for_completion(&done);
+	return test_bit(BIO_UPTODATE, &bio.bi_flags) ? 0 : -EIO;
+}
+
+/*
+ * Completion handler for the zero-page writes issued by
+ * bcon_erase_segment(). Its only job is to mark the bio as reusable again;
+ * without it in_flight would stay set forever and every zero bio could be
+ * submitted exactly once. Erase errors are deliberately not counted: a
+ * failed erase only makes the log look messy.
+ */
+static void bcon_erase_end_io(struct bio *bio, int err)
+{
+	struct bcon_bio *bcon_bio = container_of(bio, struct bcon_bio, bio);
+
+	bcon_bio->in_flight = 0;
+}
+
+/*
+ * Overwrite the CACHE_SIZE bytes of the device starting at bc->write_bytes
+ * with zeroes, one page-sized bio at a time. The writes are asynchronous;
+ * bios that are still busy from a previous erase are skipped.
+ *
+ * No reference on @bc is taken for these bios, because this function is
+ * also called from bcon_find_end_of_log() before the kref is initialized.
+ */
+static void bcon_erase_segment(struct blockconsole *bc)
+{
+	int i;
+
+	for (i = 0; i < PAGE_COUNT; i++) {
+		struct bcon_bio *bcon_bio = bc->zero_bios + i;
+		struct bio *bio = &bcon_bio->bio;
+
+		/*
+		 * If the last erase hasn't finished yet, just skip it. The log
+		 * will look messy, but that's all.
+		 */
+		rmb();
+		if (bcon_bio->in_flight)
+			continue;
+		bio_init(bio);
+		bio->bi_io_vec = &bcon_bio->bvec;
+		bio->bi_vcnt = 1;
+		bio->bi_size = PAGE_SIZE;
+		bio->bi_bdev = bc->bdev;
+		bio->bi_private = bc;
+		/* bio_init() cleared the handler, so set it on every reuse */
+		bio->bi_end_io = bcon_erase_end_io;
+		bio->bi_idx = 0;
+		bio->bi_sector = (bc->write_bytes + i * PAGE_SIZE) >> 9;
+		bcon_bio->in_flight = 1;
+		wmb();
+		/* We want the erase to go to the device first somehow */
+		submit_bio(WRITE | REQ_SOFTBARRIER, bio);
+	}
+}
+
+/*
+ * Move the writeback position forward by @bytes. When it reaches max_bytes
+ * it wraps to 0, the header in the first cache sector is rewritten and the
+ * round counter is incremented.
+ *
+ * NOTE(review): the header is generated before round is incremented, so it
+ * carries the old round number - confirm this ordering is intended.
+ */
+static void bcon_advance_write_bytes(struct blockconsole *bc, int bytes)
+{
+ bc->write_bytes += bytes;
+ if (bc->write_bytes >= bc->max_bytes) {
+ bc->write_bytes = 0;
+ bcon_init_first_page(bc);
+ bc->round++;
+ }
+}
+
+/*
+ * Start over on a device that carries the version 1.0 magic: pick a random
+ * uuid, reset the round counter and both positions to 0, write a new header
+ * into the cache and erase the first segment of the device.
+ * Always returns 0.
+ */
+static int bcon_convert_old_format(struct blockconsole *bc)
+{
+ bc->uuid = get_random_int();
+ bc->round = 0;
+ bc->console_bytes = bc->write_bytes = 0;
+ bcon_advance_console_bytes(bc, 0); /* To skip the header */
+ bcon_advance_write_bytes(bc, 0); /* To wrap around, if necessary */
+ bcon_erase_segment(bc);
+ pr_info("converted %s from old format\n", bc->devname);
+ return 0;
+}
+
+/*
+ * Locate the position where logging should continue and set console_bytes
+ * and write_bytes accordingly.
+ *
+ * The header at offset 0 supplies uuid and round. A binary search over
+ * CACHE_SIZE-aligned offsets then looks for the first segment whose header
+ * (magic, uuid and round, i.e. BCON_HEADERSIZE bytes) differs from the one
+ * at offset 0; that segment has not been written in the current round.
+ * Devices with the old magic are converted instead.
+ *
+ * Returns 0 on success or the error from sync_read().
+ *
+ * NOTE(review): a return value of 0 from bcon_magic_present() (no valid
+ * magic) is not rejected here; uuid and round are then parsed from whatever
+ * the sector contains.
+ */
+static int bcon_find_end_of_log(struct blockconsole *bc)
+{
+ u64 start = 0, end = bc->max_bytes, middle;
+ /* sync_read() always lands in sec0; sec1 keeps a copy of the first header */
+ void *sec0 = bc->bio_array[0].sector;
+ void *sec1 = bc->bio_array[1].sector;
+ int err, version;
+
+ err = sync_read(bc, 0);
+ if (err)
+ return err;
+ /* Second sanity check, out of sheer paranoia */
+ version = bcon_magic_present(sec0);
+ if (version == 10)
+ return bcon_convert_old_format(bc);
+
+ bc->uuid = simple_strtoull(sec0 + BCON_UUID_OFS, NULL, 16);
+ bc->round = simple_strtoull(sec0 + BCON_ROUND_OFS, NULL, 16);
+
+ memcpy(sec1, sec0, BCON_HEADERSIZE);
+ for (;;) {
+ middle = (start + end) / 2;
+ /* only segment boundaries carry a header */
+ middle &= ~CACHE_MASK;
+ if (middle == start)
+ break;
+ err = sync_read(bc, middle);
+ if (err)
+ return err;
+ if (memcmp(sec1, sec0, BCON_HEADERSIZE)) {
+ /* If the two differ, we haven't written that far yet */
+ end = middle;
+ } else {
+ start = middle;
+ }
+ }
+ bc->console_bytes = bc->write_bytes = end;
+ bcon_advance_console_bytes(bc, 0); /* To skip the header */
+ bcon_advance_write_bytes(bc, 0); /* To wrap around, if necessary */
+ bcon_erase_segment(bc);
+ return 0;
+}
+
+/*
+ * Work item that shuts a blockconsole down; scheduled from bcon_end_io()
+ * after too many write errors. Removes the panic notifier and the console,
+ * stops the pad timer and the writeback thread, and finally drops a
+ * reference on the blockconsole.
+ */
+static void bcon_unregister(struct work_struct *work)
+{
+ struct blockconsole *bc = container_of(work, struct blockconsole,
+ unregister_work);
+
+ atomic_notifier_chain_unregister(&panic_notifier_list, &bc->panic_block);
+ unregister_console(&bc->console);
+ del_timer_sync(&bc->pad_timer);
+ kthread_stop(bc->writeback_thread);
+ /* No new io will be scheduled anymore now */
+ bcon_put(bc);
+}
+
+/* Error count at which the console gets unregistered (see bcon_end_io()) */
+#define BCON_MAX_ERRORS 10
+/*
+ * Completion handler for the sector writes issued by bcon_writesector().
+ * Does the error accounting, resets the sector to padding, marks the bio as
+ * idle again and drops the reference taken in bcon_writesector().
+ */
+static void bcon_end_io(struct bio *bio, int err)
+{
+ struct bcon_bio *bcon_bio = container_of(bio, struct bcon_bio, bio);
+ struct blockconsole *bc = bio->bi_private;
+ unsigned long flags;
+
+ /*
+ * We want to assume the device broken and free this console if
+ * we accumulate too many errors. But if errors are transient,
+ * we also want to forget about them once writes succeed again.
+ * Oh, and we only want to reset the counter if it hasn't reached
+ * the limit yet, so we don't bcon_put() twice from here.
+ */
+ spin_lock_irqsave(&bc->end_io_lock, flags);
+ if (err) {
+ /* post-increment: the work is scheduled once, when the count passes the limit */
+ if (bc->error_count++ == BCON_MAX_ERRORS) {
+ pr_info("no longer logging to %s\n", bc->devname);
+ schedule_work(&bc->unregister_work);
+ }
+ } else {
+ if (bc->error_count && bc->error_count < BCON_MAX_ERRORS)
+ bc->error_count = 0;
+ }
+ /*
+ * Add padding (a bunch of spaces and a newline) early so bcon_pad
+ * only has to advance a pointer.
+ */
+ clear_sector(bcon_bio->sector);
+ bcon_bio->in_flight = 0;
+ spin_unlock_irqrestore(&bc->end_io_lock, flags);
+ bcon_put(bc);
+}
+
+/*
+ * Submit cache sector @index for writing to the device at the current
+ * writeback position (bc->write_bytes). Does nothing if the bio for that
+ * sector is still in flight. A reference on @bc is held for the duration of
+ * the write and released in bcon_end_io().
+ */
+static void bcon_writesector(struct blockconsole *bc, int index)
+{
+ struct bcon_bio *bcon_bio = bc->bio_array + index;
+ struct bio *bio = &bcon_bio->bio;
+
+ rmb();
+ if (bcon_bio->in_flight)
+ return;
+ bcon_get(bc);
+
+ /* bio_init() resets the bio, so every field is set up again per write */
+ bio_init(bio);
+ bio->bi_io_vec = &bcon_bio->bvec;
+ bio->bi_vcnt = 1;
+ bio->bi_size = SECTOR_SIZE;
+ bio->bi_bdev = bc->bdev;
+ bio->bi_private = bc;
+ bio->bi_end_io = bcon_end_io;
+
+ bio->bi_idx = 0;
+ bio->bi_sector = bc->write_bytes >> 9;
+ bcon_bio->in_flight = 1;
+ /* make in_flight visible before the bio is handed to the block layer */
+ wmb();
+ submit_bio(WRITE, bio);
+}
+
+/*
+ * Body of the writeback thread. Raises the calling task to the highest
+ * SCHED_FIFO priority, then sleeps until woken and submits every completely
+ * filled cache sector between the writeback and the console position. Each
+ * time the writeback position wraps to cache sector 0 the next segment on
+ * the device is erased. Returns 0 once kthread_should_stop() reports true.
+ */
+static int bcon_writeback(void *_bc)
+{
+	struct blockconsole *bc = _bc;
+	/* Highest realtime prio */
+	struct sched_param sp = { .sched_priority = MAX_RT_PRIO - 1 };
+
+	sched_setscheduler_nocheck(current, SCHED_FIFO, &sp);
+	do {
+		set_current_state(TASK_INTERRUPTIBLE);
+		schedule();
+		if (kthread_should_stop())
+			break;
+		while (bcon_write_sector(bc) != bcon_console_sector(bc)) {
+			bcon_writesector(bc, bcon_write_sector(bc));
+			bcon_advance_write_bytes(bc, SECTOR_SIZE);
+			if (!bcon_write_sector(bc))
+				bcon_erase_segment(bc);
+		}
+	} while (1);
+
+	return 0;
+}
+
+/*
+ * Timer callback, armed by bcon_write(): if the current sector is only
+ * partially filled, skip to the start of the next one and wake the
+ * writeback thread so the partial sector reaches the device.
+ */
+static void bcon_pad(unsigned long data)
+{
+	struct blockconsole *bc = (struct blockconsole *)data;
+	unsigned int remaining = SECTOR_SIZE - bcon_console_ofs(bc);
+
+	/*
+	 * We deliberately race against bcon_write here. If we lose the race,
+	 * our padding is no longer where we expected it to be, i.e. it is
+	 * no longer a bunch of spaces with a newline at the end. There could
+	 * not be a newline at all or it could be somewhere in the middle.
+	 * Either way, the log corruption is fairly obvious to spot and ignore
+	 * for human readers.
+	 */
+	if (remaining == SECTOR_SIZE)
+		return;	/* already at a sector boundary, nothing to pad */
+
+	bcon_advance_console_bytes(bc, remaining);
+	wake_up_process(bc->writeback_thread);
+}
+
+/*
+ * Console write callback: copy @len bytes of @msg into the cache, one
+ * sector-sized piece at a time, advancing the console position as it goes.
+ * If the sector to be filled is still being written to the device, the
+ * remainder of the message is dropped. Afterwards the writeback thread is
+ * woken and the pad timer is (re)armed to fire one second later.
+ */
+static void bcon_write(struct console *console, const char *msg,
+ unsigned int len)
+{
+ struct blockconsole *bc = container_of(console, struct blockconsole,
+ console);
+ unsigned int n;
+ u64 console_bytes;
+ int i;
+
+ while (len) {
+ /* work on a snapshot; bcon_pad() may move console_bytes concurrently */
+ console_bytes = bc->console_bytes;
+ i = __bcon_console_sector(console_bytes);
+ rmb();
+ if (bc->bio_array[i].in_flight)
+ break;
+ /* never copy across a sector boundary in one step */
+ n = min_t(int, len, SECTOR_SIZE -
+ __bcon_console_ofs(console_bytes));
+ memcpy(bc->bio_array[i].sector +
+ __bcon_console_ofs(console_bytes), msg, n);
+ len -= n;
+ msg += n;
+ bcon_advance_console_bytes(bc, n);
+ }
+ wake_up_process(bc->writeback_thread);
+ mod_timer(&bc->pad_timer, jiffies + HZ);
+}
+
+/*
+ * Prepare the per-sector bios: point each one at its 512-byte slice of the
+ * cache pages, fill that slice with padding and mark the bio as idle.
+ */
+static void bcon_init_bios(struct blockconsole *bc)
+{
+	int i;
+
+	for (i = 0; i < SECTOR_COUNT; i++) {
+		struct bcon_bio *bcon_bio = &bc->bio_array[i];
+		/* page holding sector i, and the sector's offset inside it */
+		struct page *page = bc->pages +
+			(i >> (PAGE_SHIFT - SECTOR_SHIFT));
+		unsigned int ofs = SECTOR_SIZE * (i & PG_SECTOR_MASK);
+
+		bcon_bio->in_flight = 0;
+		bcon_bio->sector = page_address(page) + ofs;
+		clear_sector(bcon_bio->sector);
+		bcon_bio->bvec.bv_page = page;
+		bcon_bio->bvec.bv_len = SECTOR_SIZE;
+		bcon_bio->bvec.bv_offset = ofs;
+	}
+}
+
+/*
+ * Zero the shared zero page and prepare the erase bios: every one of them
+ * covers the complete zero page and starts out idle.
+ */
+static void bcon_init_zero_bio(struct blockconsole *bc)
+{
+	int i;
+
+	memset(page_address(bc->zero_page), 0, PAGE_SIZE);
+
+	for (i = 0; i < PAGE_COUNT; i++) {
+		struct bcon_bio *erase_bio = &bc->zero_bios[i];
+
+		erase_bio->in_flight = 0;
+		erase_bio->bvec.bv_page = bc->zero_page;
+		erase_bio->bvec.bv_len = PAGE_SIZE;
+		erase_bio->bvec.bv_offset = 0;
+	}
+}
+
+/*
+ * Panic notifier: pad the current sector to its end (like bcon_pad()) and
+ * then run the writeback loop directly in the calling context.
+ *
+ * NOTE(review): bcon_writeback() changes the scheduling class of current,
+ * calls schedule() and only returns once kthread_should_stop() is true -
+ * confirm that this is safe and terminates when called from a panic
+ * notifier.
+ */
+static int blockconsole_panic(struct notifier_block *this, unsigned long event,
+ void *ptr)
+{
+ struct blockconsole *bc = container_of(this, struct blockconsole,
+ panic_block);
+ unsigned int n;
+
+ n = SECTOR_SIZE - bcon_console_ofs(bc);
+ if (n != SECTOR_SIZE)
+ bcon_advance_console_bytes(bc, n);
+ bcon_writeback(bc);
+ return NOTIFY_DONE;
+}
+
+/*
+ * Set up a blockconsole on the block device named @devname: open the
+ * device, allocate the cache and the zero page, find the end of the existing
+ * log, start the writeback thread and register the console and the panic
+ * notifier.
+ *
+ * Returns 0 on success or a negative errno. On failure everything acquired
+ * so far is released again, including the reference on the block device.
+ */
+static int bcon_create(const char *devname)
+{
+	const fmode_t mode = FMODE_READ | FMODE_WRITE;
+	struct blockconsole *bc;
+	int err;
+
+	bc = kzalloc(sizeof(*bc), GFP_KERNEL);
+	if (!bc)
+		return -ENOMEM;
+	memset(bc->devname, ' ', sizeof(bc->devname));
+	strlcpy(bc->devname, devname, sizeof(bc->devname));
+	spin_lock_init(&bc->end_io_lock);
+	strcpy(bc->console.name, "bcon");
+	bc->console.flags = CON_PRINTBUFFER | CON_ENABLED;
+	bc->console.write = bcon_write;
+	bc->bdev = blkdev_get_by_path(devname, mode, NULL);
+#ifndef MODULE
+	/* No usable path (yet): try to resolve the name to a device number */
+	if (IS_ERR(bc->bdev)) {
+		dev_t devt = name_to_dev_t(devname);
+		if (devt)
+			bc->bdev = blkdev_get_by_dev(devt, mode, NULL);
+	}
+#endif
+	if (IS_ERR(bc->bdev)) {
+		err = PTR_ERR(bc->bdev);
+		goto out_free;
+	}
+
+	err = -ENOMEM;
+	/* order 8 == PAGE_COUNT pages */
+	bc->pages = alloc_pages(GFP_KERNEL, 8);
+	if (!bc->pages)
+		goto out_put_bdev;
+	bc->zero_page = alloc_pages(GFP_KERNEL, 0);
+	if (!bc->zero_page)
+		goto out_free_pages;
+	bcon_init_bios(bc);
+	bcon_init_zero_bio(bc);
+	setup_timer(&bc->pad_timer, bcon_pad, (unsigned long)bc);
+	/*
+	 * Widen the mask before inverting it: where unsigned long has 32
+	 * bits, ~CACHE_MASK would otherwise chop off the upper half of the
+	 * device size.
+	 */
+	bc->max_bytes = bc->bdev->bd_inode->i_size & ~(u64)CACHE_MASK;
+	if (!bc->max_bytes) {
+		/* The device cannot even hold a single cache-sized segment */
+		err = -EINVAL;
+		goto out_free_zero;
+	}
+	err = bcon_find_end_of_log(bc);
+	if (err)
+		goto out_free_zero;
+	/* Initial reference; it is dropped by bcon_unregister() */
+	kref_init(&bc->kref);
+	bc->writeback_thread = kthread_run(bcon_writeback, bc, "bcon_%s",
+			devname);
+	if (IS_ERR(bc->writeback_thread)) {
+		err = PTR_ERR(bc->writeback_thread);
+		goto out_free_zero;
+	}
+	INIT_WORK(&bc->unregister_work, bcon_unregister);
+	register_console(&bc->console);
+	bc->panic_block.notifier_call = blockconsole_panic;
+	bc->panic_block.priority = INT_MAX;
+	atomic_notifier_chain_register(&panic_notifier_list, &bc->panic_block);
+	pr_info("now logging to %s at %llx\n", devname, bc->console_bytes >> 20);
+
+	return 0;
+
+out_free_zero:
+	__free_pages(bc->zero_page, 0);
+out_free_pages:
+	__free_pages(bc->pages, 8);
+out_put_bdev:
+	/* Without this the device stays open after every failed attempt */
+	blkdev_put(bc->bdev, mode);
+out_free:
+	kfree(bc);
+	return err;
+}
+
+/*
+ * Try to create a blockconsole for @name as given; if that fails, retry
+ * once with "/dev/" prepended to the name. Failures are not reported.
+ */
+static void bcon_create_fuzzy(const char *name)
+{
+	char *longname;
+
+	if (!bcon_create(name))
+		return;
+
+	/* 6 extra bytes: the five characters of "/dev/" plus the NUL */
+	longname = kzalloc(strlen(name) + 6, GFP_KERNEL);
+	if (!longname)
+		return;
+
+	strcpy(longname, "/dev/");
+	strcat(longname, name);
+	bcon_create(longname);
+	kfree(longname);
+}
+
+/*
+ * Comma-separated list of device names queued by bcon_add() and consumed by
+ * bcon_do_add(); device_lock protects the buffer.
+ */
+static DEFINE_SPINLOCK(device_lock);
+static char scanned_devices[80];
+
+/*
+ * Work item: take a private copy of the queued device names, empty the
+ * shared buffer, and create a blockconsole for every comma-separated name.
+ */
+static void bcon_do_add(struct work_struct *work)
+{
+	char local_devices[80];
+	char *cursor = local_devices;
+
+	/* Hold the lock only for the copy; creating consoles may sleep */
+	spin_lock(&device_lock);
+	memcpy(local_devices, scanned_devices, sizeof(local_devices));
+	memset(scanned_devices, 0, sizeof(scanned_devices));
+	spin_unlock(&device_lock);
+
+	/* strsep() sets cursor to NULL after the last name */
+	while (cursor && cursor[0])
+		bcon_create_fuzzy(strsep(&cursor, ","));
+}
+
+/* Work item that runs bcon_do_add(); scheduled by bcon_add() */
+DECLARE_WORK(bcon_add_work, bcon_do_add);
+
+/*
+ * Queue the device called @name for use as a blockconsole. The actual setup
+ * happens later in bcon_do_add(), from a workqueue.
+ *
+ * A name that no longer fits into the queue buffer is dropped with a
+ * warning instead of being written past the end of the buffer.
+ */
+void bcon_add(const char *name)
+{
+	size_t used, len = strlen(name);
+	bool dropped = false;
+
+	/*
+	 * We add each name to a small static buffer and ask for a workqueue
+	 * to go pick it up asap. Once it is picked up, the buffer is empty
+	 * again, so hopefully it will suffice for all sane users.
+	 */
+	spin_lock(&device_lock);
+	used = strlen(scanned_devices);
+	/* Room needed: separating comma (if any), the name, trailing NUL */
+	if (used + (used ? 1 : 0) + len + 1 > sizeof(scanned_devices)) {
+		dropped = true;
+	} else {
+		if (used)
+			scanned_devices[used++] = ',';
+		memcpy(scanned_devices + used, name, len + 1);
+	}
+	spin_unlock(&device_lock);
+
+	if (dropped)
+		pr_warn("device list full, ignoring %s\n", name);
+	schedule_work(&bcon_add_work);
+}
+
+/*
+ * Return true if @data starts with exactly eight hexadecimal digits that
+ * are immediately followed by a newline, false otherwise.
+ */
+static bool is_four_byte_hex(const void *data)
+{
+	const char *str = data;
+	int i;
+
+	for (i = 0; i < 8; i++) {
+		if (!isxdigit(str[i]))
+			return false;
+	}
+
+	/* a ninth hex digit or anything else but \n makes it invalid */
+	return str[8] == '\n';
+}
+
+/*
+ * Check whether @data starts with a blockconsole header.
+ *
+ * Returns 10 if the old (version 1.0) magic is found, 11 if the current
+ * magic is followed by well-formed uuid, round and tile fields, and 0 in
+ * every other case.
+ */
+int bcon_magic_present(const void *data)
+{
+	const char *header = data;
+	size_t magic_len = strlen(BLOCKCONSOLE_MAGIC);
+
+	if (memcmp(header, BLOCKCONSOLE_MAGIC_OLD, magic_len) == 0)
+		return 10;
+	if (memcmp(header, BLOCKCONSOLE_MAGIC, magic_len) != 0)
+		return 0;
+
+	if (is_four_byte_hex(header + BCON_UUID_OFS) &&
+	    is_four_byte_hex(header + BCON_ROUND_OFS) &&
+	    is_four_byte_hex(header + BCON_TILE_OFS))
+		return 11;
+
+	return 0;
+}