From: Stephen Rothwell <sfr@canb.auug.org.au>
Date: Mon, 7 Nov 2011 02:27:35 +0000 (+1100)
Subject: Merge remote-tracking branch 'moduleh/for-sfr'
X-Git-Tag: next-20111107~3
X-Git-Url: https://git.karo-electronics.de/?a=commitdiff_plain;h=bb063136806b43ea90f746c16e531b72ed5de139;p=karo-tx-linux.git

Merge remote-tracking branch 'moduleh/for-sfr'

Conflicts:
	drivers/media/dvb/frontends/dibx000_common.c
	drivers/media/video/mt9m111.c
	drivers/media/video/ov6650.c
	drivers/mfd/ab3550-core.c
	include/linux/dmaengine.h
---

bb063136806b43ea90f746c16e531b72ed5de139
diff --cc arch/arm/mach-omap2/voltage.c
index 64070ac1e761,e964cfd3a3d0..1f8fdf736e63
--- a/arch/arm/mach-omap2/voltage.c
+++ b/arch/arm/mach-omap2/voltage.c
@@@ -21,10 -21,11 +21,11 @@@
  
  #include <linux/delay.h>
  #include <linux/io.h>
 -#include <linux/clk.h>
  #include <linux/err.h>
+ #include <linux/export.h>
  #include <linux/debugfs.h>
  #include <linux/slab.h>
 +#include <linux/clk.h>
  
  #include <plat/common.h>
  
diff --cc arch/arm/plat-samsung/dev-backlight.c
index 2adbeaed4c04,a976c023b286..e657305644cc
--- a/arch/arm/plat-samsung/dev-backlight.c
+++ b/arch/arm/plat-samsung/dev-backlight.c
@@@ -12,9 -12,9 +12,10 @@@
  
  #include <linux/gpio.h>
  #include <linux/platform_device.h>
+ #include <linux/slab.h>
  #include <linux/io.h>
  #include <linux/pwm_backlight.h>
 +#include <linux/slab.h>
  
  #include <plat/devs.h>
  #include <plat/gpio-cfg.h>
diff --cc arch/arm/plat-samsung/dma-ops.c
index 6e3d9abc9e2e,000000000000..93a994a5dd8f
mode 100644,000000..100644
--- a/arch/arm/plat-samsung/dma-ops.c
+++ b/arch/arm/plat-samsung/dma-ops.c
@@@ -1,131 -1,0 +1,132 @@@
 +/* linux/arch/arm/plat-samsung/dma-ops.c
 + *
 + * Copyright (c) 2011 Samsung Electronics Co., Ltd.
 + *		http://www.samsung.com
 + *
 + * Samsung DMA Operations
 + *
 + * This program is free software; you can redistribute it and/or modify
 + * it under the terms of the GNU General Public License version 2 as
 + * published by the Free Software Foundation.
 + */
 +
 +#include <linux/kernel.h>
 +#include <linux/errno.h>
 +#include <linux/amba/pl330.h>
 +#include <linux/scatterlist.h>
++#include <linux/export.h>
 +
 +#include <mach/dma.h>
 +
 +/*
 + * Channel filter handed to dma_request_channel(): accept @chan only when
 + * the peri_id found in its private dma_pl330_peri data equals the id that
 + * the caller passed through @param as an integer cast to a pointer.
 + */
 +static inline bool pl330_filter(struct dma_chan *chan, void *param)
 +{
 +	const unsigned wanted_id = (unsigned)param;
 +	struct dma_pl330_peri *pl330_peri = chan->private;
 +
 +	if (pl330_peri->peri_id == wanted_id)
 +		return true;
 +
 +	return false;
 +}
 +
 +/*
 + * Request a dmaengine channel for peripheral @dma_ch and, for the two
 + * slave directions, program the FIFO address and bus width from @info.
 + *
 + * Returns the struct dma_chan pointer cast to an unsigned handle (the
 + * other samsung_dmadev_* operations cast it back), or 0 when
 + * dma_request_channel() found no matching channel.
 + */
 +static unsigned samsung_dmadev_request(enum dma_ch dma_ch,
 +				struct samsung_dma_info *info)
 +{
 +	struct dma_chan *chan;
 +	dma_cap_mask_t mask;
 +	struct dma_slave_config slave_config;
 +
 +	dma_cap_zero(mask);
 +	dma_cap_set(info->cap, mask);
 +
 +	/* pl330_filter() compares the id passed here against each channel */
 +	chan = dma_request_channel(mask, pl330_filter, (void *)dma_ch);
 +	if (!chan)
 +		return 0;	/* never hand NULL to dmaengine_slave_config() */
 +
 +	if (info->direction == DMA_FROM_DEVICE) {
 +		/* device -> memory: the FIFO is the source */
 +		memset(&slave_config, 0, sizeof(struct dma_slave_config));
 +		slave_config.direction = info->direction;
 +		slave_config.src_addr = info->fifo;
 +		slave_config.src_addr_width = info->width;
 +		slave_config.src_maxburst = 1;
 +		dmaengine_slave_config(chan, &slave_config);
 +	} else if (info->direction == DMA_TO_DEVICE) {
 +		/* memory -> device: the FIFO is the destination */
 +		memset(&slave_config, 0, sizeof(struct dma_slave_config));
 +		slave_config.direction = info->direction;
 +		slave_config.dst_addr = info->fifo;
 +		slave_config.dst_addr_width = info->width;
 +		slave_config.dst_maxburst = 1;
 +		dmaengine_slave_config(chan, &slave_config);
 +	}
 +
 +	return (unsigned)chan;
 +}
 +
 +/*
 + * Hand the dmaengine channel behind handle @ch back via
 + * dma_release_channel().  @client is not used by this backend.
 + * Always returns 0.
 + */
 +static int samsung_dmadev_release(unsigned ch,
 +			struct s3c2410_dma_client *client)
 +{
 +	struct dma_chan *chan = (struct dma_chan *)ch;
 +
 +	dma_release_channel(chan);
 +	return 0;
 +}
 +
 +/*
 + * Build and submit one descriptor on the channel behind handle @ch.
 + *
 + * DMA_SLAVE transfers are described by a single-entry scatterlist built
 + * from info->buf and info->len; DMA_CYCLIC transfers go through the
 + * driver's cyclic prep hook using info->period.  info->fp and
 + * info->fp_param become the descriptor's completion callback.
 + *
 + * Returns 0 on success, -EFAULT for an unsupported info->cap or when the
 + * driver returned no descriptor.
 + */
 +static int samsung_dmadev_prepare(unsigned ch,
 +			struct samsung_dma_prep_info *info)
 +{
 +	struct scatterlist sg;
 +	struct dma_chan *chan = (struct dma_chan *)ch;
 +	struct dma_async_tx_descriptor *desc;
 +
 +	switch (info->cap) {
 +	case DMA_SLAVE:
 +		sg_init_table(&sg, 1);
 +		sg_dma_len(&sg) = info->len;
 +		sg_set_page(&sg, pfn_to_page(PFN_DOWN(info->buf)),
 +			    info->len, offset_in_page(info->buf));
 +		sg_dma_address(&sg) = info->buf;
 +
 +		desc = chan->device->device_prep_slave_sg(chan,
 +			&sg, 1, info->direction, DMA_PREP_INTERRUPT);
 +		break;
 +	case DMA_CYCLIC:
 +		desc = chan->device->device_prep_dma_cyclic(chan,
 +			info->buf, info->len, info->period, info->direction);
 +		break;
 +	default:
 +		dev_err(&chan->dev->device, "unsupported format\n");
 +		return -EFAULT;
 +	}
 +
 +	if (!desc) {
 +		/* reached from both the slave and the cyclic path */
 +		dev_err(&chan->dev->device, "cannot prepare dma descriptor\n");
 +		return -EFAULT;
 +	}
 +
 +	desc->callback = info->fp;
 +	desc->callback_param = info->fp_param;
 +
 +	dmaengine_submit(desc);
 +
 +	return 0;
 +}
 +
 +/*
 + * Ask the channel behind handle @ch to start its pending descriptors.
 + * Always returns 0.
 + */
 +static inline int samsung_dmadev_trigger(unsigned ch)
 +{
 +	struct dma_chan *chan = (struct dma_chan *)ch;
 +
 +	dma_async_issue_pending(chan);
 +	return 0;
 +}
 +
 +/*
 + * Terminate all transfers on the channel behind handle @ch and return
 + * the result of dmaengine_terminate_all().
 + */
 +static inline int samsung_dmadev_flush(unsigned ch)
 +{
 +	struct dma_chan *chan = (struct dma_chan *)ch;
 +
 +	return dmaengine_terminate_all(chan);
 +}
 +
 +/*
 + * Operations table of the dmaengine-backed implementation, handed out by
 + * samsung_dmadev_get_ops().  There is no "started" hook, and "stop"
 + * shares its implementation with "flush".
 + */
 +struct samsung_dma_ops dmadev_ops = {
 +	.request	= samsung_dmadev_request,
 +	.release	= samsung_dmadev_release,
 +	.prepare	= samsung_dmadev_prepare,
 +	.trigger	= samsung_dmadev_trigger,
 +	.started	= NULL,
 +	.flush		= samsung_dmadev_flush,
 +	.stop		= samsung_dmadev_flush,
 +};
 +
 +/*
 + * Return the dmaengine-backed operations table as an untyped pointer.
 + */
 +void *samsung_dmadev_get_ops(void)
 +{
 +	void *ops = &dmadev_ops;
 +
 +	return ops;
 +}
 +EXPORT_SYMBOL(samsung_dmadev_get_ops);
diff --cc arch/arm/plat-samsung/s3c-dma-ops.c
index 582333c70585,000000000000..781494912827
mode 100644,000000..100644
--- a/arch/arm/plat-samsung/s3c-dma-ops.c
+++ b/arch/arm/plat-samsung/s3c-dma-ops.c
@@@ -1,130 -1,0 +1,131 @@@
 +/* linux/arch/arm/plat-samsung/s3c-dma-ops.c
 + *
 + * Copyright (c) 2011 Samsung Electronics Co., Ltd.
 + *		http://www.samsung.com
 + *
 + * Samsung S3C-DMA Operations
 + *
 + * This program is free software; you can redistribute it and/or modify
 + * it under the terms of the GNU General Public License version 2 as
 + * published by the Free Software Foundation.
 + */
 +
 +#include <linux/kernel.h>
 +#include <linux/errno.h>
 +#include <linux/slab.h>
 +#include <linux/types.h>
++#include <linux/export.h>
 +
 +#include <mach/dma.h>
 +
 +/*
 + * Per-channel bookkeeping: one entry is queued on dma_list by
 + * s3c_dma_request() and removed again by s3c_dma_release().
 + */
 +struct cb_data {
 +	void (*fp) (void *);		/* client callback, run by s3c_dma_cb() */
 +	void *fp_param;			/* argument passed to fp */
 +	unsigned ch;			/* channel this entry belongs to */
 +	struct list_head node;		/* link in dma_list */
 +};
 +
 +static LIST_HEAD(dma_list);		/* all cb_data entries currently allocated */
 +
 +/*
 + * Buffer-done handler installed by s3c_dma_prepare().  @param is the
 + * cb_data that was enqueued with the buffer; its client callback is
 + * invoked with the stored argument.  @channel, @size and @res are unused.
 + */
 +static void s3c_dma_cb(struct s3c2410_dma_chan *channel, void *param,
 +		       int size, enum s3c2410_dma_buffresult res)
 +{
 +	struct cb_data *cb = param;
 +	void (*notify)(void *) = cb->fp;
 +
 +	notify(cb->fp_param);
 +}
 +
 +/*
 + * Claim S3C DMA channel @dma_ch for info->client and configure it
 + * (device direction and FIFO, optional circular mode, transfer width).
 + * A cb_data entry is queued on dma_list so that s3c_dma_prepare() and
 + * s3c_dma_release() can find the channel's callback state later.
 + *
 + * Returns @dma_ch as the channel handle, or 0 if the channel could not
 + * be requested or the bookkeeping entry could not be allocated.
 + *
 + * NOTE(review): a channel whose number is 0 would be indistinguishable
 + * from failure -- confirm against enum dma_ch.
 + */
 +static unsigned s3c_dma_request(enum dma_ch dma_ch,
 +				 struct samsung_dma_info *info)
 +{
 +	struct cb_data *data;
 +
 +	if (s3c2410_dma_request(dma_ch, info->client, NULL) < 0) {
 +		s3c2410_dma_free(dma_ch, info->client);
 +		return 0;
 +	}
 +
 +	data = kzalloc(sizeof(*data), GFP_KERNEL);
 +	if (!data) {
 +		/* Give the channel back rather than dereferencing NULL. */
 +		s3c2410_dma_free(dma_ch, info->client);
 +		return 0;
 +	}
 +	data->ch = dma_ch;
 +	list_add_tail(&data->node, &dma_list);
 +
 +	s3c2410_dma_devconfig(dma_ch, info->direction, info->fifo);
 +
 +	if (info->cap == DMA_CYCLIC)
 +		s3c2410_dma_setflags(dma_ch, S3C2410_DMAF_CIRCULAR);
 +
 +	s3c2410_dma_config(dma_ch, info->width);
 +
 +	return (unsigned)dma_ch;
 +}
 +
 +/*
 + * Release channel @ch: drop its cb_data bookkeeping entry, if one is
 + * queued on dma_list, and free the channel with s3c2410_dma_free().
 + * Always returns 0.
 + */
 +static int s3c_dma_release(unsigned ch, struct s3c2410_dma_client *client)
 +{
 +	struct cb_data *data, *found = NULL;
 +
 +	list_for_each_entry(data, &dma_list, node) {
 +		if (data->ch == ch) {
 +			found = data;
 +			break;
 +		}
 +	}
 +
 +	/*
 +	 * When the loop runs to completion the cursor does not point at a
 +	 * real cb_data, so only unlink an entry that was actually matched.
 +	 */
 +	if (found)
 +		list_del(&found->node);
 +
 +	s3c2410_dma_free(ch, client);
 +	kfree(found);	/* kfree(NULL) is a no-op */
 +
 +	return 0;
 +}
 +
 +/*
 + * Queue one buffer on channel @ch.
 + *
 + * The length enqueued is info->period for DMA_CYCLIC and info->len
 + * otherwise.  The first call for a channel also installs s3c_dma_cb() as
 + * the buffer-done handler and records the client callback from @info;
 + * later calls keep the callback that was recorded first.
 + *
 + * Returns 0 on success, or -EINVAL when @ch has no bookkeeping entry on
 + * dma_list (i.e. it was not obtained through s3c_dma_request()).
 + */
 +static int s3c_dma_prepare(unsigned ch, struct samsung_dma_prep_info *info)
 +{
 +	struct cb_data *data, *found = NULL;
 +	int len = (info->cap == DMA_CYCLIC) ? info->period : info->len;
 +
 +	list_for_each_entry(data, &dma_list, node) {
 +		if (data->ch == ch) {
 +			found = data;
 +			break;
 +		}
 +	}
 +
 +	/* After an unmatched walk the cursor is not a valid cb_data. */
 +	if (!found)
 +		return -EINVAL;
 +
 +	if (!found->fp) {
 +		s3c2410_dma_set_buffdone_fn(ch, s3c_dma_cb);
 +		found->fp = info->fp;
 +		found->fp_param = info->fp_param;
 +	}
 +
 +	/* the cb_data travels as the buffer id and comes back in s3c_dma_cb() */
 +	s3c2410_dma_enqueue(ch, (void *)found, info->buf, len);
 +
 +	return 0;
 +}
 +
 +/* Issue S3C2410_DMAOP_START on channel @ch and return the result. */
 +static inline int s3c_dma_trigger(unsigned ch)
 +{
 +	int ret = s3c2410_dma_ctrl(ch, S3C2410_DMAOP_START);
 +
 +	return ret;
 +}
 +
 +/* Issue S3C2410_DMAOP_STARTED on channel @ch and return the result. */
 +static inline int s3c_dma_started(unsigned ch)
 +{
 +	int ret = s3c2410_dma_ctrl(ch, S3C2410_DMAOP_STARTED);
 +
 +	return ret;
 +}
 +
 +/* Issue S3C2410_DMAOP_FLUSH on channel @ch and return the result. */
 +static inline int s3c_dma_flush(unsigned ch)
 +{
 +	int ret = s3c2410_dma_ctrl(ch, S3C2410_DMAOP_FLUSH);
 +
 +	return ret;
 +}
 +
 +/* Issue S3C2410_DMAOP_STOP on channel @ch and return the result. */
 +static inline int s3c_dma_stop(unsigned ch)
 +{
 +	int ret = s3c2410_dma_ctrl(ch, S3C2410_DMAOP_STOP);
 +
 +	return ret;
 +}
 +
 +/*
 + * Operations table of the S3C-DMA-backed implementation, handed out by
 + * s3c_dma_get_ops().  Every hook has its own implementation here.
 + */
 +static struct samsung_dma_ops s3c_dma_ops = {
 +	.request	= s3c_dma_request,
 +	.release	= s3c_dma_release,
 +	.prepare	= s3c_dma_prepare,
 +	.trigger	= s3c_dma_trigger,
 +	.started	= s3c_dma_started,
 +	.flush		= s3c_dma_flush,
 +	.stop		= s3c_dma_stop,
 +};
 +
 +/*
 + * Return the S3C-DMA-backed operations table as an untyped pointer.
 + */
 +void *s3c_dma_get_ops(void)
 +{
 +	void *ops = &s3c_dma_ops;
 +
 +	return ops;
 +}
 +EXPORT_SYMBOL(s3c_dma_get_ops);
diff --cc arch/microblaze/kernel/dma.c
index dc6416d265d6,b159b8a847d6..65a4af4cbbbe
--- a/arch/microblaze/kernel/dma.c
+++ b/arch/microblaze/kernel/dma.c
@@@ -10,7 -10,9 +10,8 @@@
  #include <linux/dma-mapping.h>
  #include <linux/gfp.h>
  #include <linux/dma-debug.h>
+ #include <linux/export.h>
  #include <asm/bug.h>
 -#include <asm/cacheflush.h>
  
  /*
   * Generic direct DMA implementation
diff --cc arch/sh/kernel/topology.c
index ecc2d3d0f54a,ab37955b453a..4649a6ff0cfe
--- a/arch/sh/kernel/topology.c
+++ b/arch/sh/kernel/topology.c
@@@ -11,9 -11,9 +11,10 @@@
  #include <linux/cpumask.h>
  #include <linux/init.h>
  #include <linux/percpu.h>
 +#include <linux/topology.h>
  #include <linux/node.h>
  #include <linux/nodemask.h>
+ #include <linux/export.h>
  
  static DEFINE_PER_CPU(struct cpu, cpu_devices);
  
diff --cc arch/x86/crypto/aes_glue.c
index b0b6950cc8c8,bdce3eeeaa37..8efcf42a9d7e
--- a/arch/x86/crypto/aes_glue.c
+++ b/arch/x86/crypto/aes_glue.c
@@@ -3,8 -3,8 +3,9 @@@
   *
   */
  
+ #include <linux/module.h>
  #include <crypto/aes.h>
 +#include <asm/aes.h>
  
  asmlinkage void aes_enc_blk(struct crypto_aes_ctx *ctx, u8 *out, const u8 *in);
  asmlinkage void aes_dec_blk(struct crypto_aes_ctx *ctx, u8 *out, const u8 *in);
diff --cc arch/x86/kernel/cpu/mcheck/mce.c
index 864830e1dd65,537c89e00095..362056aefeb4
--- a/arch/x86/kernel/cpu/mcheck/mce.c
+++ b/arch/x86/kernel/cpu/mcheck/mce.c
@@@ -36,7 -36,9 +36,8 @@@
  #include <linux/fs.h>
  #include <linux/mm.h>
  #include <linux/debugfs.h>
 -#include <linux/edac_mce.h>
  #include <linux/irq_work.h>
+ #include <linux/export.h>
  
  #include <asm/processor.h>
  #include <asm/mce.h>
diff --cc drivers/infiniband/core/verbs.c
index 42517500b223,a8923ffc6459..602b1bd723a9
--- a/drivers/infiniband/core/verbs.c
+++ b/drivers/infiniband/core/verbs.c
@@@ -38,8 -38,8 +38,9 @@@
  
  #include <linux/errno.h>
  #include <linux/err.h>
+ #include <linux/export.h>
  #include <linux/string.h>
 +#include <linux/slab.h>
  
  #include <rdma/ib_verbs.h>
  #include <rdma/ib_cache.h>
diff --cc drivers/md/dm-bufio.c
index cb246667dd52,000000000000..0a6806f80ab5
mode 100644,000000..100644
--- a/drivers/md/dm-bufio.c
+++ b/drivers/md/dm-bufio.c
@@@ -1,1699 -1,0 +1,1700 @@@
 +/*
 + * Copyright (C) 2009-2011 Red Hat, Inc.
 + *
 + * Author: Mikulas Patocka <mpatocka@redhat.com>
 + *
 + * This file is released under the GPL.
 + */
 +
 +#include "dm-bufio.h"
 +
 +#include <linux/device-mapper.h>
 +#include <linux/dm-io.h>
 +#include <linux/slab.h>
 +#include <linux/vmalloc.h>
 +#include <linux/version.h>
 +#include <linux/shrinker.h>
++#include <linux/module.h>
 +
 +#define DM_MSG_PREFIX "bufio"
 +
 +/*
 + * Memory management policy:
 + *	Limit the number of buffers to DM_BUFIO_MEMORY_PERCENT of main memory
 + *	or DM_BUFIO_VMALLOC_PERCENT of vmalloc memory (whichever is lower).
 + *	Always allocate at least DM_BUFIO_MIN_BUFFERS buffers.
 + *	Start background writeback when there are DM_BUFIO_WRITEBACK_PERCENT
 + *	dirty buffers.
 + *
 + * DM_BUFIO_MIN_BUFFERS and DM_BUFIO_WRITEBACK_PERCENT are applied per
 + * client in __get_memory_limit(); the writeback percentage is taken of
 + * that client's buffer limit.
 + */
 +#define DM_BUFIO_MIN_BUFFERS		8
 +
 +#define DM_BUFIO_MEMORY_PERCENT		2
 +#define DM_BUFIO_VMALLOC_PERCENT	25
 +#define DM_BUFIO_WRITEBACK_PERCENT	75
 +
 +/*
 + * Check buffer ages in this interval (seconds)
 + */
 +#define DM_BUFIO_WORK_TIMER_SECS	10
 +
 +/*
 + * Free buffers when they are older than this (seconds).
 + * This is the initial value of dm_bufio_max_age.
 + */
 +#define DM_BUFIO_DEFAULT_AGE_SECS	60
 +
 +/*
 + * The number of bvec entries that are embedded directly in the buffer.
 + * If the chunk size is larger than this many pages, dm-io is used to do
 + * the io (see submit_io).
 + */
 +#define DM_BUFIO_INLINE_VECS		16
 +
 +/*
 + * Buffer hash: XOR the block number with itself shifted right by
 + * DM_BUFIO_HASH_BITS and keep the low DM_BUFIO_HASH_BITS bits.  The
 + * result indexes dm_bufio_client->cache_hash.
 + */
 +#define DM_BUFIO_HASH_BITS	20
 +#define DM_BUFIO_HASH(block) \
 +	((((block) >> DM_BUFIO_HASH_BITS) ^ (block)) & \
 +	 ((1 << DM_BUFIO_HASH_BITS) - 1))
 +
 +/*
 + * Don't try to use kmem_cache_alloc for blocks larger than
 + * DM_BUFIO_BLOCK_SIZE_SLAB_LIMIT (half a page), and don't try to use
 + * __get_free_pages for blocks larger than DM_BUFIO_BLOCK_SIZE_GFP_LIMIT.
 + * For explanation, see alloc_buffer_data below.
 + */
 +#define DM_BUFIO_BLOCK_SIZE_SLAB_LIMIT	(PAGE_SIZE >> 1)
 +#define DM_BUFIO_BLOCK_SIZE_GFP_LIMIT	(PAGE_SIZE << (MAX_ORDER - 1))
 +
 +/*
 + * dm_buffer->list_mode: index into dm_bufio_client->lru[] and
 + * ->n_buffers[].  LIST_SIZE is the number of lists.
 + */
 +#define LIST_CLEAN	0
 +#define LIST_DIRTY	1
 +#define LIST_SIZE	2
 +
 +/*
 + * Linking of buffers:
 + *	All buffers are linked to cache_hash with their hash_list field.
 + *
 + *	Clean buffers that are not being written (B_WRITING not set)
 + *	are linked to lru[LIST_CLEAN] with their lru_list field.
 + *
 + *	Dirty and clean buffers that are being written are linked to
 + *	lru[LIST_DIRTY] with their lru_list field. When the write
 + *	finishes, the buffer cannot be relinked immediately (because we
 + *	are in an interrupt context and relinking requires process
 + *	context), so some clean-not-writing buffers can be held on
 + *	dirty_lru too.  They are later added to lru in the process
 + *	context.
 + */
 +struct dm_bufio_client {
 +	struct mutex lock;	/* taken via dm_bufio_lock()/dm_bufio_unlock() */
 +
 +	struct list_head lru[LIST_SIZE];	/* indexed by LIST_CLEAN / LIST_DIRTY */
 +	unsigned long n_buffers[LIST_SIZE];	/* buffers on each lru[] list */
 +
 +	struct block_device *bdev;	/* device that buffer I/O is issued to */
 +	unsigned block_size;		/* buffer size in bytes */
 +	unsigned char sectors_per_block_bits;	/* block << this = first sector */
 +	unsigned char pages_per_block_bits;	/* order for __get_free_pages() */
 +	unsigned char blocks_per_page_bits;	/* see dm_bufio_cache_index() */
 +	unsigned aux_size;		/* extra bytes after each struct dm_buffer */
 +	void (*alloc_callback)(struct dm_buffer *);	/* run by __alloc_buffer_wait() */
 +	void (*write_callback)(struct dm_buffer *);	/* run by submit_io() on WRITE */
 +
 +	struct dm_io_client *dm_io;	/* dm-io client used by use_dmio() */
 +
 +	struct list_head reserved_buffers;	/* fallback when allocation fails */
 +	unsigned need_reserved_buffers;	/* buffers owed back to the reserve */
 +
 +	struct hlist_head *cache_hash;	/* buckets indexed by DM_BUFIO_HASH() */
 +	wait_queue_head_t free_buffer_wait;	/* woken by __free_buffer_wake() */
 +
 +	int async_write_error;	/* first error seen by write_endio(), else 0 */
 +
 +	struct list_head client_list;	/* presumably a link in dm_bufio_all_clients -- confirm */
 +	struct shrinker shrinker;
 +};
 +
 +/*
 + * Buffer state bits: bit numbers within dm_buffer->state, used with
 + * test_bit()/clear_bit() and the wait_on_bit() family.
 + */
 +#define B_READING	0
 +#define B_WRITING	1
 +#define B_DIRTY		2
 +
 +/*
 + * Describes how the block was allocated:
 + * kmem_cache_alloc(), __get_free_pages() or vmalloc().
 + * See the comment at alloc_buffer_data.
 + *
 + * The values also index the per-allocator counters in
 + * adjust_total_allocated(), which is sized by DATA_MODE_LIMIT.
 + */
 +enum data_mode {
 +	DATA_MODE_SLAB = 0,		/* kmem_cache_alloc() */
 +	DATA_MODE_GET_FREE_PAGES = 1,	/* __get_free_pages() */
 +	DATA_MODE_VMALLOC = 2,		/* __vmalloc() */
 +	DATA_MODE_LIMIT = 3		/* number of modes, not a mode itself */
 +};
 +
 +/*
 + * One cached block.  alloc_buffer() allocates the structure together
 + * with c->aux_size extra bytes and a separate data area.
 + */
 +struct dm_buffer {
 +	struct hlist_node hash_list;	/* link in c->cache_hash[] */
 +	struct list_head lru_list;	/* link in c->lru[] or c->reserved_buffers */
 +	sector_t block;			/* block number this buffer holds */
 +	void *data;			/* data area, see alloc_buffer_data() */
 +	enum data_mode data_mode;	/* how data was allocated */
 +	unsigned char list_mode;		/* LIST_* */
 +	unsigned hold_count;		/* non-zero buffers are never reclaimed */
 +	int read_error;			/* presumably the last read's result -- confirm */
 +	int write_error;		/* last write's result, set by write_endio() */
 +	unsigned long state;		/* B_* bits */
 +	unsigned long last_accessed;	/* jiffies, set in __link_buffer() */
 +	struct dm_bufio_client *c;	/* owning client */
 +	struct bio bio;			/* bio used for I/O on this buffer */
 +	struct bio_vec bio_vec[DM_BUFIO_INLINE_VECS];	/* vecs for the inline bio */
 +};
 +
 +/*----------------------------------------------------------------*/
 +
 +/*
 + * Slab caches for sub-page block sizes and their names, both indexed by
 + * dm_bufio_cache_index().
 + */
 +static struct kmem_cache *dm_bufio_caches[PAGE_SHIFT - SECTOR_SHIFT];
 +static char *dm_bufio_cache_names[PAGE_SHIFT - SECTOR_SHIFT];
 +
 +/*
 + * Map a client to its slot in dm_bufio_caches[]: blocks_per_page_bits
 + * minus one.  A slot outside the array (including the wrap-around when
 + * blocks_per_page_bits is 0) is a BUG.
 + */
 +static inline int dm_bufio_cache_index(struct dm_bufio_client *c)
 +{
 +	unsigned idx = c->blocks_per_page_bits;
 +
 +	idx--;
 +	BUG_ON(idx >= ARRAY_SIZE(dm_bufio_caches));
 +
 +	return idx;
 +}
 +
 +/* The slab cache, and its name, that belong to client c's block size. */
 +#define DM_BUFIO_CACHE(c)	(dm_bufio_caches[dm_bufio_cache_index(c)])
 +#define DM_BUFIO_CACHE_NAME(c)	(dm_bufio_cache_names[dm_bufio_cache_index(c)])
 +
 +/* 1 when the current task has a bio_list set, 0 otherwise. */
 +#define dm_bufio_in_request()	(!!current->bio_list)
 +
 +/*
 + * Take the client's mutex.  dm_bufio_in_request() is passed as the
 + * nesting subclass argument of mutex_lock_nested().
 + */
 +static void dm_bufio_lock(struct dm_bufio_client *c)
 +{
 +	mutex_lock_nested(&c->lock, dm_bufio_in_request());
 +}
 +
 +/*
 + * Try to take the client's mutex without sleeping; returns whatever
 + * mutex_trylock() returns.
 + */
 +static int dm_bufio_trylock(struct dm_bufio_client *c)
 +{
 +	return mutex_trylock(&c->lock);
 +}
 +
 +/*
 + * Release the client's mutex.
 + */
 +static void dm_bufio_unlock(struct dm_bufio_client *c)
 +{
 +	mutex_unlock(&c->lock);
 +}
 +
 +/*
 + * Rescheduling point used inside dm-bufio's loops.  With
 + * CONFIG_PREEMPT_VOLUNTARY it calls _cond_resched() when need_resched()
 + * is set; in every other configuration it expands to nothing.
 + *
 + * FIXME Move to sched.h?
 + */
 +#ifdef CONFIG_PREEMPT_VOLUNTARY
 +#  define dm_bufio_cond_resched()		\
 +do {						\
 +	if (unlikely(need_resched()))		\
 +		_cond_resched();		\
 +} while (0)
 +#else
 +#  define dm_bufio_cond_resched()                do { } while (0)
 +#endif
 +
 +/*----------------------------------------------------------------*/
 +
 +/*
 + * Default cache size: available memory divided by the ratio.
 + * Substituted by __cache_size_refresh() when dm_bufio_cache_size is 0.
 + */
 +static unsigned long dm_bufio_default_cache_size;
 +
 +/*
 + * Total cache size set by the user.
 + */
 +static unsigned long dm_bufio_cache_size;
 +
 +/*
 + * A copy of dm_bufio_cache_size because dm_bufio_cache_size can change
 + * at any time.  If it disagrees, the user has changed cache size.
 + * Refreshed by __cache_size_refresh().
 + */
 +static unsigned long dm_bufio_cache_size_latch;
 +
 +static DEFINE_SPINLOCK(param_spinlock);	/* held while the dm_bufio_*allocated* counters change */
 +
 +/*
 + * Buffers are freed after this timeout
 + */
 +static unsigned dm_bufio_max_age = DM_BUFIO_DEFAULT_AGE_SECS;
 +
 +static unsigned long dm_bufio_peak_allocated;		/* highest dm_bufio_current_allocated seen */
 +static unsigned long dm_bufio_allocated_kmem_cache;	/* bytes in DATA_MODE_SLAB buffers */
 +static unsigned long dm_bufio_allocated_get_free_pages;	/* bytes in DATA_MODE_GET_FREE_PAGES buffers */
 +static unsigned long dm_bufio_allocated_vmalloc;	/* bytes in DATA_MODE_VMALLOC buffers */
 +static unsigned long dm_bufio_current_allocated;	/* sum over all three modes */
 +
 +/*----------------------------------------------------------------*/
 +
 +/*
 + * Per-client cache: dm_bufio_cache_size_latch / dm_bufio_client_count
 + * (a count of 0 is treated as 1), computed by __cache_size_refresh().
 + */
 +static unsigned long dm_bufio_cache_size_per_client;
 +
 +/*
 + * The current number of clients.
 + */
 +static int dm_bufio_client_count;
 +
 +/*
 + * The list of all clients.
 + */
 +static LIST_HEAD(dm_bufio_all_clients);
 +
 +/*
 + * This mutex protects dm_bufio_cache_size_latch,
 + * dm_bufio_cache_size_per_client and dm_bufio_client_count
 + */
 +static DEFINE_MUTEX(dm_bufio_clients_lock);
 +
 +/*----------------------------------------------------------------*/
 +
 +/*
 + * Account @diff bytes (negative when freeing) against the counter that
 + * belongs to @data_mode and against the running total, and raise the
 + * recorded peak if the total now exceeds it.  All updates happen under
 + * param_spinlock.
 + */
 +static void adjust_total_allocated(enum data_mode data_mode, long diff)
 +{
 +	static unsigned long * const counters[DATA_MODE_LIMIT] = {
 +		[DATA_MODE_SLAB]		= &dm_bufio_allocated_kmem_cache,
 +		[DATA_MODE_GET_FREE_PAGES]	= &dm_bufio_allocated_get_free_pages,
 +		[DATA_MODE_VMALLOC]		= &dm_bufio_allocated_vmalloc,
 +	};
 +	unsigned long *mode_counter = counters[data_mode];
 +
 +	spin_lock(&param_spinlock);
 +
 +	*mode_counter += diff;
 +	dm_bufio_current_allocated += diff;
 +
 +	if (dm_bufio_peak_allocated < dm_bufio_current_allocated)
 +		dm_bufio_peak_allocated = dm_bufio_current_allocated;
 +
 +	spin_unlock(&param_spinlock);
 +}
 +
 +/*
 + * Re-read the user-settable cache size and recalculate the per-client
 + * limit from it and the current number of clients.
 + *
 + * The caller must hold dm_bufio_clients_lock (checked with BUG_ON).
 + */
 +static void __cache_size_refresh(void)
 +{
 +	BUG_ON(!mutex_is_locked(&dm_bufio_clients_lock));
 +	BUG_ON(dm_bufio_client_count < 0);
 +
 +	/* Work on a snapshot; dm_bufio_cache_size can change at any time. */
 +	dm_bufio_cache_size_latch = dm_bufio_cache_size;
 +
 +	barrier();
 +
 +	/*
 +	 * Use default if set to 0 and report the actual cache size used.
 +	 * The cmpxchg only stores the default while the variable is still 0.
 +	 */
 +	if (!dm_bufio_cache_size_latch) {
 +		(void)cmpxchg(&dm_bufio_cache_size, 0,
 +			      dm_bufio_default_cache_size);
 +		dm_bufio_cache_size_latch = dm_bufio_default_cache_size;
 +	}
 +
 +	/* "? : 1" avoids dividing by zero when there are no clients */
 +	dm_bufio_cache_size_per_client = dm_bufio_cache_size_latch /
 +					 (dm_bufio_client_count ? : 1);
 +}
 +
 +/*
 + * Allocate the data area for one buffer of client @c.
 + *
 + * One of three allocators is used, and the choice is reported through
 + * @data_mode so that free_buffer_data() can undo it:
 + *
 + *   - kmem_cache, for blocks up to DM_BUFIO_BLOCK_SIZE_SLAB_LIMIT, so
 + *     that small buffers use space optimally;
 + *
 + *   - __get_free_pages, for larger blocks when the allocation may fail
 + *     (__GFP_NORETRY is set) and the size does not exceed
 + *     DM_BUFIO_BLOCK_SIZE_GFP_LIMIT.  It can randomly fail if the memory
 + *     is fragmented, but that has no fatal effect here: it just causes
 + *     flushes of some other buffers and more I/O.  It is not used when
 + *     it would always fail (i.e. order >= MAX_ORDER);
 + *
 + *   - __vmalloc, otherwise.  It won't randomly fail, but vmalloc space
 + *     is limited (it may be as low as 128M), so using it for caching is
 + *     not appropriate.  It is meant for allocations that shouldn't fail,
 + *     i.e. only the initial reserve allocation, so there's no risk of
 + *     wasting all vmalloc space.
 + */
 +static void *alloc_buffer_data(struct dm_bufio_client *c, gfp_t gfp_mask,
 +			       enum data_mode *data_mode)
 +{
 +	const unsigned size = c->block_size;
 +	void *data;
 +
 +	if (size <= DM_BUFIO_BLOCK_SIZE_SLAB_LIMIT) {
 +		*data_mode = DATA_MODE_SLAB;
 +		data = kmem_cache_alloc(DM_BUFIO_CACHE(c), gfp_mask);
 +	} else if ((gfp_mask & __GFP_NORETRY) &&
 +		   size <= DM_BUFIO_BLOCK_SIZE_GFP_LIMIT) {
 +		*data_mode = DATA_MODE_GET_FREE_PAGES;
 +		data = (void *)__get_free_pages(gfp_mask,
 +						c->pages_per_block_bits);
 +	} else {
 +		*data_mode = DATA_MODE_VMALLOC;
 +		data = __vmalloc(size, gfp_mask, PAGE_KERNEL);
 +	}
 +
 +	return data;
 +}
 +
 +/*
 + * Release a data area with the allocator that @data_mode says produced
 + * it.  An unknown mode is reported and then treated as a BUG.
 + */
 +static void free_buffer_data(struct dm_bufio_client *c,
 +			     void *data, enum data_mode data_mode)
 +{
 +	if (data_mode == DATA_MODE_SLAB) {
 +		kmem_cache_free(DM_BUFIO_CACHE(c), data);
 +	} else if (data_mode == DATA_MODE_GET_FREE_PAGES) {
 +		free_pages((unsigned long)data, c->pages_per_block_bits);
 +	} else if (data_mode == DATA_MODE_VMALLOC) {
 +		vfree(data);
 +	} else {
 +		DMCRIT("dm_bufio_free_buffer_data: bad data mode: %d",
 +		       data_mode);
 +		BUG();
 +	}
 +}
 +
 +/*
 + * Allocate a dm_buffer (plus the client's aux_size trailing bytes) and
 + * its data area, and account the data area in the global counters.
 + * Returns NULL if either allocation fails; nothing is leaked then.
 + */
 +static struct dm_buffer *alloc_buffer(struct dm_bufio_client *c, gfp_t gfp_mask)
 +{
 +	struct dm_buffer *buf;
 +
 +	buf = kmalloc(sizeof(*buf) + c->aux_size, gfp_mask);
 +	if (buf == NULL)
 +		return NULL;
 +
 +	buf->c = c;
 +	buf->data = alloc_buffer_data(c, gfp_mask, &buf->data_mode);
 +	if (buf->data != NULL) {
 +		adjust_total_allocated(buf->data_mode, (long)c->block_size);
 +		return buf;
 +	}
 +
 +	kfree(buf);
 +	return NULL;
 +}
 +
 +/*
 + * Undo alloc_buffer(): take the data area out of the global counters,
 + * release it, then release the dm_buffer itself.
 + */
 +static void free_buffer(struct dm_buffer *b)
 +{
 +	struct dm_bufio_client *client = b->c;
 +	long size = (long)client->block_size;
 +
 +	adjust_total_allocated(b->data_mode, -size);
 +	free_buffer_data(client, b->data, b->data_mode);
 +
 +	kfree(b);
 +}
 +
 +/*
 + * Start tracking a buffer as holding @block: put it at the head of the
 + * clean or dirty LRU list selected by @dirty, add it to the hash bucket
 + * for @block, and stamp its last-access time.
 + */
 +static void __link_buffer(struct dm_buffer *b, sector_t block, int dirty)
 +{
 +	struct dm_bufio_client *client = b->c;
 +	struct hlist_head *bucket = &client->cache_hash[DM_BUFIO_HASH(block)];
 +
 +	client->n_buffers[dirty]++;
 +
 +	b->block = block;
 +	b->list_mode = dirty;
 +
 +	list_add(&b->lru_list, &client->lru[dirty]);
 +	hlist_add_head(&b->hash_list, bucket);
 +
 +	b->last_accessed = jiffies;
 +}
 +
 +/*
 + * Stop tracking a buffer: remove it from its hash bucket and from the
 + * LRU list it is on, and decrement that list's buffer count (which must
 + * not already be zero).
 + */
 +static void __unlink_buffer(struct dm_buffer *b)
 +{
 +	unsigned long *count = &b->c->n_buffers[b->list_mode];
 +
 +	BUG_ON(!*count);
 +	(*count)--;
 +
 +	hlist_del(&b->hash_list);
 +	list_del(&b->lru_list);
 +}
 +
 +/*
 + * Move the buffer to the head of the LRU list selected by @dirty,
 + * adjusting the per-list buffer counts and the buffer's list_mode.
 + */
 +static void __relink_lru(struct dm_buffer *b, int dirty)
 +{
 +	struct dm_bufio_client *client = b->c;
 +	unsigned long *counts = client->n_buffers;
 +
 +	BUG_ON(!counts[b->list_mode]);
 +
 +	counts[b->list_mode]--;
 +	counts[dirty]++;
 +	b->list_mode = dirty;
 +
 +	/* unlink and re-add at the head in one step */
 +	list_move(&b->lru_list, &client->lru[dirty]);
 +}
 +
 +/*----------------------------------------------------------------
 + * Submit I/O on the buffer.
 + *
 + * Bio interface is faster but it has some problems:
 + *	the vector list is limited (increasing this limit increases
 + *	memory-consumption per buffer, so it is not viable);
 + *
 + *	the memory must be direct-mapped, not vmalloced;
 + *
 + *	the I/O driver can reject requests spuriously if it thinks that
 + *	the requests are too big for the device or if they cross a
 + *	controller-defined memory boundary.
 + *
 + * If the buffer is small enough (up to DM_BUFIO_INLINE_VECS pages) and
 + * it is not vmalloced, try using the bio interface.
 + *
 + * If the buffer is big, if it is vmalloced or if the underlying device
 + * rejects the bio because it is too large, use dm-io layer to do the I/O.
 + * The dm-io layer splits the I/O into multiple requests, avoiding the above
 + * shortcomings.
 + *--------------------------------------------------------------*/
 +
 +/*
 + * Completion routine given to dm-io.  It forwards to the end_io handler
 + * stored in b->bio so the request looks as if it had been issued through
 + * the bio interface; any non-zero dm-io error is reported as -EIO.
 + */
 +static void dmio_complete(unsigned long error, void *context)
 +{
 +	struct dm_buffer *buf = context;
 +	int status = 0;
 +
 +	if (error)
 +		status = -EIO;
 +
 +	buf->bio.bi_end_io(&buf->bio, status);
 +}
 +
 +/*
 + * Perform the I/O on buffer @b through the dm-io layer.
 + *
 + * The whole block is described as one region on the client's device and
 + * the buffer memory is passed as DM_IO_VMA when it was vmalloc'ed and as
 + * DM_IO_KMEM otherwise.  @end_io is stored in b->bio so dmio_complete()
 + * can call it; if dm_io() itself returns an error, @end_io is called
 + * here with that error.
 + */
 +static void use_dmio(struct dm_buffer *b, int rw, sector_t block,
 +		     bio_end_io_t *end_io)
 +{
 +	struct dm_bufio_client *client = b->c;
 +	struct dm_io_region where = {
 +		.bdev = client->bdev,
 +		.sector = block << client->sectors_per_block_bits,
 +		.count = client->block_size >> SECTOR_SHIFT,
 +	};
 +	struct dm_io_request req = {
 +		.bi_rw = rw,
 +		.notify.fn = dmio_complete,
 +		.notify.context = b,
 +		.client = client->dm_io,
 +	};
 +	int err;
 +
 +	if (b->data_mode == DATA_MODE_VMALLOC) {
 +		req.mem.type = DM_IO_VMA;
 +		req.mem.ptr.vma = b->data;
 +	} else {
 +		req.mem.type = DM_IO_KMEM;
 +		req.mem.ptr.addr = b->data;
 +	}
 +
 +	b->bio.bi_end_io = end_io;
 +
 +	err = dm_io(&req, 1, &where, NULL);
 +	if (err)
 +		end_io(&b->bio, err);
 +}
 +
 +/*
 + * Perform the I/O on buffer @b with the bio embedded in the buffer.
 + *
 + * The bio is initialised to use the buffer's inline bio_vec array and the
 + * data area is added to it one page (or, for sub-page blocks, one block)
 + * at a time.  If bio_add_page() refuses a page, the request is handed to
 + * use_dmio() instead and nothing is submitted here.
 + */
 +static void use_inline_bio(struct dm_buffer *b, int rw, sector_t block,
 +			   bio_end_io_t *end_io)
 +{
 +	char *ptr;
 +	int len;
 +
 +	bio_init(&b->bio);
 +	b->bio.bi_io_vec = b->bio_vec;
 +	b->bio.bi_max_vecs = DM_BUFIO_INLINE_VECS;
 +	b->bio.bi_sector = block << b->c->sectors_per_block_bits;
 +	b->bio.bi_bdev = b->c->bdev;
 +	b->bio.bi_end_io = end_io;
 +
 +	/*
 +	 * We assume that if len >= PAGE_SIZE ptr is page-aligned.
 +	 * If len < PAGE_SIZE the buffer doesn't cross page boundary.
 +	 */
 +	ptr = b->data;
 +	len = b->c->block_size;
 +
 +	if (len >= PAGE_SIZE)
 +		BUG_ON((unsigned long)ptr & (PAGE_SIZE - 1));
 +	else
 +		BUG_ON((unsigned long)ptr & (len - 1));
 +
 +	do {
 +		if (!bio_add_page(&b->bio, virt_to_page(ptr),
 +				  len < PAGE_SIZE ? len : PAGE_SIZE,
 +				  virt_to_phys(ptr) & (PAGE_SIZE - 1))) {
 +			/* a block of at most one page is not expected to be refused */
 +			BUG_ON(b->c->block_size <= PAGE_SIZE);
 +			use_dmio(b, rw, block, end_io);
 +			return;
 +		}
 +
 +		len -= PAGE_SIZE;
 +		ptr += PAGE_SIZE;
 +	} while (len > 0);
 +
 +	submit_bio(rw, &b->bio);
 +}
 +
 +/*
 + * Start I/O on buffer @b.  For a WRITE the client's write_callback, if
 + * any, runs first.  The inline bio is used when the block fits in
 + * DM_BUFIO_INLINE_VECS pages and the data is not vmalloc'ed; otherwise
 + * the request goes through dm-io.
 + */
 +static void submit_io(struct dm_buffer *b, int rw, sector_t block,
 +		      bio_end_io_t *end_io)
 +{
 +	bool needs_dmio;
 +
 +	if (rw == WRITE && b->c->write_callback)
 +		b->c->write_callback(b);
 +
 +	needs_dmio = b->c->block_size > DM_BUFIO_INLINE_VECS * PAGE_SIZE ||
 +		     b->data_mode == DATA_MODE_VMALLOC;
 +
 +	if (needs_dmio)
 +		use_dmio(b, rw, block, end_io);
 +	else
 +		use_inline_bio(b, rw, block, end_io);
 +}
 +
 +/*----------------------------------------------------------------
 + * Writing dirty buffers
 + *--------------------------------------------------------------*/
 +
 +/*
 + * The endio routine for write.
 + *
 + * Set the error, clear B_WRITING bit and wake anyone who was waiting on
 + * it.  The first non-zero error is also latched in the client's
 + * async_write_error.
 + */
 +static void write_endio(struct bio *bio, int error)
 +{
 +	struct dm_buffer *b = container_of(bio, struct dm_buffer, bio);
 +
 +	b->write_error = error;
 +	if (error) {
 +		struct dm_bufio_client *c = b->c;
 +		/* only stored while async_write_error is still 0 */
 +		(void)cmpxchg(&c->async_write_error, 0, error);
 +	}
 +
 +	BUG_ON(!test_bit(B_WRITING, &b->state));
 +
 +	smp_mb__before_clear_bit();
 +	clear_bit(B_WRITING, &b->state);
 +	smp_mb__after_clear_bit();
 +
 +	wake_up_bit(&b->state, B_WRITING);
 +}
 +
 +/*
 + * This function is called when wait_on_bit is actually waiting.
 + * It is the action passed to wait_on_bit() and wait_on_bit_lock() in this
 + * file: it sleeps in io_schedule() and returns 0.  @word is unused.
 + */
 +static int do_io_schedule(void *word)
 +{
 +	io_schedule();
 +
 +	return 0;
 +}
 +
 +/*
 + * Initiate a write on a dirty buffer, but don't wait for it.
 + *
 + * - If the buffer is not dirty, exit.
 + * - If there some previous write going on, wait for it to finish (we can't
 + *   have two writes on the same buffer simultaneously).
 + * - Submit our write and don't wait on it. We set B_WRITING indicating
 + *   that there is a write in progress.
 + *
 + * B_WRITING is cleared again by write_endio() when the write completes.
 + */
 +static void __write_dirty_buffer(struct dm_buffer *b)
 +{
 +	if (!test_bit(B_DIRTY, &b->state))
 +		return;
 +
 +	clear_bit(B_DIRTY, &b->state);
 +	/* waits out any previous write, then claims B_WRITING for this one */
 +	wait_on_bit_lock(&b->state, B_WRITING,
 +			 do_io_schedule, TASK_UNINTERRUPTIBLE);
 +
 +	submit_io(b, WRITE, b->block, write_endio);
 +}
 +
 +/*
 + * Wait until any activity on the buffer finishes.  Possibly write the
 + * buffer if it is dirty.  When this function finishes, there is no I/O
 + * running on the buffer and the buffer is not dirty.
 + *
 + * The buffer must not be held by anybody (hold_count must be 0).
 + */
 +static void __make_buffer_clean(struct dm_buffer *b)
 +{
 +	BUG_ON(b->hold_count);
 +
 +	if (!b->state)	/* fast case */
 +		return;
 +
 +	wait_on_bit(&b->state, B_READING, do_io_schedule, TASK_UNINTERRUPTIBLE);
 +	__write_dirty_buffer(b);
 +	wait_on_bit(&b->state, B_WRITING, do_io_schedule, TASK_UNINTERRUPTIBLE);
 +}
 +
 +/*
 + * Reclaim one buffer that nobody holds.
 + *
 + * The clean list is searched first and the dirty list second, both from
 + * the tail of the list.  The first buffer with a zero hold_count is made
 + * clean, unlinked and returned.  Returns NULL when every buffer on both
 + * lists is held.
 + */
 +static struct dm_buffer *__get_unclaimed_buffer(struct dm_bufio_client *c)
 +{
 +	struct dm_buffer *b;
 +
 +	list_for_each_entry_reverse(b, &c->lru[LIST_CLEAN], lru_list) {
 +		BUG_ON(test_bit(B_WRITING, &b->state));
 +		BUG_ON(test_bit(B_DIRTY, &b->state));
 +
 +		if (b->hold_count) {
 +			dm_bufio_cond_resched();
 +			continue;
 +		}
 +
 +		__make_buffer_clean(b);
 +		__unlink_buffer(b);
 +		return b;
 +	}
 +
 +	list_for_each_entry_reverse(b, &c->lru[LIST_DIRTY], lru_list) {
 +		BUG_ON(test_bit(B_READING, &b->state));
 +
 +		if (b->hold_count) {
 +			dm_bufio_cond_resched();
 +			continue;
 +		}
 +
 +		__make_buffer_clean(b);
 +		__unlink_buffer(b);
 +		return b;
 +	}
 +
 +	return NULL;
 +}
 +
 +/*
 + * Wait until some other threads free some buffer or release hold count on
 + * some buffer.
 + *
 + * This function is entered with c->lock held, drops it and regains it
 + * before exiting.
 + */
 +static void __wait_for_free_buffer(struct dm_bufio_client *c)
 +{
 +	DECLARE_WAITQUEUE(wait, current);
 +
 +	/* queue ourselves and change task state before the lock is dropped */
 +	add_wait_queue(&c->free_buffer_wait, &wait);
 +	set_task_state(current, TASK_UNINTERRUPTIBLE);
 +	dm_bufio_unlock(c);
 +
 +	io_schedule();
 +
 +	set_task_state(current, TASK_RUNNING);
 +	remove_wait_queue(&c->free_buffer_wait, &wait);
 +
 +	dm_bufio_lock(c);
 +}
 +
 +/*
 + * Obtain a buffer, sleeping until one becomes available if necessary.
 + *
 + * Each attempt tries, in order: a fresh allocation, a buffer from the
 + * client's reserve, and reclaiming an unheld buffer.  If all three fail
 + * the function waits for another thread to free a buffer and retries.
 + *
 + * May drop the lock and regain it.
 + */
 +static struct dm_buffer *__alloc_buffer_wait_no_callback(struct dm_bufio_client *c)
 +{
 +	/*
 +	 * dm-bufio is resistant to allocation failures (it just keeps
 +	 * one buffer reserved in cases all the allocations fail).
 +	 * So the flags say not to try too hard:
 +	 *	GFP_NOIO: don't recurse into the I/O layer
 +	 *	__GFP_NORETRY: don't retry and rather return failure
 +	 *	__GFP_NOMEMALLOC: don't use emergency reserves
 +	 *	__GFP_NOWARN: don't print a warning in case of failure
 +	 */
 +	const gfp_t gfp = GFP_NOIO | __GFP_NORETRY | __GFP_NOMEMALLOC | __GFP_NOWARN;
 +	struct dm_buffer *b;
 +
 +	for (;;) {
 +		/*
 +		 * For debugging, if we set the cache size to 1, no new
 +		 * buffers will be allocated.
 +		 */
 +		if (dm_bufio_cache_size_latch != 1) {
 +			b = alloc_buffer(c, gfp);
 +			if (b)
 +				break;
 +		}
 +
 +		if (!list_empty(&c->reserved_buffers)) {
 +			b = list_first_entry(&c->reserved_buffers,
 +					     struct dm_buffer, lru_list);
 +			list_del(&b->lru_list);
 +			c->need_reserved_buffers++;
 +			break;
 +		}
 +
 +		b = __get_unclaimed_buffer(c);
 +		if (b)
 +			break;
 +
 +		__wait_for_free_buffer(c);
 +	}
 +
 +	return b;
 +}
 +
 +/*
 + * Like __alloc_buffer_wait_no_callback(), but also runs the client's
 + * alloc_callback, when one is set, on the buffer before returning it.
 + */
 +static struct dm_buffer *__alloc_buffer_wait(struct dm_bufio_client *c)
 +{
 +	struct dm_buffer *buf;
 +
 +	buf = __alloc_buffer_wait_no_callback(c);
 +	if (!c->alloc_callback)
 +		return buf;
 +
 +	c->alloc_callback(buf);
 +	return buf;
 +}
 +
 +/*
 + * Dispose of a buffer and wake threads waiting for a free one.  While
 + * the client is owed reserved buffers the buffer is returned to the
 + * reserve; otherwise it is freed.
 + */
 +static void __free_buffer_wake(struct dm_buffer *b)
 +{
 +	struct dm_bufio_client *client = b->c;
 +
 +	if (client->need_reserved_buffers) {
 +		list_add(&b->lru_list, &client->reserved_buffers);
 +		client->need_reserved_buffers--;
 +	} else {
 +		free_buffer(b);
 +	}
 +
 +	wake_up(&client->free_buffer_wait);
 +}
 +
 +/*
 + * Walk the dirty list from its tail and start writeback.
 + *
 + * Buffers found to be neither dirty nor under write are moved to the
 + * clean list.  For the rest a write is started with
 + * __write_dirty_buffer().  With @no_wait set, the walk stops at the first
 + * buffer that already has a write in flight.
 + */
 +static void __write_dirty_buffers_async(struct dm_bufio_client *c, int no_wait)
 +{
 +	struct dm_buffer *cur, *next;
 +
 +	list_for_each_entry_safe_reverse(cur, next, &c->lru[LIST_DIRTY], lru_list) {
 +		BUG_ON(test_bit(B_READING, &cur->state));
 +
 +		if (!(test_bit(B_DIRTY, &cur->state) ||
 +		      test_bit(B_WRITING, &cur->state))) {
 +			__relink_lru(cur, LIST_CLEAN);
 +			continue;
 +		}
 +
 +		if (no_wait) {
 +			if (test_bit(B_WRITING, &cur->state))
 +				return;
 +		}
 +
 +		__write_dirty_buffer(cur);
 +		dm_bufio_cond_resched();
 +	}
 +}
 +
 +/*
 + * Get writeback threshold and buffer limit for a given client.
 + */
 +static void __get_memory_limit(struct dm_bufio_client *c,
 +			       unsigned long *threshold_buffers,
 +			       unsigned long *limit_buffers)
 +{
 +	unsigned long buffers;
 +
 +	if (dm_bufio_cache_size != dm_bufio_cache_size_latch) {
 +		mutex_lock(&dm_bufio_clients_lock);
 +		__cache_size_refresh();
 +		mutex_unlock(&dm_bufio_clients_lock);
 +	}
 +
 +	buffers = dm_bufio_cache_size_per_client >>
 +		  (c->sectors_per_block_bits + SECTOR_SHIFT);
 +
 +	if (buffers < DM_BUFIO_MIN_BUFFERS)
 +		buffers = DM_BUFIO_MIN_BUFFERS;
 +
 +	*limit_buffers = buffers;
 +	*threshold_buffers = buffers * DM_BUFIO_WRITEBACK_PERCENT / 100;
 +}
 +
 +/*
 + * Check if we're over watermark.
 + * If we are over threshold_buffers, start freeing buffers.
 + * If we're over "limit_buffers", block until we get under the limit.
 + */
 +static void __check_watermark(struct dm_bufio_client *c)
 +{
 +	unsigned long threshold_buffers, limit_buffers;
 +
 +	__get_memory_limit(c, &threshold_buffers, &limit_buffers);
 +
 +	while (c->n_buffers[LIST_CLEAN] + c->n_buffers[LIST_DIRTY] >
 +	       limit_buffers) {
 +
 +		struct dm_buffer *b = __get_unclaimed_buffer(c);
 +
 +		if (!b)
 +			return;
 +
 +		__free_buffer_wake(b);
 +		dm_bufio_cond_resched();
 +	}
 +
 +	if (c->n_buffers[LIST_DIRTY] > threshold_buffers)
 +		__write_dirty_buffers_async(c, 1);
 +}
 +
 +/*
 + * Find a buffer in the hash.
 + */
 +static struct dm_buffer *__find(struct dm_bufio_client *c, sector_t block)
 +{
 +	struct dm_buffer *b;
 +	struct hlist_node *hn;
 +
 +	hlist_for_each_entry(b, hn, &c->cache_hash[DM_BUFIO_HASH(block)],
 +			     hash_list) {
 +		dm_bufio_cond_resched();
 +		if (b->block == block)
 +			return b;
 +	}
 +
 +	return NULL;
 +}
 +
 +/*----------------------------------------------------------------
 + * Getting a buffer
 + *--------------------------------------------------------------*/
 +
 +enum new_flag {
 +	NF_FRESH = 0,
 +	NF_READ = 1,
 +	NF_GET = 2
 +};
 +
 +static struct dm_buffer *__bufio_new(struct dm_bufio_client *c, sector_t block,
 +				     enum new_flag nf, struct dm_buffer **bp,
 +				     int *need_submit)
 +{
 +	struct dm_buffer *b, *new_b = NULL;
 +
 +	*need_submit = 0;
 +
 +	b = __find(c, block);
 +	if (b) {
 +		b->hold_count++;
 +		__relink_lru(b, test_bit(B_DIRTY, &b->state) ||
 +			     test_bit(B_WRITING, &b->state));
 +		return b;
 +	}
 +
 +	if (nf == NF_GET)
 +		return NULL;
 +
 +	new_b = __alloc_buffer_wait(c);
 +
 +	/*
 +	 * We've had a period where the mutex was unlocked, so need to
 +	 * recheck the hash table.
 +	 */
 +	b = __find(c, block);
 +	if (b) {
 +		__free_buffer_wake(new_b);
 +		b->hold_count++;
 +		__relink_lru(b, test_bit(B_DIRTY, &b->state) ||
 +			     test_bit(B_WRITING, &b->state));
 +		return b;
 +	}
 +
 +	__check_watermark(c);
 +
 +	b = new_b;
 +	b->hold_count = 1;
 +	b->read_error = 0;
 +	b->write_error = 0;
 +	__link_buffer(b, block, LIST_CLEAN);
 +
 +	if (nf == NF_FRESH) {
 +		b->state = 0;
 +		return b;
 +	}
 +
 +	b->state = 1 << B_READING;
 +	*need_submit = 1;
 +
 +	return b;
 +}
 +
 +/*
 + * The endio routine for reading: set the error, clear the bit and wake up
 + * anyone waiting on the buffer.
 + */
 +static void read_endio(struct bio *bio, int error)
 +{
 +	struct dm_buffer *b = container_of(bio, struct dm_buffer, bio);
 +
 +	b->read_error = error;
 +
 +	BUG_ON(!test_bit(B_READING, &b->state));
 +
 +	smp_mb__before_clear_bit();
 +	clear_bit(B_READING, &b->state);
 +	smp_mb__after_clear_bit();
 +
 +	wake_up_bit(&b->state, B_READING);
 +}
 +
 +/*
 + * A common routine for dm_bufio_new and dm_bufio_read.  Operation of these
 + * functions is similar except that dm_bufio_new doesn't read the
 + * buffer from the disk (assuming that the caller overwrites all the data
 + * and uses dm_bufio_mark_buffer_dirty to write new data back).
 + */
 +static void *new_read(struct dm_bufio_client *c, sector_t block,
 +		      enum new_flag nf, struct dm_buffer **bp)
 +{
 +	int need_submit;
 +	struct dm_buffer *b;
 +
 +	dm_bufio_lock(c);
 +	b = __bufio_new(c, block, nf, bp, &need_submit);
 +	dm_bufio_unlock(c);
 +
 +	if (!b || IS_ERR(b))
 +		return b;
 +
 +	if (need_submit)
 +		submit_io(b, READ, b->block, read_endio);
 +
 +	wait_on_bit(&b->state, B_READING, do_io_schedule, TASK_UNINTERRUPTIBLE);
 +
 +	if (b->read_error) {
 +		int error = b->read_error;
 +
 +		dm_bufio_release(b);
 +
 +		return ERR_PTR(error);
 +	}
 +
 +	*bp = b;
 +
 +	return b->data;
 +}
 +
 +void *dm_bufio_get(struct dm_bufio_client *c, sector_t block,
 +		   struct dm_buffer **bp)
 +{
 +	return new_read(c, block, NF_GET, bp);
 +}
 +EXPORT_SYMBOL_GPL(dm_bufio_get);
 +
 +void *dm_bufio_read(struct dm_bufio_client *c, sector_t block,
 +		    struct dm_buffer **bp)
 +{
 +	BUG_ON(dm_bufio_in_request());
 +
 +	return new_read(c, block, NF_READ, bp);
 +}
 +EXPORT_SYMBOL_GPL(dm_bufio_read);
 +
 +void *dm_bufio_new(struct dm_bufio_client *c, sector_t block,
 +		   struct dm_buffer **bp)
 +{
 +	BUG_ON(dm_bufio_in_request());
 +
 +	return new_read(c, block, NF_FRESH, bp);
 +}
 +EXPORT_SYMBOL_GPL(dm_bufio_new);
 +
 +void dm_bufio_release(struct dm_buffer *b)
 +{
 +	struct dm_bufio_client *c = b->c;
 +
 +	dm_bufio_lock(c);
 +
 +	BUG_ON(test_bit(B_READING, &b->state));
 +	BUG_ON(!b->hold_count);
 +
 +	b->hold_count--;
 +	if (!b->hold_count) {
 +		wake_up(&c->free_buffer_wait);
 +
 +		/*
 +		 * If there were errors on the buffer, and the buffer is not
 +		 * to be written, free the buffer. There is no point in caching
 +		 * invalid buffer.
 +		 */
 +		if ((b->read_error || b->write_error) &&
 +		    !test_bit(B_WRITING, &b->state) &&
 +		    !test_bit(B_DIRTY, &b->state)) {
 +			__unlink_buffer(b);
 +			__free_buffer_wake(b);
 +		}
 +	}
 +
 +	dm_bufio_unlock(c);
 +}
 +EXPORT_SYMBOL_GPL(dm_bufio_release);
 +
 +void dm_bufio_mark_buffer_dirty(struct dm_buffer *b)
 +{
 +	struct dm_bufio_client *c = b->c;
 +
 +	dm_bufio_lock(c);
 +
 +	if (!test_and_set_bit(B_DIRTY, &b->state))
 +		__relink_lru(b, LIST_DIRTY);
 +
 +	dm_bufio_unlock(c);
 +}
 +EXPORT_SYMBOL_GPL(dm_bufio_mark_buffer_dirty);
 +
 +void dm_bufio_write_dirty_buffers_async(struct dm_bufio_client *c)
 +{
 +	BUG_ON(dm_bufio_in_request());
 +
 +	dm_bufio_lock(c);
 +	__write_dirty_buffers_async(c, 0);
 +	dm_bufio_unlock(c);
 +}
 +EXPORT_SYMBOL_GPL(dm_bufio_write_dirty_buffers_async);
 +
 +/*
 + * For performance, it is essential that the buffers are written asynchronously
 + * and simultaneously (so that the block layer can merge the writes) and then
 + * waited upon.
 + *
 + * Finally, we flush hardware disk cache.
 + */
 +int dm_bufio_write_dirty_buffers(struct dm_bufio_client *c)
 +{
 +	int a, f;
 +	unsigned long buffers_processed = 0;
 +	struct dm_buffer *b, *tmp;
 +
 +	dm_bufio_lock(c);
 +	__write_dirty_buffers_async(c, 0);
 +
 +again:
 +	list_for_each_entry_safe_reverse(b, tmp, &c->lru[LIST_DIRTY], lru_list) {
 +		int dropped_lock = 0;
 +
 +		if (buffers_processed < c->n_buffers[LIST_DIRTY])
 +			buffers_processed++;
 +
 +		BUG_ON(test_bit(B_READING, &b->state));
 +
 +		if (test_bit(B_WRITING, &b->state)) {
 +			if (buffers_processed < c->n_buffers[LIST_DIRTY]) {
 +				dropped_lock = 1;
 +				b->hold_count++;
 +				dm_bufio_unlock(c);
 +				wait_on_bit(&b->state, B_WRITING,
 +					    do_io_schedule,
 +					    TASK_UNINTERRUPTIBLE);
 +				dm_bufio_lock(c);
 +				b->hold_count--;
 +			} else
 +				wait_on_bit(&b->state, B_WRITING,
 +					    do_io_schedule,
 +					    TASK_UNINTERRUPTIBLE);
 +		}
 +
 +		if (!test_bit(B_DIRTY, &b->state) &&
 +		    !test_bit(B_WRITING, &b->state))
 +			__relink_lru(b, LIST_CLEAN);
 +
 +		dm_bufio_cond_resched();
 +
 +		/*
 +		 * If we dropped the lock, the list is no longer consistent,
 +		 * so we must restart the search.
 +		 *
 +		 * In the most common case, the buffer just processed is
 +		 * relinked to the clean list, so we won't loop scanning the
 +		 * same buffer again and again.
 +		 *
 +		 * This may livelock if there is another thread simultaneously
 +		 * dirtying buffers, so we count the number of buffers walked
 +		 * and if it exceeds the total number of buffers, it means that
 +		 * someone is doing some writes simultaneously with us.  In
 +		 * this case, stop, dropping the lock.
 +		 */
 +		if (dropped_lock)
 +			goto again;
 +	}
 +	wake_up(&c->free_buffer_wait);
 +	dm_bufio_unlock(c);
 +
 +	a = xchg(&c->async_write_error, 0);
 +	f = dm_bufio_issue_flush(c);
 +	if (a)
 +		return a;
 +
 +	return f;
 +}
 +EXPORT_SYMBOL_GPL(dm_bufio_write_dirty_buffers);
 +
 +/*
 + * Use dm-io to send an empty barrier to flush the device.
 + */
 +int dm_bufio_issue_flush(struct dm_bufio_client *c)
 +{
 +	struct dm_io_request io_req = {
 +		.bi_rw = REQ_FLUSH,
 +		.mem.type = DM_IO_KMEM,
 +		.mem.ptr.addr = NULL,
 +		.client = c->dm_io,
 +	};
 +	struct dm_io_region io_reg = {
 +		.bdev = c->bdev,
 +		.sector = 0,
 +		.count = 0,
 +	};
 +
 +	BUG_ON(dm_bufio_in_request());
 +
 +	return dm_io(&io_req, 1, &io_reg, NULL);
 +}
 +EXPORT_SYMBOL_GPL(dm_bufio_issue_flush);
 +
 +/*
 + * We first delete any other buffer that may be at that new location.
 + *
 + * Then, we write the buffer to the original location if it was dirty.
 + *
 + * Then, if we are the only one who is holding the buffer, relink the buffer
 + * in the hash queue for the new location.
 + *
 + * If there was someone else holding the buffer, we write it to the new
 + * location but not relink it, because that other user needs to have the buffer
 + * at the same place.
 + */
 +void dm_bufio_release_move(struct dm_buffer *b, sector_t new_block)
 +{
 +	struct dm_bufio_client *c = b->c;
 +	struct dm_buffer *new;
 +
 +	BUG_ON(dm_bufio_in_request());
 +
 +	dm_bufio_lock(c);
 +
 +retry:
 +	new = __find(c, new_block);
 +	if (new) {
 +		if (new->hold_count) {
 +			__wait_for_free_buffer(c);
 +			goto retry;
 +		}
 +
 +		/*
 +		 * FIXME: Is there any point waiting for a write that's going
 +		 * to be overwritten in a bit?
 +		 */
 +		__make_buffer_clean(new);
 +		__unlink_buffer(new);
 +		__free_buffer_wake(new);
 +	}
 +
 +	BUG_ON(!b->hold_count);
 +	BUG_ON(test_bit(B_READING, &b->state));
 +
 +	__write_dirty_buffer(b);
 +	if (b->hold_count == 1) {
 +		wait_on_bit(&b->state, B_WRITING,
 +			    do_io_schedule, TASK_UNINTERRUPTIBLE);
 +		set_bit(B_DIRTY, &b->state);
 +		__unlink_buffer(b);
 +		__link_buffer(b, new_block, LIST_DIRTY);
 +	} else {
 +		sector_t old_block;
 +		wait_on_bit_lock(&b->state, B_WRITING,
 +				 do_io_schedule, TASK_UNINTERRUPTIBLE);
 +		/*
 +		 * Relink buffer to "new_block" so that write_callback
 +		 * sees "new_block" as a block number.
 +		 * After the write, link the buffer back to old_block.
 +		 * All this must be done in bufio lock, so that block number
 +		 * change isn't visible to other threads.
 +		 */
 +		old_block = b->block;
 +		__unlink_buffer(b);
 +		__link_buffer(b, new_block, b->list_mode);
 +		submit_io(b, WRITE, new_block, write_endio);
 +		wait_on_bit(&b->state, B_WRITING,
 +			    do_io_schedule, TASK_UNINTERRUPTIBLE);
 +		__unlink_buffer(b);
 +		__link_buffer(b, old_block, b->list_mode);
 +	}
 +
 +	dm_bufio_unlock(c);
 +	dm_bufio_release(b);
 +}
 +EXPORT_SYMBOL_GPL(dm_bufio_release_move);
 +
 +unsigned dm_bufio_get_block_size(struct dm_bufio_client *c)
 +{
 +	return c->block_size;
 +}
 +EXPORT_SYMBOL_GPL(dm_bufio_get_block_size);
 +
 +sector_t dm_bufio_get_device_size(struct dm_bufio_client *c)
 +{
 +	return i_size_read(c->bdev->bd_inode) >>
 +			   (SECTOR_SHIFT + c->sectors_per_block_bits);
 +}
 +EXPORT_SYMBOL_GPL(dm_bufio_get_device_size);
 +
 +sector_t dm_bufio_get_block_number(struct dm_buffer *b)
 +{
 +	return b->block;
 +}
 +EXPORT_SYMBOL_GPL(dm_bufio_get_block_number);
 +
 +void *dm_bufio_get_block_data(struct dm_buffer *b)
 +{
 +	return b->data;
 +}
 +EXPORT_SYMBOL_GPL(dm_bufio_get_block_data);
 +
 +void *dm_bufio_get_aux_data(struct dm_buffer *b)
 +{
 +	return b + 1;
 +}
 +EXPORT_SYMBOL_GPL(dm_bufio_get_aux_data);
 +
 +struct dm_bufio_client *dm_bufio_get_client(struct dm_buffer *b)
 +{
 +	return b->c;
 +}
 +EXPORT_SYMBOL_GPL(dm_bufio_get_client);
 +
 +static void drop_buffers(struct dm_bufio_client *c)
 +{
 +	struct dm_buffer *b;
 +	int i;
 +
 +	BUG_ON(dm_bufio_in_request());
 +
 +	/*
 +	 * An optimization so that the buffers are not written one-by-one.
 +	 */
 +	dm_bufio_write_dirty_buffers_async(c);
 +
 +	dm_bufio_lock(c);
 +
 +	while ((b = __get_unclaimed_buffer(c)))
 +		__free_buffer_wake(b);
 +
 +	for (i = 0; i < LIST_SIZE; i++)
 +		list_for_each_entry(b, &c->lru[i], lru_list)
 +			DMERR("leaked buffer %llx, hold count %u, list %d",
 +			      (unsigned long long)b->block, b->hold_count, i);
 +
 +	for (i = 0; i < LIST_SIZE; i++)
 +		BUG_ON(!list_empty(&c->lru[i]));
 +
 +	dm_bufio_unlock(c);
 +}
 +
 +/*
 + * Test if the buffer is unused and too old, and commit it.
 + * If noio is set (gfp lacks __GFP_IO), we must not do any I/O because we hold
 + * dm_bufio_clients_lock and we would risk deadlock if the I/O gets rerouted to
 + * a different bufio client.
 + */
 +static int __cleanup_old_buffer(struct dm_buffer *b, gfp_t gfp,
 +				unsigned long max_jiffies)
 +{
 +	if (jiffies - b->last_accessed < max_jiffies)
 +		return 1;
 +
 +	if (!(gfp & __GFP_IO)) {
 +		if (test_bit(B_READING, &b->state) ||
 +		    test_bit(B_WRITING, &b->state) ||
 +		    test_bit(B_DIRTY, &b->state))
 +			return 1;
 +	}
 +
 +	if (b->hold_count)
 +		return 1;
 +
 +	__make_buffer_clean(b);
 +	__unlink_buffer(b);
 +	__free_buffer_wake(b);
 +
 +	return 0;
 +}
 +
 +static void __scan(struct dm_bufio_client *c, unsigned long nr_to_scan,
 +		   struct shrink_control *sc)
 +{
 +	int l;
 +	struct dm_buffer *b, *tmp;
 +
 +	for (l = 0; l < LIST_SIZE; l++) {
 +		list_for_each_entry_safe_reverse(b, tmp, &c->lru[l], lru_list)
 +			if (!__cleanup_old_buffer(b, sc->gfp_mask, 0) &&
 +			    !--nr_to_scan)
 +				return;
 +		dm_bufio_cond_resched();
 +	}
 +}
 +
 +static int shrink(struct shrinker *shrinker, struct shrink_control *sc)
 +{
 +	struct dm_bufio_client *c =
 +	    container_of(shrinker, struct dm_bufio_client, shrinker);
 +	unsigned long r;
 +	unsigned long nr_to_scan = sc->nr_to_scan;
 +
 +	if (sc->gfp_mask & __GFP_IO)
 +		dm_bufio_lock(c);
 +	else if (!dm_bufio_trylock(c))
 +		return !nr_to_scan ? 0 : -1;
 +
 +	if (nr_to_scan)
 +		__scan(c, nr_to_scan, sc);
 +
 +	r = c->n_buffers[LIST_CLEAN] + c->n_buffers[LIST_DIRTY];
 +	if (r > INT_MAX)
 +		r = INT_MAX;
 +
 +	dm_bufio_unlock(c);
 +
 +	return r;
 +}
 +
 +/*
 + * Create the buffering interface
 + */
 +struct dm_bufio_client *dm_bufio_client_create(struct block_device *bdev, unsigned block_size,
 +					       unsigned reserved_buffers, unsigned aux_size,
 +					       void (*alloc_callback)(struct dm_buffer *),
 +					       void (*write_callback)(struct dm_buffer *))
 +{
 +	int r;
 +	struct dm_bufio_client *c;
 +	unsigned i;
 +
 +	BUG_ON(block_size < 1 << SECTOR_SHIFT ||
 +	       (block_size & (block_size - 1)));
 +
 +	c = kmalloc(sizeof(*c), GFP_KERNEL);
 +	if (!c) {
 +		r = -ENOMEM;
 +		goto bad_client;
 +	}
 +	c->cache_hash = vmalloc(sizeof(struct hlist_head) << DM_BUFIO_HASH_BITS);
 +	if (!c->cache_hash) {
 +		r = -ENOMEM;
 +		goto bad_hash;
 +	}
 +
 +	c->bdev = bdev;
 +	c->block_size = block_size;
 +	c->sectors_per_block_bits = ffs(block_size) - 1 - SECTOR_SHIFT;
 +	c->pages_per_block_bits = (ffs(block_size) - 1 >= PAGE_SHIFT) ?
 +				  ffs(block_size) - 1 - PAGE_SHIFT : 0;
 +	c->blocks_per_page_bits = (ffs(block_size) - 1 < PAGE_SHIFT ?
 +				  PAGE_SHIFT - (ffs(block_size) - 1) : 0);
 +
 +	c->aux_size = aux_size;
 +	c->alloc_callback = alloc_callback;
 +	c->write_callback = write_callback;
 +
 +	for (i = 0; i < LIST_SIZE; i++) {
 +		INIT_LIST_HEAD(&c->lru[i]);
 +		c->n_buffers[i] = 0;
 +	}
 +
 +	for (i = 0; i < 1 << DM_BUFIO_HASH_BITS; i++)
 +		INIT_HLIST_HEAD(&c->cache_hash[i]);
 +
 +	mutex_init(&c->lock);
 +	INIT_LIST_HEAD(&c->reserved_buffers);
 +	c->need_reserved_buffers = reserved_buffers;
 +
 +	init_waitqueue_head(&c->free_buffer_wait);
 +	c->async_write_error = 0;
 +
 +	c->dm_io = dm_io_client_create();
 +	if (IS_ERR(c->dm_io)) {
 +		r = PTR_ERR(c->dm_io);
 +		goto bad_dm_io;
 +	}
 +
 +	mutex_lock(&dm_bufio_clients_lock);
 +	if (c->blocks_per_page_bits) {
 +		if (!DM_BUFIO_CACHE_NAME(c)) {
 +			DM_BUFIO_CACHE_NAME(c) = kasprintf(GFP_KERNEL, "dm_bufio_cache-%u", c->block_size);
 +			if (!DM_BUFIO_CACHE_NAME(c)) {
 +				r = -ENOMEM;
 +				mutex_unlock(&dm_bufio_clients_lock);
 +				goto bad_cache;
 +			}
 +		}
 +
 +		if (!DM_BUFIO_CACHE(c)) {
 +			DM_BUFIO_CACHE(c) = kmem_cache_create(DM_BUFIO_CACHE_NAME(c),
 +							      c->block_size,
 +							      c->block_size, 0, NULL);
 +			if (!DM_BUFIO_CACHE(c)) {
 +				r = -ENOMEM;
 +				mutex_unlock(&dm_bufio_clients_lock);
 +				goto bad_cache;
 +			}
 +		}
 +	}
 +	mutex_unlock(&dm_bufio_clients_lock);
 +
 +	while (c->need_reserved_buffers) {
 +		struct dm_buffer *b = alloc_buffer(c, GFP_KERNEL);
 +
 +		if (!b) {
 +			r = -ENOMEM;
 +			goto bad_buffer;
 +		}
 +		__free_buffer_wake(b);
 +	}
 +
 +	mutex_lock(&dm_bufio_clients_lock);
 +	dm_bufio_client_count++;
 +	list_add(&c->client_list, &dm_bufio_all_clients);
 +	__cache_size_refresh();
 +	mutex_unlock(&dm_bufio_clients_lock);
 +
 +	c->shrinker.shrink = shrink;
 +	c->shrinker.seeks = 1;
 +	c->shrinker.batch = 0;
 +	register_shrinker(&c->shrinker);
 +
 +	return c;
 +
 +bad_buffer:
 +bad_cache:
 +	while (!list_empty(&c->reserved_buffers)) {
 +		struct dm_buffer *b = list_entry(c->reserved_buffers.next,
 +						 struct dm_buffer, lru_list);
 +		list_del(&b->lru_list);
 +		free_buffer(b);
 +	}
 +	dm_io_client_destroy(c->dm_io);
 +bad_dm_io:
 +	vfree(c->cache_hash);
 +bad_hash:
 +	kfree(c);
 +bad_client:
 +	return ERR_PTR(r);
 +}
 +EXPORT_SYMBOL_GPL(dm_bufio_client_create);
 +
 +/*
 + * Free the buffering interface.
 + * It is required that there are no references on any buffers.
 + */
 +void dm_bufio_client_destroy(struct dm_bufio_client *c)
 +{
 +	unsigned i;
 +
 +	drop_buffers(c);
 +
 +	unregister_shrinker(&c->shrinker);
 +
 +	mutex_lock(&dm_bufio_clients_lock);
 +
 +	list_del(&c->client_list);
 +	dm_bufio_client_count--;
 +	__cache_size_refresh();
 +
 +	mutex_unlock(&dm_bufio_clients_lock);
 +
 +	for (i = 0; i < 1 << DM_BUFIO_HASH_BITS; i++)
 +		BUG_ON(!hlist_empty(&c->cache_hash[i]));
 +
 +	BUG_ON(c->need_reserved_buffers);
 +
 +	while (!list_empty(&c->reserved_buffers)) {
 +		struct dm_buffer *b = list_entry(c->reserved_buffers.next,
 +						 struct dm_buffer, lru_list);
 +		list_del(&b->lru_list);
 +		free_buffer(b);
 +	}
 +
 +	for (i = 0; i < LIST_SIZE; i++)
 +		if (c->n_buffers[i])
 +			DMERR("leaked buffer count %d: %ld", i, c->n_buffers[i]);
 +
 +	for (i = 0; i < LIST_SIZE; i++)
 +		BUG_ON(c->n_buffers[i]);
 +
 +	dm_io_client_destroy(c->dm_io);
 +	vfree(c->cache_hash);
 +	kfree(c);
 +}
 +EXPORT_SYMBOL_GPL(dm_bufio_client_destroy);
 +
 +static void cleanup_old_buffers(void)
 +{
 +	unsigned long max_age = dm_bufio_max_age;
 +	struct dm_bufio_client *c;
 +
 +	barrier();
 +
 +	if (max_age > ULONG_MAX / HZ)
 +		max_age = ULONG_MAX / HZ;
 +
 +	mutex_lock(&dm_bufio_clients_lock);
 +	list_for_each_entry(c, &dm_bufio_all_clients, client_list) {
 +		if (!dm_bufio_trylock(c))
 +			continue;
 +
 +		while (!list_empty(&c->lru[LIST_CLEAN])) {
 +			struct dm_buffer *b;
 +			b = list_entry(c->lru[LIST_CLEAN].prev,
 +				       struct dm_buffer, lru_list);
 +			if (__cleanup_old_buffer(b, 0, max_age * HZ))
 +				break;
 +			dm_bufio_cond_resched();
 +		}
 +
 +		dm_bufio_unlock(c);
 +		dm_bufio_cond_resched();
 +	}
 +	mutex_unlock(&dm_bufio_clients_lock);
 +}
 +
 +static struct workqueue_struct *dm_bufio_wq;
 +static struct delayed_work dm_bufio_work;
 +
 +static void work_fn(struct work_struct *w)
 +{
 +	cleanup_old_buffers();
 +
 +	queue_delayed_work(dm_bufio_wq, &dm_bufio_work,
 +			   DM_BUFIO_WORK_TIMER_SECS * HZ);
 +}
 +
 +/*----------------------------------------------------------------
 + * Module setup
 + *--------------------------------------------------------------*/
 +
 +/*
 + * This is called only once for the whole dm_bufio module.
 + * It initializes memory limit.
 + */
 +static int __init dm_bufio_init(void)
 +{
 +	__u64 mem;
 +
 +	memset(&dm_bufio_caches, 0, sizeof dm_bufio_caches);
 +	memset(&dm_bufio_cache_names, 0, sizeof dm_bufio_cache_names);
 +
 +	mem = (__u64)((totalram_pages - totalhigh_pages) *
 +		      DM_BUFIO_MEMORY_PERCENT / 100) << PAGE_SHIFT;
 +
 +	if (mem > ULONG_MAX)
 +		mem = ULONG_MAX;
 +
 +#ifdef CONFIG_MMU
 +	/*
 +	 * Get the size of vmalloc space the same way as VMALLOC_TOTAL
 +	 * in fs/proc/internal.h
 +	 */
 +	if (mem > (VMALLOC_END - VMALLOC_START) * DM_BUFIO_VMALLOC_PERCENT / 100)
 +		mem = (VMALLOC_END - VMALLOC_START) * DM_BUFIO_VMALLOC_PERCENT / 100;
 +#endif
 +
 +	dm_bufio_default_cache_size = mem;
 +
 +	mutex_lock(&dm_bufio_clients_lock);
 +	__cache_size_refresh();
 +	mutex_unlock(&dm_bufio_clients_lock);
 +
 +	dm_bufio_wq = create_singlethread_workqueue("dm_bufio_cache");
 +	if (!dm_bufio_wq)
 +		return -ENOMEM;
 +
 +	INIT_DELAYED_WORK(&dm_bufio_work, work_fn);
 +	queue_delayed_work(dm_bufio_wq, &dm_bufio_work,
 +			   DM_BUFIO_WORK_TIMER_SECS * HZ);
 +
 +	return 0;
 +}
 +
 +/*
 + * This is called once when unloading the dm_bufio module.
 + */
 +static void __exit dm_bufio_exit(void)
 +{
 +	int bug = 0;
 +	int i;
 +
 +	cancel_delayed_work_sync(&dm_bufio_work);
 +	destroy_workqueue(dm_bufio_wq);
 +
 +	for (i = 0; i < ARRAY_SIZE(dm_bufio_caches); i++) {
 +		struct kmem_cache *kc = dm_bufio_caches[i];
 +
 +		if (kc)
 +			kmem_cache_destroy(kc);
 +	}
 +
 +	for (i = 0; i < ARRAY_SIZE(dm_bufio_cache_names); i++)
 +		kfree(dm_bufio_cache_names[i]);
 +
 +	if (dm_bufio_client_count) {
 +		DMCRIT("%s: dm_bufio_client_count leaked: %d",
 +			__func__, dm_bufio_client_count);
 +		bug = 1;
 +	}
 +
 +	if (dm_bufio_current_allocated) {
 +		DMCRIT("%s: dm_bufio_current_allocated leaked: %lu",
 +			__func__, dm_bufio_current_allocated);
 +		bug = 1;
 +	}
 +
 +	if (dm_bufio_allocated_get_free_pages) {
 +		DMCRIT("%s: dm_bufio_allocated_get_free_pages leaked: %lu",
 +		       __func__, dm_bufio_allocated_get_free_pages);
 +		bug = 1;
 +	}
 +
 +	if (dm_bufio_allocated_vmalloc) {
 +		DMCRIT("%s: dm_bufio_vmalloc leaked: %lu",
 +		       __func__, dm_bufio_allocated_vmalloc);
 +		bug = 1;
 +	}
 +
 +	if (bug)
 +		BUG();
 +}
 +
 +module_init(dm_bufio_init)
 +module_exit(dm_bufio_exit)
 +
 +module_param_named(max_cache_size_bytes, dm_bufio_cache_size, ulong, S_IRUGO | S_IWUSR);
 +MODULE_PARM_DESC(max_cache_size_bytes, "Size of metadata cache");
 +
 +module_param_named(max_age_seconds, dm_bufio_max_age, uint, S_IRUGO | S_IWUSR);
 +MODULE_PARM_DESC(max_age_seconds, "Max age of a buffer in seconds");
 +
 +module_param_named(peak_allocated_bytes, dm_bufio_peak_allocated, ulong, S_IRUGO | S_IWUSR);
 +MODULE_PARM_DESC(peak_allocated_bytes, "Tracks the maximum allocated memory");
 +
 +module_param_named(allocated_kmem_cache_bytes, dm_bufio_allocated_kmem_cache, ulong, S_IRUGO);
 +MODULE_PARM_DESC(allocated_kmem_cache_bytes, "Memory allocated with kmem_cache_alloc");
 +
 +module_param_named(allocated_get_free_pages_bytes, dm_bufio_allocated_get_free_pages, ulong, S_IRUGO);
 +MODULE_PARM_DESC(allocated_get_free_pages_bytes, "Memory allocated with get_free_pages");
 +
 +module_param_named(allocated_vmalloc_bytes, dm_bufio_allocated_vmalloc, ulong, S_IRUGO);
 +MODULE_PARM_DESC(allocated_vmalloc_bytes, "Memory allocated with vmalloc");
 +
 +module_param_named(current_allocated_bytes, dm_bufio_current_allocated, ulong, S_IRUGO);
 +MODULE_PARM_DESC(current_allocated_bytes, "Memory currently used by the cache");
 +
 +MODULE_AUTHOR("Mikulas Patocka <dm-devel@redhat.com>");
 +MODULE_DESCRIPTION(DM_NAME " buffered I/O library");
 +MODULE_LICENSE("GPL");
diff --cc drivers/md/persistent-data/dm-btree-remove.c
index 65fd85ec6514,000000000000..023fbc2d389e
mode 100644,000000..100644
--- a/drivers/md/persistent-data/dm-btree-remove.c
+++ b/drivers/md/persistent-data/dm-btree-remove.c
@@@ -1,566 -1,0 +1,566 @@@
 +/*
 + * Copyright (C) 2011 Red Hat, Inc.
 + *
 + * This file is released under the GPL.
 + */
 +
 +#include "dm-btree.h"
 +#include "dm-btree-internal.h"
 +#include "dm-transaction-manager.h"
 +
- #include <linux/module.h>
++#include <linux/export.h>
 +
 +/*
 + * Removing an entry from a btree
 + * ==============================
 + *
 + * A very important constraint for our btree is that no node, except the
 + * root, may have fewer than a certain number of entries.
 + * (MIN_ENTRIES <= nr_entries <= MAX_ENTRIES).
 + *
 + * Ensuring this is complicated by the way we want to only ever hold the
 + * locks on 2 nodes concurrently, and only change nodes in a top to bottom
 + * fashion.
 + *
 + * Each node may have a left or right sibling.  When descending the spine,
 + * if a node contains only MIN_ENTRIES then we try and increase this to at
 + * least MIN_ENTRIES + 1.  We do this in the following ways:
 + *
 + * [A] No siblings => this can only happen if the node is the root, in which
 + *     case we copy the child's contents over the root.
 + *
 + * [B] No left sibling
 + *     ==> rebalance(node, right sibling)
 + *
 + * [C] No right sibling
 + *     ==> rebalance(left sibling, node)
 + *
 + * [D] Both siblings, total_entries(left, node, right) <= DEL_THRESHOLD
 + *     ==> delete node adding its contents to left and right
 + *
 + * [E] Both siblings, total_entries(left, node, right) > DEL_THRESHOLD
 + *     ==> rebalance(left, node, right)
 + *
 + * After these operations it's possible that our original node no
 + * longer contains the desired sub tree.  For this reason this rebalancing
 + * is performed on the children of the current node.  This also avoids
 + * having a special case for the root.
 + *
 + * Once this rebalancing has occurred we can then step into the child node
 + * for internal nodes.  Or delete the entry for leaf nodes.
 + */
 +
 +/*
 + * Some little utilities for moving node data around.
 + */
 +static void node_shift(struct node *n, int shift)
 +{
 +	uint32_t nr_entries = le32_to_cpu(n->header.nr_entries);
 +	uint32_t value_size = le32_to_cpu(n->header.value_size);
 +
 +	if (shift < 0) {
 +		shift = -shift;
 +		BUG_ON(shift > nr_entries);
 +		BUG_ON((void *) key_ptr(n, shift) >= value_ptr(n, shift, value_size));
 +		memmove(key_ptr(n, 0),
 +			key_ptr(n, shift),
 +			(nr_entries - shift) * sizeof(__le64));
 +		memmove(value_ptr(n, 0, value_size),
 +			value_ptr(n, shift, value_size),
 +			(nr_entries - shift) * value_size);
 +	} else {
 +		BUG_ON(nr_entries + shift > le32_to_cpu(n->header.max_entries));
 +		memmove(key_ptr(n, shift),
 +			key_ptr(n, 0),
 +			nr_entries * sizeof(__le64));
 +		memmove(value_ptr(n, shift, value_size),
 +			value_ptr(n, 0, value_size),
 +			nr_entries * value_size);
 +	}
 +}
 +
 +static void node_copy(struct node *left, struct node *right, int shift)
 +{
 +	uint32_t nr_left = le32_to_cpu(left->header.nr_entries);
 +	uint32_t value_size = le32_to_cpu(left->header.value_size);
 +	BUG_ON(value_size != le32_to_cpu(right->header.value_size));
 +
 +	if (shift < 0) {
 +		shift = -shift;
 +		BUG_ON(nr_left + shift > le32_to_cpu(left->header.max_entries));
 +		memcpy(key_ptr(left, nr_left),
 +		       key_ptr(right, 0),
 +		       shift * sizeof(__le64));
 +		memcpy(value_ptr(left, nr_left, value_size),
 +		       value_ptr(right, 0, value_size),
 +		       shift * value_size);
 +	} else {
 +		BUG_ON(shift > le32_to_cpu(right->header.max_entries));
 +		memcpy(key_ptr(right, 0),
 +		       key_ptr(left, nr_left - shift),
 +		       shift * sizeof(__le64));
 +		memcpy(value_ptr(right, 0, value_size),
 +		       value_ptr(left, nr_left - shift, value_size),
 +		       shift * value_size);
 +	}
 +}
 +
 +/*
 + * Delete a specific entry from a leaf node.
 + */
 +static void delete_at(struct node *n, unsigned index)
 +{
 +	unsigned nr_entries = le32_to_cpu(n->header.nr_entries);
 +	unsigned nr_to_copy = nr_entries - (index + 1);
 +	uint32_t value_size = le32_to_cpu(n->header.value_size);
 +	BUG_ON(index >= nr_entries);
 +
 +	if (nr_to_copy) {
 +		memmove(key_ptr(n, index),
 +			key_ptr(n, index + 1),
 +			nr_to_copy * sizeof(__le64));
 +
 +		memmove(value_ptr(n, index, value_size),
 +			value_ptr(n, index + 1, value_size),
 +			nr_to_copy * value_size);
 +	}
 +
 +	n->header.nr_entries = cpu_to_le32(nr_entries - 1);
 +}
 +
 +static unsigned del_threshold(struct node *n)
 +{
 +	return le32_to_cpu(n->header.max_entries) / 3;
 +}
 +
 +static unsigned merge_threshold(struct node *n)
 +{
 +	/*
 +	 * The extra one is because we know we're potentially going to
 +	 * delete an entry.
 +	 */
 +	return 2 * (le32_to_cpu(n->header.max_entries) / 3) + 1;
 +}
 +
 +struct child {
 +	unsigned index;
 +	struct dm_block *block;
 +	struct node *n;
 +};
 +
 +static struct dm_btree_value_type le64_type = {
 +	.context = NULL,
 +	.size = sizeof(__le64),
 +	.inc = NULL,
 +	.dec = NULL,
 +	.equal = NULL
 +};
 +
 +static int init_child(struct dm_btree_info *info, struct node *parent,
 +		      unsigned index, struct child *result)
 +{
 +	int r, inc;
 +	dm_block_t root;
 +
 +	result->index = index;
 +	root = value64(parent, index);
 +
 +	r = dm_tm_shadow_block(info->tm, root, &btree_node_validator,
 +			       &result->block, &inc);
 +	if (r)
 +		return r;
 +
 +	result->n = dm_block_data(result->block);
 +
 +	if (inc)
 +		inc_children(info->tm, result->n, &le64_type);
 +
 +	*((__le64 *) value_ptr(parent, index, sizeof(__le64))) =
 +		cpu_to_le64(dm_block_location(result->block));
 +
 +	return 0;
 +}
 +
 +static int exit_child(struct dm_btree_info *info, struct child *c)
 +{
 +	return dm_tm_unlock(info->tm, c->block);
 +}
 +
 +static void shift(struct node *left, struct node *right, int count)
 +{
 +	if (!count)
 +		return;
 +
 +	if (count > 0) {
 +		node_shift(right, count);
 +		node_copy(left, right, count);
 +	} else {
 +		node_copy(left, right, count);
 +		node_shift(right, count);
 +	}
 +
 +	left->header.nr_entries =
 +		cpu_to_le32(le32_to_cpu(left->header.nr_entries) - count);
 +	BUG_ON(le32_to_cpu(left->header.nr_entries) > le32_to_cpu(left->header.max_entries));
 +
 +	right->header.nr_entries =
 +		cpu_to_le32(le32_to_cpu(right->header.nr_entries) + count);
 +	BUG_ON(le32_to_cpu(right->header.nr_entries) > le32_to_cpu(right->header.max_entries));
 +}
 +
 +static void __rebalance2(struct dm_btree_info *info, struct node *parent,
 +			 struct child *l, struct child *r)
 +{
 +	struct node *left = l->n;
 +	struct node *right = r->n;
 +	uint32_t nr_left = le32_to_cpu(left->header.nr_entries);
 +	uint32_t nr_right = le32_to_cpu(right->header.nr_entries);
 +
 +	if (nr_left + nr_right <= merge_threshold(left)) {
 +		/*
 +		 * Merge
 +		 */
 +		node_copy(left, right, -nr_right);
 +		left->header.nr_entries = cpu_to_le32(nr_left + nr_right);
 +		delete_at(parent, r->index);
 +
 +		/*
 +		 * We need to decrement the right block, but not its
 +		 * children, since they're still referenced by left.
 +		 */
 +		dm_tm_dec(info->tm, dm_block_location(r->block));
 +	} else {
 +		/*
 +		 * Rebalance.
 +		 */
 +		unsigned target_left = (nr_left + nr_right) / 2;
 +		unsigned shift_ = nr_left - target_left;
 +		BUG_ON(le32_to_cpu(left->header.max_entries) <= nr_left - shift_);
 +		BUG_ON(le32_to_cpu(right->header.max_entries) <= nr_right + shift_);
 +		shift(left, right, nr_left - target_left);
 +		*key_ptr(parent, r->index) = right->keys[0];
 +	}
 +}
 +
 +/*
 + * Sets up the children at left_index and left_index + 1 of the node on
 + * top of the spine with init_child(), rebalances the pair through
 + * __rebalance2() and releases both with exit_child().
 + *
 + * Returns 0 on success, otherwise the first non-zero result reported by
 + * init_child() or exit_child().
 + */
 +static int rebalance2(struct shadow_spine *s, struct dm_btree_info *info,
 +		      unsigned left_index)
 +{
 +	struct child lhs, rhs;
 +	struct node *parent = dm_block_data(shadow_current(s));
 +	int err;
 +
 +	err = init_child(info, parent, left_index, &lhs);
 +	if (err)
 +		return err;
 +
 +	err = init_child(info, parent, left_index + 1, &rhs);
 +	if (err) {
 +		exit_child(info, &lhs);
 +		return err;
 +	}
 +
 +	__rebalance2(info, parent, &lhs, &rhs);
 +
 +	err = exit_child(info, &lhs);
 +	if (!err)
 +		return exit_child(info, &rhs);
 +
 +	/* The left child failed to exit; still release the right one. */
 +	exit_child(info, &rhs);
 +	return err;
 +}
 +
 +/*
 + * Rebalances three adjacent children of parent (l, c, r from left to
 + * right).  All three must share the same max_entries.
 + *
 + * If half of the combined entry count is below merge_threshold(center),
 + * the center node is emptied into its neighbours, removed from parent and
 + * the remaining pair is handed to __rebalance2().  Otherwise entries are
 + * redistributed around a target of one third of the total per node and
 + * parent's keys for c and r are refreshed.
 + */
 +static void __rebalance3(struct dm_btree_info *info, struct node *parent,
 +			 struct child *l, struct child *c, struct child *r)
 +{
 +	struct node *left = l->n;
 +	struct node *center = c->n;
 +	struct node *right = r->n;
 +
 +	uint32_t nr_left = le32_to_cpu(left->header.nr_entries);
 +	uint32_t nr_center = le32_to_cpu(center->header.nr_entries);
 +	uint32_t nr_right = le32_to_cpu(right->header.nr_entries);
 +	uint32_t max_entries = le32_to_cpu(left->header.max_entries);
 +
 +	unsigned target;
 +
 +	/* Equality is checked on the raw little-endian header fields. */
 +	BUG_ON(left->header.max_entries != center->header.max_entries);
 +	BUG_ON(center->header.max_entries != right->header.max_entries);
 +
 +	if (((nr_left + nr_center + nr_right) / 2) < merge_threshold(center)) {
 +		/*
 +		 * Delete center node:
 +		 *
 +		 * We dump as many entries from center as possible into
 +		 * left, then the rest in right, then rebalance2.  This
 +		 * wastes some cpu, but I want something simple atm.
 +		 */
 +		/* NB: this local shadows the shift() helper inside this branch. */
 +		unsigned shift = min(max_entries - nr_left, nr_center);
 +
 +		BUG_ON(nr_left + shift > max_entries);
 +		node_copy(left, center, -shift);
 +		left->header.nr_entries = cpu_to_le32(nr_left + shift);
 +
 +		if (shift != nr_center) {
 +			/* whatever did not fit in left goes to right */
 +			shift = nr_center - shift;
 +			BUG_ON((nr_right + shift) >= max_entries);
 +			node_shift(right, shift);
 +			node_copy(center, right, shift);
 +			right->header.nr_entries = cpu_to_le32(nr_right + shift);
 +		}
 +		*key_ptr(parent, r->index) = right->keys[0];
 +
 +		/*
 +		 * center's slot leaves parent; r->index is decremented to
 +		 * account for the removed entry that preceded it.
 +		 */
 +		delete_at(parent, c->index);
 +		r->index--;
 +
 +		dm_tm_dec(info->tm, dm_block_location(c->block));
 +		__rebalance2(info, parent, l, r);
 +
 +		return;
 +	}
 +
 +	/*
 +	 * Rebalance
 +	 */
 +	target = (nr_left + nr_center + nr_right) / 3;
 +	BUG_ON(target > max_entries);
 +
 +	/*
 +	 * Adjust the left node
 +	 *
 +	 * NOTE(review): the count is computed in unsigned arithmetic and
 +	 * wraps when nr_left < target; shift()'s parameter type is outside
 +	 * this view - confirm it is signed.
 +	 */
 +	shift(left, center, nr_left - target);
 +
 +	/*
 +	 * Adjust the right node
 +	 */
 +	shift(center, right, target - nr_right);
 +	*key_ptr(parent, c->index) = center->keys[0];
 +	*key_ptr(parent, r->index) = right->keys[0];
 +}
 +
 +/*
 + * Sets up the three consecutive children starting at left_index of the
 + * node on top of the spine, rebalances them through __rebalance3() and
 + * releases all three again.
 + *
 + * Returns 0 on success, otherwise the first non-zero result reported by
 + * init_child() or exit_child().  Children that were set up are always
 + * passed to exit_child(), even after an earlier failure.
 + */
 +static int rebalance3(struct shadow_spine *s, struct dm_btree_info *info,
 +		      unsigned left_index)
 +{
 +	struct child left, center, right;
 +	struct node *parent = dm_block_data(shadow_current(s));
 +	int err;
 +
 +	/*
 +	 * FIXME: fill out an array?
 +	 */
 +	err = init_child(info, parent, left_index, &left);
 +	if (err)
 +		return err;
 +
 +	err = init_child(info, parent, left_index + 1, &center);
 +	if (err) {
 +		exit_child(info, &left);
 +		return err;
 +	}
 +
 +	err = init_child(info, parent, left_index + 2, &right);
 +	if (err) {
 +		exit_child(info, &left);
 +		exit_child(info, &center);
 +		return err;
 +	}
 +
 +	__rebalance3(info, parent, &left, &center, &right);
 +
 +	err = exit_child(info, &left);
 +	if (err) {
 +		exit_child(info, &center);
 +		exit_child(info, &right);
 +		return err;
 +	}
 +
 +	err = exit_child(info, &center);
 +	if (err) {
 +		exit_child(info, &right);
 +		return err;
 +	}
 +
 +	return exit_child(info, &right);
 +}
 +
 +/*
 + * Read-locks block b, stores its header's nr_entries (converted to CPU
 + * byte order) in *result and unlocks the block again.
 + *
 + * Returns the error from dm_tm_read_lock() if locking fails (*result is
 + * then untouched), otherwise the result of dm_tm_unlock().
 + */
 +static int get_nr_entries(struct dm_transaction_manager *tm,
 +			  dm_block_t b, uint32_t *result)
 +{
 +	struct dm_block *blk;
 +	struct node *node;
 +	int err = dm_tm_read_lock(tm, b, &btree_node_validator, &blk);
 +
 +	if (err)
 +		return err;
 +
 +	node = dm_block_data(blk);
 +	*result = le32_to_cpu(node->header.nr_entries);
 +
 +	return dm_tm_unlock(tm, blk);
 +}
 +
 +/*
 + * Makes sure the child that a removal of key will descend into is not
 + * left under-full, merging or redistributing with its siblings if needed.
 + *
 + * Operates on the node on top of the spine:
 + *  - if that node has a single entry, the child's contents are copied
 + *    over the node itself and the child's reference is dropped;
 + *  - otherwise the child covering key is located with lower_bound() and,
 + *    if it holds no more than del_threshold(n) entries, it is rebalanced
 + *    with one sibling (rebalance2) or both (rebalance3).
 + *
 + * Returns 0 on success, -ENODATA if key lies below every key in the
 + * node, or an error from the transaction manager / rebalance helpers.
 + */
 +static int rebalance_children(struct shadow_spine *s,
 +			      struct dm_btree_info *info, uint64_t key)
 +{
 +	int i, r, has_left_sibling, has_right_sibling;
 +	uint32_t child_entries;
 +	struct node *n;
 +
 +	n = dm_block_data(shadow_current(s));
 +
 +	if (le32_to_cpu(n->header.nr_entries) == 1) {
 +		struct dm_block *child;
 +		dm_block_t b = value64(n, 0);
 +
 +		r = dm_tm_read_lock(info->tm, b, &btree_node_validator, &child);
 +		if (r)
 +			return r;
 +
 +		memcpy(n, dm_block_data(child),
 +		       dm_bm_block_size(dm_tm_get_bm(info->tm)));
 +
 +		/*
 +		 * Drop the reference while we still hold the lock: child
 +		 * must not be dereferenced (dm_block_location) once it has
 +		 * been handed back via dm_tm_unlock().
 +		 */
 +		dm_tm_dec(info->tm, dm_block_location(child));
 +		return dm_tm_unlock(info->tm, child);
 +	}
 +
 +	i = lower_bound(n, key);
 +	if (i < 0)
 +		return -ENODATA;
 +
 +	r = get_nr_entries(info->tm, value64(n, i), &child_entries);
 +	if (r)
 +		return r;
 +
 +	/* Child is comfortably full; nothing to do. */
 +	if (child_entries > del_threshold(n))
 +		return 0;
 +
 +	has_left_sibling = i > 0;
 +	has_right_sibling = i < (le32_to_cpu(n->header.nr_entries) - 1);
 +
 +	if (!has_left_sibling)
 +		r = rebalance2(s, info, i);
 +
 +	else if (!has_right_sibling)
 +		r = rebalance2(s, info, i - 1);
 +
 +	else
 +		r = rebalance3(s, info, i - 1);
 +
 +	return r;
 +}
 +
 +/*
 + * Looks key up in leaf node n.  On an exact match stores the entry's
 + * position in *index and returns 0; otherwise returns -ENODATA and
 + * leaves *index untouched.
 + */
 +static int do_leaf(struct node *n, uint64_t key, unsigned *index)
 +{
 +	int pos = lower_bound(n, key);
 +
 +	if (pos < 0)
 +		return -ENODATA;
 +
 +	if (pos >= le32_to_cpu(n->header.nr_entries))
 +		return -ENODATA;
 +
 +	if (le64_to_cpu(n->keys[pos]) != key)
 +		return -ENODATA;
 +
 +	*index = pos;
 +	return 0;
 +}
 +
 +/*
 + * Prepares for removal from one level of the hierarchy.  The caller must
 + * call delete_at() to remove the entry at index.
 + *
 + * Walks down from root, shadowing every node on the way with
 + * shadow_step() and calling rebalance_children() on each non-leaf node
 + * before descending.  When a leaf is reached, do_leaf() decides the
 + * result: 0 with *index set to the key's position, or -ENODATA.
 + *
 + * *index is also read on entry: it is the slot in the parent (if the
 + * spine already has one) that gets patched with the first shadow's
 + * location.
 + *
 + * Returns a negative value from shadow_step(), a non-zero value from
 + * rebalance_children(), or do_leaf()'s result.
 + */
 +static int remove_raw(struct shadow_spine *s, struct dm_btree_info *info,
 +		      struct dm_btree_value_type *vt, dm_block_t root,
 +		      uint64_t key, unsigned *index)
 +{
 +	int i = *index, r;
 +	struct node *n;
 +
 +	for (;;) {
 +		r = shadow_step(s, root, vt);
 +		if (r < 0)
 +			break;
 +
 +		/*
 +		 * We have to patch up the parent node, ugly, but I don't
 +		 * see a way to do this automatically as part of the spine
 +		 * op.
 +		 */
 +		if (shadow_has_parent(s)) {
 +			__le64 location = cpu_to_le64(dm_block_location(shadow_current(s)));
 +			memcpy(value_ptr(dm_block_data(shadow_parent(s)), i, sizeof(__le64)),
 +			       &location, sizeof(__le64));
 +		}
 +
 +		n = dm_block_data(shadow_current(s));
 +
 +		if (le32_to_cpu(n->header.flags) & LEAF_NODE)
 +			return do_leaf(n, key, index);
 +
 +		r = rebalance_children(s, info, key);
 +		if (r)
 +			break;
 +
 +		/*
 +		 * rebalance_children() may have copied a child over this
 +		 * node, so it has to be re-examined: it can now be a leaf.
 +		 */
 +		n = dm_block_data(shadow_current(s));
 +		if (le32_to_cpu(n->header.flags) & LEAF_NODE)
 +			return do_leaf(n, key, index);
 +
 +		i = lower_bound(n, key);
 +
 +		/*
 +		 * We know the key is present, or else
 +		 * rebalance_children would have returned
 +		 * -ENODATA
 +		 */
 +		root = value64(n, i);
 +	}
 +
 +	return r;
 +}
 +
 +/*
 + * Removes the entry addressed by keys[0 .. info->levels - 1] from the
 + * (possibly multi-level) btree rooted at root.
 + *
 + * Each level is processed with remove_raw(); non-final levels use
 + * le64_type because their values are the roots of the next level's
 + * trees.  At the final level the value's dec callback (if any) is
 + * invoked and the entry is removed with delete_at().
 + *
 + * *new_root is always set from the shadow spine, including when an error
 + * is returned.
 + *
 + * Returns 0 on success or the negative error from remove_raw().
 + */
 +int dm_btree_remove(struct dm_btree_info *info, dm_block_t root,
 +		    uint64_t *keys, dm_block_t *new_root)
 +{
 +	unsigned level, last_level = info->levels - 1;
 +	int index = 0, r = 0;
 +	struct shadow_spine spine;
 +	struct node *n;
 +
 +	init_shadow_spine(&spine, info);
 +	for (level = 0; level < info->levels; level++) {
 +		r = remove_raw(&spine, info,
 +			       (level == last_level ?
 +				&info->value_type : &le64_type),
 +			       root, keys[level], (unsigned *)&index);
 +		if (r < 0)
 +			break;
 +
 +		n = dm_block_data(shadow_current(&spine));
 +		if (level != last_level) {
 +			/* the value found is the root of the next level's tree */
 +			root = value64(n, index);
 +			continue;
 +		}
 +
 +		BUG_ON(index < 0 || index >= le32_to_cpu(n->header.nr_entries));
 +
 +		/* let the value type release whatever the value refers to */
 +		if (info->value_type.dec)
 +			info->value_type.dec(info->value_type.context,
 +					     value_ptr(n, index, info->value_type.size));
 +
 +		delete_at(n, index);
 +	}
 +
 +	*new_root = shadow_root(&spine);
 +	exit_shadow_spine(&spine);
 +
 +	return r;
 +}
 +EXPORT_SYMBOL_GPL(dm_btree_remove);
diff --cc drivers/md/persistent-data/dm-btree.c
index e0638be53ea4,000000000000..bd1e7ffbe26c
mode 100644,000000..100644
--- a/drivers/md/persistent-data/dm-btree.c
+++ b/drivers/md/persistent-data/dm-btree.c
@@@ -1,805 -1,0 +1,805 @@@
 +/*
 + * Copyright (C) 2011 Red Hat, Inc.
 + *
 + * This file is released under the GPL.
 + */
 +
 +#include "dm-btree-internal.h"
 +#include "dm-space-map.h"
 +#include "dm-transaction-manager.h"
 +
- #include <linux/module.h>
++#include <linux/export.h>
 +#include <linux/device-mapper.h>
 +
 +#define DM_MSG_PREFIX "btree"
 +
 +/*----------------------------------------------------------------
 + * Array manipulation
 + *--------------------------------------------------------------*/
 +/*
 + * Copies len bytes from src to dest, then calls __dm_unbless_for_disk()
 + * on src.
 + *
 + * NOTE(review): __dm_written_to_disk / __dm_unbless_for_disk are defined
 + * outside this view; they look like static-checker annotations tracking
 + * values destined for disk - confirm.
 + */
 +static void memcpy_disk(void *dest, const void *src, size_t len)
 +	__dm_written_to_disk(src)
 +{
 +	memcpy(dest, src, len);
 +	__dm_unbless_for_disk(src);
 +}
 +
 +/*
 + * Inserts one element of elt_size bytes at position index into the array
 + * starting at base, which currently holds nr_elts elements.  Elements at
 + * index and beyond are moved up by one slot first; the caller must make
 + * sure the array has room for the extra element.
 + */
 +static void array_insert(void *base, size_t elt_size, unsigned nr_elts,
 +			 unsigned index, void *elt)
 +	__dm_written_to_disk(elt)
 +{
 +	void *slot = base + (elt_size * index);
 +
 +	/* Open a gap unless we are appending at the end. */
 +	if (index < nr_elts)
 +		memmove(slot + elt_size, slot, (nr_elts - index) * elt_size);
 +
 +	memcpy_disk(slot, elt, elt_size);
 +}
 +
 +/*----------------------------------------------------------------*/
 +
 +/*
 + * Binary search over the keys of n; assumes that no two keys are the same.
 + *
 + * Returns the index of key when it is present.  Otherwise returns the
 + * index of the first key greater than key when want_hi is set (this is
 + * nr_entries when there is none), or the index of the last key smaller
 + * than key when it is clear (-1 when there is none).
 + */
 +static int bsearch(struct node *n, uint64_t key, int want_hi)
 +{
 +	int lo = -1;
 +	int hi = le32_to_cpu(n->header.nr_entries);
 +
 +	while (hi - lo > 1) {
 +		int mid = lo + ((hi - lo) / 2);
 +		uint64_t probe = le64_to_cpu(n->keys[mid]);
 +
 +		if (probe < key)
 +			lo = mid;
 +		else if (probe > key)
 +			hi = mid;
 +		else
 +			return mid;
 +	}
 +
 +	if (want_hi)
 +		return hi;
 +
 +	return lo;
 +}
 +
 +/*
 + * Returns the index of key in n if present; otherwise the index of the
 + * largest key that is smaller than key, or -1 if every key in n is
 + * larger (or n is empty).
 + */
 +int lower_bound(struct node *n, uint64_t key)
 +{
 +	return bsearch(n, key, 0);
 +}
 +
 +/*
 + * Takes an extra reference on everything node n points at.
 + *
 + * For an internal node every child block is incremented through
 + * dm_tm_inc().  For any other node the value type's inc callback, when
 + * one is provided, is invoked on each stored value.
 + */
 +void inc_children(struct dm_transaction_manager *tm, struct node *n,
 +		  struct dm_btree_value_type *vt)
 +{
 +	unsigned idx;
 +	uint32_t count = le32_to_cpu(n->header.nr_entries);
 +
 +	if (le32_to_cpu(n->header.flags) & INTERNAL_NODE) {
 +		for (idx = 0; idx < count; idx++)
 +			dm_tm_inc(tm, value64(n, idx));
 +		return;
 +	}
 +
 +	if (!vt->inc)
 +		return;
 +
 +	for (idx = 0; idx < count; idx++)
 +		vt->inc(vt->context, value_ptr(n, idx, vt->size));
 +}
 +
 +/*
 + * Inserts the pair (key, value) at position index of node, moving later
 + * entries up by one and bumping the node's entry count.
 + *
 + * value_size is the size in bytes of one value slot in this node.
 + *
 + * Returns 0 on success.  Returns -ENOMEM, after logging an error and
 + * unblessing value, when index lies beyond the current entries or the
 + * node's max_entries.
 + */
 +static int insert_at(size_t value_size, struct node *node, unsigned index,
 +		      uint64_t key, void *value)
 +		      __dm_written_to_disk(value)
 +{
 +	uint32_t used = le32_to_cpu(node->header.nr_entries);
 +	uint32_t capacity = le32_to_cpu(node->header.max_entries);
 +	__le64 key_le = cpu_to_le64(key);
 +
 +	if (index > used || index >= capacity) {
 +		DMERR("too many entries in btree node for insert");
 +		__dm_unbless_for_disk(value);
 +		return -ENOMEM;
 +	}
 +
 +	__dm_bless_for_disk(&key_le);
 +
 +	/* Keys and values live in two parallel arrays. */
 +	array_insert(node->keys, sizeof(*node->keys), used, index, &key_le);
 +	array_insert(value_base(node), value_size, used, index, value);
 +	node->header.nr_entries = cpu_to_le32(used + 1);
 +
 +	return 0;
 +}
 +
 +/*----------------------------------------------------------------*/
 +
 +/*
 + * Computes how many (key, value) entries a node may hold for the given
 + * value and block sizes.
 + *
 + * We want 3n entries (for some n).  This works more nicely for repeated
 + * insert remove loops than (2n + 1).
 + */
 +static uint32_t calc_max_entries(size_t value_size, size_t block_size)
 +{
 +	size_t entry_bytes = sizeof(uint64_t) + value_size; /* key + value */
 +	size_t payload = block_size - sizeof(struct node_header);
 +	uint32_t fit = payload / entry_bytes;
 +
 +	/* round down to a multiple of three */
 +	return 3 * (fit / 3);
 +}
 +
 +/*
 + * Creates a new, empty btree consisting of a single leaf node and stores
 + * the location of that node in *root.
 + *
 + * The node is zeroed and its header initialised with no entries, a
 + * max_entries derived from the block size and info->value_type.size, and
 + * that value size.
 + *
 + * Returns a negative error from new_block() (in which case *root is not
 + * written), otherwise the result of unlock_block().
 + */
 +int dm_btree_empty(struct dm_btree_info *info, dm_block_t *root)
 +{
 +	int r;
 +	struct dm_block *b;
 +	struct node *n;
 +	size_t block_size;
 +	uint32_t max_entries;
 +
 +	r = new_block(info, &b);
 +	if (r < 0)
 +		return r;
 +
 +	block_size = dm_bm_block_size(dm_tm_get_bm(info->tm));
 +	max_entries = calc_max_entries(info->value_type.size, block_size);
 +
 +	n = dm_block_data(b);
 +	memset(n, 0, block_size);
 +	n->header.flags = cpu_to_le32(LEAF_NODE);
 +	n->header.nr_entries = cpu_to_le32(0);
 +	n->header.max_entries = cpu_to_le32(max_entries);
 +	n->header.value_size = cpu_to_le32(info->value_type.size);
 +
 +	/* record the location before the block is handed back */
 +	*root = dm_block_location(b);
 +	return unlock_block(info, b);
 +}
 +EXPORT_SYMBOL_GPL(dm_btree_empty);
 +
 +/*----------------------------------------------------------------*/
 +
 +/*
 + * Deletion uses a recursive algorithm, since we have limited stack space
 + * we explicitly manage our own stack on the heap.
 + */
 +/* Maximum number of frames the deletion stack can hold at once. */
 +#define MAX_SPINE_DEPTH 64
 +
 +/* One node currently being walked by the deletion code. */
 +struct frame {
 +	struct dm_block *b;		/* read-locked block holding the node */
 +	struct node *n;			/* dm_block_data(b) */
 +	unsigned level;			/* level supplied to push_frame() */
 +	unsigned nr_children;		/* nr_entries of n when it was pushed */
 +	unsigned current_child;		/* index of the next entry to visit */
 +};
 +
 +/* Explicit stack of frames; top is -1 when the stack is empty. */
 +struct del_stack {
 +	struct dm_transaction_manager *tm;
 +	int top;
 +	struct frame spine[MAX_SPINE_DEPTH];
 +};
 +
 +/*
 + * Stores a pointer to the topmost frame of s in *f and returns 0.
 + * Logs an error and returns -EINVAL, leaving *f untouched, when the
 + * stack is empty.
 + */
 +static int top_frame(struct del_stack *s, struct frame **f)
 +{
 +	if (s->top >= 0) {
 +		*f = &s->spine[s->top];
 +		return 0;
 +	}
 +
 +	DMERR("btree deletion stack empty");
 +	return -EINVAL;
 +}
 +
 +/* Returns non-zero while the deletion stack still holds at least one frame. */
 +static int unprocessed_frames(struct del_stack *s)
 +{
 +	return s->top >= 0;
 +}
 +
 +/*
 + * Schedules block b for deletion processing.
 + *
 + * If b is referenced more than once, only its reference count is
 + * decremented and no frame is pushed.  Otherwise b is read-locked and a
 + * new frame describing it is pushed onto the stack, tagged with level.
 + *
 + * Returns 0 on success, -ENOMEM if the stack is full, or an error from
 + * dm_tm_ref() / dm_tm_read_lock().
 + */
 +static int push_frame(struct del_stack *s, dm_block_t b, unsigned level)
 +{
 +	int r;
 +	uint32_t ref_count;
 +
 +	if (s->top >= MAX_SPINE_DEPTH - 1) {
 +		DMERR("btree deletion stack out of memory");
 +		return -ENOMEM;
 +	}
 +
 +	r = dm_tm_ref(s->tm, b, &ref_count);
 +	if (r)
 +		return r;
 +
 +	if (ref_count > 1)
 +		/*
 +		 * This is a shared node, so we can just decrement its
 +		 * reference counter and leave the children.
 +		 */
 +		dm_tm_dec(s->tm, b);
 +
 +	else {
 +		struct frame *f = s->spine + ++s->top;
 +
 +		r = dm_tm_read_lock(s->tm, b, &btree_node_validator, &f->b);
 +		if (r) {
 +			/* undo the push; the frame was never filled in */
 +			s->top--;
 +			return r;
 +		}
 +
 +		f->n = dm_block_data(f->b);
 +		f->level = level;
 +		f->nr_children = le32_to_cpu(f->n->header.nr_entries);
 +		f->current_child = 0;
 +	}
 +
 +	return 0;
 +}
 +
 +/*
 + * Pops the topmost frame: drops one reference on its block and releases
 + * the lock taken by push_frame().  The caller must ensure the stack is
 + * not empty.  The result of dm_tm_unlock() is not checked.
 + */
 +static void pop_frame(struct del_stack *s)
 +{
 +	struct frame *f = s->spine + s->top--;
 +
 +	dm_tm_dec(s->tm, dm_block_location(f->b));
 +	dm_tm_unlock(s->tm, f->b);
 +}
 +
 +/*
 + * Deletes the whole (possibly multi-level) btree rooted at root,
 + * dropping the references it holds.
 + *
 + * The tree is walked depth-first using an explicit, heap-allocated stack
 + * of frames.  Shared nodes (reference count > 1) are merely decremented
 + * by push_frame() and not descended into.  For leaves of the final level
 + * the value type's dec callback, if any, is invoked on every value.
 + *
 + * Levels are numbered from 0, so the top-level tree is level 0 and
 + * leaves of level info->levels - 1 hold the user's values; leaves of any
 + * earlier level hold the roots of the next level's trees.
 + *
 + * Returns 0 on success, -ENOMEM if the stack cannot be allocated or
 + * overflows, or an error propagated from the transaction manager.
 + */
 +int dm_btree_del(struct dm_btree_info *info, dm_block_t root)
 +{
 +	int r;
 +	struct del_stack *s;
 +
 +	s = kmalloc(sizeof(*s), GFP_KERNEL);
 +	if (!s)
 +		return -ENOMEM;
 +	s->tm = info->tm;
 +	s->top = -1;
 +
 +	/* The root belongs to the first level, i.e. level 0. */
 +	r = push_frame(s, root, 0);
 +	if (r)
 +		goto out;
 +
 +	while (unprocessed_frames(s)) {
 +		uint32_t flags;
 +		struct frame *f;
 +		dm_block_t b;
 +
 +		r = top_frame(s, &f);
 +		if (r)
 +			goto out;
 +
 +		if (f->current_child >= f->nr_children) {
 +			pop_frame(s);
 +			continue;
 +		}
 +
 +		flags = le32_to_cpu(f->n->header.flags);
 +		if (flags & INTERNAL_NODE) {
 +			/* child node of the same tree: same level */
 +			b = value64(f->n, f->current_child);
 +			f->current_child++;
 +			r = push_frame(s, b, f->level);
 +			if (r)
 +				goto out;
 +
 +		} else if (f->level != (info->levels - 1)) {
 +			/* leaf of a non-final level: values are sub-tree roots */
 +			b = value64(f->n, f->current_child);
 +			f->current_child++;
 +			r = push_frame(s, b, f->level + 1);
 +			if (r)
 +				goto out;
 +
 +		} else {
 +			/* leaf of the final level: release the stored values */
 +			if (info->value_type.dec) {
 +				unsigned i;
 +
 +				for (i = 0; i < f->nr_children; i++)
 +					info->value_type.dec(info->value_type.context,
 +							     value_ptr(f->n, i, info->value_type.size));
 +			}
 +			f->current_child = f->nr_children;
 +		}
 +	}
 +
 +out:
 +	if (r) {
 +		/*
 +		 * Bailing out early: release the read locks still held by
 +		 * the frames left on the stack before it is freed.
 +		 */
 +		while (unprocessed_frames(s)) {
 +			struct frame *f = s->spine + s->top--;
 +
 +			dm_tm_unlock(s->tm, f->b);
 +		}
 +	}
 +	kfree(s);
 +	return r;
 +}
 +EXPORT_SYMBOL_GPL(dm_btree_del);
 +
 +/*----------------------------------------------------------------*/
 +
 +/*
 + * Searches a single btree, starting at block, for key.
 + *
 + * Each node on the way down is stepped into with ro_step() and searched
 + * with search_fn.  Internal nodes supply the next block to visit; the
 + * walk ends at the first leaf.  On success the key actually found is
 + * stored in *result_key (it may differ from key, depending on search_fn)
 + * and value_size bytes of the matching value are copied to v.
 + *
 + * Returns 0 on success, a negative error from ro_step(), or -ENODATA if
 + * search_fn yields an index outside the node's entries.
 + */
 +static int btree_lookup_raw(struct ro_spine *s, dm_block_t block, uint64_t key,
 +			    int (*search_fn)(struct node *, uint64_t),
 +			    uint64_t *result_key, void *v, size_t value_size)
 +{
 +	int i, r;
 +	uint32_t flags, nr_entries;
 +
 +	do {
 +		r = ro_step(s, block);
 +		if (r < 0)
 +			return r;
 +
 +		i = search_fn(ro_node(s), key);
 +
 +		flags = le32_to_cpu(ro_node(s)->header.flags);
 +		nr_entries = le32_to_cpu(ro_node(s)->header.nr_entries);
 +		if (i < 0 || i >= nr_entries)
 +			return -ENODATA;
 +
 +		if (flags & INTERNAL_NODE)
 +			block = value64(ro_node(s), i);
 +
 +	} while (!(flags & LEAF_NODE));
 +
 +	/* here the current node is a leaf and i indexes the entry found */
 +	*result_key = le64_to_cpu(ro_node(s)->keys[i]);
 +	memcpy(v, value_ptr(ro_node(s), i, value_size), value_size);
 +
 +	return 0;
 +}
 +
 +/*
 + * Looks up the value addressed by keys[0 .. info->levels - 1] in the
 + * (possibly multi-level) btree rooted at root.
 + *
 + * Every level must contain an exact match for its key.  Values of
 + * non-final levels are the roots of the next level's trees; the value of
 + * the final level (info->value_type.size bytes, in on-disk format) is
 + * copied to value_le.
 + *
 + * Returns 0 on success, -ENODATA if some level has no exact match (or
 + * info->levels is 0), or a negative error from the lookup.
 + */
 +int dm_btree_lookup(struct dm_btree_info *info, dm_block_t root,
 +		    uint64_t *keys, void *value_le)
 +{
 +	unsigned level, last_level = info->levels - 1;
 +	int r = -ENODATA;
 +	uint64_t rkey;
 +	__le64 internal_value_le;
 +	struct ro_spine spine;
 +
 +	init_ro_spine(&spine, info);
 +	for (level = 0; level < info->levels; level++) {
 +		size_t size;
 +		void *value_p;
 +
 +		if (level == last_level) {
 +			value_p = value_le;
 +			size = info->value_type.size;
 +
 +		} else {
 +			value_p = &internal_value_le;
 +			size = sizeof(uint64_t);
 +		}
 +
 +		r = btree_lookup_raw(&spine, root, keys[level],
 +				     lower_bound, &rkey,
 +				     value_p, size);
 +		if (r)
 +			break;
 +
 +		/* lower_bound may have settled on a smaller key */
 +		if (rkey != keys[level]) {
 +			r = -ENODATA;
 +			break;
 +		}
 +
 +		/*
 +		 * Only non-final levels fill in internal_value_le; don't
 +		 * read it after the final lookup, where it may never have
 +		 * been written.
 +		 */
 +		if (level != last_level)
 +			root = le64_to_cpu(internal_value_le);
 +	}
 +	exit_ro_spine(&spine);
 +
 +	return r;
 +}
 +EXPORT_SYMBOL_GPL(dm_btree_lookup);
 +
 +/*
 + * Splits a node by creating a sibling node and shifting half the nodes
 + * contents across.  Assumes there is a parent node, and it has room for
 + * another child.
 + *
 + * Before:
 + *	  +--------+
 + *	  | Parent |
 + *	  +--------+
 + *	     |
 + *	     v
 + *	+----------+
 + *	| A ++++++ |
 + *	+----------+
 + *
 + *
 + * After:
 + *		+--------+
 + *		| Parent |
 + *		+--------+
 + *		  |	|
 + *		  v	+------+
 + *	    +---------+	       |
 + *	    | A* +++  |	       v
 + *	    +---------+	  +-------+
 + *			  | B +++ |
 + *			  +-------+
 + *
 + * Where A* is a shadow of A.
 + */
 +static int btree_split_sibling(struct shadow_spine *s, dm_block_t root,
 +			       unsigned parent_index, uint64_t key)
 +{
 +	/*
 +	 * parent_index is the slot in the parent that refers to the node
 +	 * being split; the new sibling is inserted right after it.  On
 +	 * success the spine's second node becomes whichever half covers
 +	 * key and the other half is unlocked.
 +	 *
 +	 * Returns 0 on success or an error from new_block() / insert_at().
 +	 */
 +	int r;
 +	size_t size;
 +	unsigned nr_left, nr_right;
 +	struct dm_block *left, *right, *parent;
 +	struct node *ln, *rn, *pn;
 +	__le64 location;
 +
 +	left = shadow_current(s);
 +
 +	r = new_block(s->info, &right);
 +	if (r < 0)
 +		return r;
 +
 +	ln = dm_block_data(left);
 +	rn = dm_block_data(right);
 +
 +	/* left keeps the lower half, right receives the rest */
 +	nr_left = le32_to_cpu(ln->header.nr_entries) / 2;
 +	nr_right = le32_to_cpu(ln->header.nr_entries) - nr_left;
 +
 +	ln->header.nr_entries = cpu_to_le32(nr_left);
 +
 +	rn->header.flags = ln->header.flags;
 +	rn->header.nr_entries = cpu_to_le32(nr_right);
 +	rn->header.max_entries = ln->header.max_entries;
 +	rn->header.value_size = ln->header.value_size;
 +	memcpy(rn->keys, ln->keys + nr_left, nr_right * sizeof(rn->keys[0]));
 +
 +	/* internal nodes store 64-bit block locations, leaves store values */
 +	size = le32_to_cpu(ln->header.flags) & INTERNAL_NODE ?
 +		sizeof(uint64_t) : s->info->value_type.size;
 +	memcpy(value_ptr(rn, 0, size), value_ptr(ln, nr_left, size),
 +	       size * nr_right);
 +
 +	/*
 +	 * Patch up the parent
 +	 */
 +	parent = shadow_parent(s);
 +
 +	pn = dm_block_data(parent);
 +	location = cpu_to_le64(dm_block_location(left));
 +	__dm_bless_for_disk(&location);
 +	memcpy_disk(value_ptr(pn, parent_index, sizeof(__le64)),
 +		    &location, sizeof(__le64));
 +
 +	location = cpu_to_le64(dm_block_location(right));
 +	__dm_bless_for_disk(&location);
 +
 +	r = insert_at(sizeof(__le64), pn, parent_index + 1,
 +		      le64_to_cpu(rn->keys[0]), &location);
 +	if (r) {
 +		/* don't leave the freshly allocated sibling locked */
 +		unlock_block(s->info, right);
 +		return r;
 +	}
 +
 +	/* continue the descent in the half that covers key */
 +	if (key < le64_to_cpu(rn->keys[0])) {
 +		unlock_block(s->info, right);
 +		s->nodes[1] = left;
 +	} else {
 +		unlock_block(s->info, left);
 +		s->nodes[1] = right;
 +	}
 +
 +	return 0;
 +}
 +
 +/*
 + * Splits a node by creating two new children beneath the given node.
 + *
 + * Before:
 + *	  +----------+
 + *	  | A ++++++ |
 + *	  +----------+
 + *
 + *
 + * After:
 + *	+------------+
 + *	| A (shadow) |
 + *	+------------+
 + *	    |	|
 + *   +------+	+----+
 + *   |		     |
 + *   v		     v
 + * +-------+	 +-------+
 + * | B +++ |	 | C +++ |
 + * +-------+	 +-------+
 + */
 +static int btree_split_beneath(struct shadow_spine *s, uint64_t key)
 +{
 +	/*
 +	 * The node on top of the spine keeps its location but is turned
 +	 * into an internal node with exactly two children, which receive
 +	 * its former entries.  Afterwards the spine holds that node plus
 +	 * whichever child covers key; the other child is unlocked.
 +	 *
 +	 * Returns 0 on success or a negative error from new_block().
 +	 */
 +	int r;
 +	size_t size;
 +	unsigned nr_left, nr_right;
 +	struct dm_block *left, *right, *new_parent;
 +	struct node *pn, *ln, *rn;
 +	__le64 val;
 +
 +	new_parent = shadow_current(s);
 +
 +	r = new_block(s->info, &left);
 +	if (r < 0)
 +		return r;
 +
 +	r = new_block(s->info, &right);
 +	if (r < 0) {
 +		/* don't leave the first new block locked */
 +		unlock_block(s->info, left);
 +		return r;
 +	}
 +
 +	pn = dm_block_data(new_parent);
 +	ln = dm_block_data(left);
 +	rn = dm_block_data(right);
 +
 +	/* lower half goes to left, the rest to right */
 +	nr_left = le32_to_cpu(pn->header.nr_entries) / 2;
 +	nr_right = le32_to_cpu(pn->header.nr_entries) - nr_left;
 +
 +	ln->header.flags = pn->header.flags;
 +	ln->header.nr_entries = cpu_to_le32(nr_left);
 +	ln->header.max_entries = pn->header.max_entries;
 +	ln->header.value_size = pn->header.value_size;
 +
 +	rn->header.flags = pn->header.flags;
 +	rn->header.nr_entries = cpu_to_le32(nr_right);
 +	rn->header.max_entries = pn->header.max_entries;
 +	rn->header.value_size = pn->header.value_size;
 +
 +	memcpy(ln->keys, pn->keys, nr_left * sizeof(pn->keys[0]));
 +	memcpy(rn->keys, pn->keys + nr_left, nr_right * sizeof(pn->keys[0]));
 +
 +	/* internal nodes store 64-bit block locations, leaves store values */
 +	size = le32_to_cpu(pn->header.flags) & INTERNAL_NODE ?
 +		sizeof(__le64) : s->info->value_type.size;
 +	memcpy(value_ptr(ln, 0, size), value_ptr(pn, 0, size), nr_left * size);
 +	memcpy(value_ptr(rn, 0, size), value_ptr(pn, nr_left, size),
 +	       nr_right * size);
 +
 +	/* new_parent should just point to l and r now */
 +	pn->header.flags = cpu_to_le32(INTERNAL_NODE);
 +	pn->header.nr_entries = cpu_to_le32(2);
 +	pn->header.max_entries = cpu_to_le32(
 +		calc_max_entries(sizeof(__le64),
 +				 dm_bm_block_size(
 +					 dm_tm_get_bm(s->info->tm))));
 +	pn->header.value_size = cpu_to_le32(sizeof(__le64));
 +
 +	val = cpu_to_le64(dm_block_location(left));
 +	__dm_bless_for_disk(&val);
 +	pn->keys[0] = ln->keys[0];
 +	memcpy_disk(value_ptr(pn, 0, sizeof(__le64)), &val, sizeof(__le64));
 +
 +	val = cpu_to_le64(dm_block_location(right));
 +	__dm_bless_for_disk(&val);
 +	pn->keys[1] = rn->keys[0];
 +	memcpy_disk(value_ptr(pn, 1, sizeof(__le64)), &val, sizeof(__le64));
 +
 +	/*
 +	 * rejig the spine.  This is ugly, since it knows too
 +	 * much about the spine
 +	 */
 +	if (s->nodes[0] != new_parent) {
 +		unlock_block(s->info, s->nodes[0]);
 +		s->nodes[0] = new_parent;
 +	}
 +	if (key < le64_to_cpu(rn->keys[0])) {
 +		unlock_block(s->info, right);
 +		s->nodes[1] = left;
 +	} else {
 +		unlock_block(s->info, left);
 +		s->nodes[1] = right;
 +	}
 +	s->count = 2;
 +
 +	return 0;
 +}
 +
 +/*
 + * Walks one btree from root down to the leaf where key belongs,
 + * shadowing each node on the way and splitting any node that is already
 + * full, so that the leaf reached has room for an insertion.
 + *
 + * *index is read on entry as the slot in an existing spine parent to be
 + * patched with the first shadow's location.  On return it holds either
 + * the position of key in the leaf (when key is already present) or the
 + * position at which key should be inserted.  The leaf is left on top of
 + * the spine.
 + *
 + * Returns 0 on success or a negative error from shadow_step() or the
 + * split helpers.
 + */
 +static int btree_insert_raw(struct shadow_spine *s, dm_block_t root,
 +			    struct dm_btree_value_type *vt,
 +			    uint64_t key, unsigned *index)
 +{
 +	int r, i = *index, top = 1;
 +	struct node *node;
 +
 +	for (;;) {
 +		r = shadow_step(s, root, vt);
 +		if (r < 0)
 +			return r;
 +
 +		node = dm_block_data(shadow_current(s));
 +
 +		/*
 +		 * We have to patch up the parent node, ugly, but I don't
 +		 * see a way to do this automatically as part of the spine
 +		 * op.
 +		 */
 +		if (shadow_has_parent(s) && i >= 0) { /* FIXME: second clause unness. */
 +			__le64 location = cpu_to_le64(dm_block_location(shadow_current(s)));
 +
 +			__dm_bless_for_disk(&location);
 +			memcpy_disk(value_ptr(dm_block_data(shadow_parent(s)), i, sizeof(uint64_t)),
 +				    &location, sizeof(__le64));
 +		}
 +
 +		node = dm_block_data(shadow_current(s));
 +
 +		/* both fields are little-endian, so they compare directly */
 +		if (node->header.nr_entries == node->header.max_entries) {
 +			/* the first node visited is split in place, others sideways */
 +			if (top)
 +				r = btree_split_beneath(s, key);
 +			else
 +				r = btree_split_sibling(s, root, i, key);
 +
 +			if (r < 0)
 +				return r;
 +		}
 +
 +		/* a split may have changed which node is current */
 +		node = dm_block_data(shadow_current(s));
 +
 +		i = lower_bound(node, key);
 +
 +		if (le32_to_cpu(node->header.flags) & LEAF_NODE)
 +			break;
 +
 +		if (i < 0) {
 +			/* change the bounds on the lowest key */
 +			node->keys[0] = cpu_to_le64(key);
 +			i = 0;
 +		}
 +
 +		root = value64(node, i);
 +		top = 0;
 +	}
 +
 +	/* no exact match: the new entry goes just after the lower bound */
 +	if (i < 0 || le64_to_cpu(node->keys[i]) != key)
 +		i++;
 +
 +	*index = i;
 +	return 0;
 +}
 +
 +/*
 + * Common implementation of dm_btree_insert() and
 + * dm_btree_insert_notify().
 + *
 + * Inserts value under keys[0 .. info->levels - 1] into the (possibly
 + * multi-level) btree rooted at root.  For every non-final level a new
 + * empty sub-tree is created if the key is not present yet.  At the final
 + * level the value is either inserted or, when the key already exists,
 + * written over the old value (after calling the value type's dec
 + * callback on the old value unless the equal callback reports the two
 + * as equal).
 + *
 + * On success *new_root receives the root of the shadowed tree and, if
 + * inserted is non-NULL, *inserted is set to 1 for a fresh key and 0 for
 + * an overwrite.
 + *
 + * Returns 0 on success or an error from the helpers; on error *new_root
 + * is not written.
 + */
 +static int insert(struct dm_btree_info *info, dm_block_t root,
 +		  uint64_t *keys, void *value, dm_block_t *new_root,
 +		  int *inserted)
 +		  __dm_written_to_disk(value)
 +{
 +	int r, need_insert;
 +	unsigned level, index = -1, last_level = info->levels - 1;
 +	dm_block_t block = root;
 +	struct shadow_spine spine;
 +	struct node *n;
 +	struct dm_btree_value_type le64_type;
 +
 +	/* value type used for non-final levels: plain 64-bit block locations */
 +	le64_type.context = NULL;
 +	le64_type.size = sizeof(__le64);
 +	le64_type.inc = NULL;
 +	le64_type.dec = NULL;
 +	le64_type.equal = NULL;
 +
 +	init_shadow_spine(&spine, info);
 +
 +	for (level = 0; level < (info->levels - 1); level++) {
 +		r = btree_insert_raw(&spine, block, &le64_type, keys[level], &index);
 +		if (r < 0)
 +			goto bad;
 +
 +		n = dm_block_data(shadow_current(&spine));
 +		need_insert = ((index >= le32_to_cpu(n->header.nr_entries)) ||
 +			       (le64_to_cpu(n->keys[index]) != keys[level]));
 +
 +		if (need_insert) {
 +			/* no sub-tree for this key yet: create an empty one */
 +			dm_block_t new_tree;
 +			__le64 new_le;
 +
 +			r = dm_btree_empty(info, &new_tree);
 +			if (r < 0)
 +				goto bad;
 +
 +			new_le = cpu_to_le64(new_tree);
 +			__dm_bless_for_disk(&new_le);
 +
 +			r = insert_at(sizeof(uint64_t), n, index,
 +				      keys[level], &new_le);
 +			if (r)
 +				goto bad;
 +		}
 +
 +		if (level < last_level)
 +			block = value64(n, index);
 +	}
 +
 +	/* level now equals last_level: insert into the final tree */
 +	r = btree_insert_raw(&spine, block, &info->value_type,
 +			     keys[level], &index);
 +	if (r < 0)
 +		goto bad;
 +
 +	n = dm_block_data(shadow_current(&spine));
 +	need_insert = ((index >= le32_to_cpu(n->header.nr_entries)) ||
 +		       (le64_to_cpu(n->keys[index]) != keys[level]));
 +
 +	if (need_insert) {
 +		if (inserted)
 +			*inserted = 1;
 +
 +		r = insert_at(info->value_type.size, n, index,
 +			      keys[level], value);
 +		/* insert_at() already unblessed value on failure */
 +		if (r)
 +			goto bad_unblessed;
 +	} else {
 +		if (inserted)
 +			*inserted = 0;
 +
 +		/* release the old value unless it equals the new one */
 +		if (info->value_type.dec &&
 +		    (!info->value_type.equal ||
 +		     !info->value_type.equal(
 +			     info->value_type.context,
 +			     value_ptr(n, index, info->value_type.size),
 +			     value))) {
 +			info->value_type.dec(info->value_type.context,
 +					     value_ptr(n, index, info->value_type.size));
 +		}
 +		memcpy_disk(value_ptr(n, index, info->value_type.size),
 +			    value, info->value_type.size);
 +	}
 +
 +	*new_root = shadow_root(&spine);
 +	exit_shadow_spine(&spine);
 +
 +	return 0;
 +
 +bad:
 +	__dm_unbless_for_disk(value);
 +bad_unblessed:
 +	exit_shadow_spine(&spine);
 +	return r;
 +}
 +
 +/*
 + * Inserts value under keys into the btree rooted at root, overwriting
 + * any existing value, and stores the resulting root in *new_root.
 + * Thin wrapper around insert() that does not report whether the key was
 + * new.  Returns 0 on success or an error from insert().
 + */
 +int dm_btree_insert(struct dm_btree_info *info, dm_block_t root,
 +		    uint64_t *keys, void *value, dm_block_t *new_root)
 +		    __dm_written_to_disk(value)
 +{
 +	return insert(info, root, keys, value, new_root, NULL);
 +}
 +EXPORT_SYMBOL_GPL(dm_btree_insert);
 +
 +/*
 + * Same as dm_btree_insert(), but additionally reports through *inserted
 + * whether a new entry was created (1) or an existing value was
 + * overwritten (0).  Returns 0 on success or an error from insert().
 + */
 +int dm_btree_insert_notify(struct dm_btree_info *info, dm_block_t root,
 +			   uint64_t *keys, void *value, dm_block_t *new_root,
 +			   int *inserted)
 +			   __dm_written_to_disk(value)
 +{
 +	return insert(info, root, keys, value, new_root, inserted);
 +}
 +EXPORT_SYMBOL_GPL(dm_btree_insert_notify);
 +
 +/*----------------------------------------------------------------*/
 +
 +/*
 + * Follows the last entry of each node from block down to a leaf of a
 + * single btree and stores that leaf's highest key in *result_key.
 + *
 + * If next_block is non-NULL, the value stored with that highest leaf
 + * entry is read as a 64-bit block location and returned through
 + * *next_block.
 + *
 + * Returns 0 on success, a negative error from ro_step(), or -ENODATA if
 + * a node without entries is encountered.  *result_key may already have
 + * been written with an intermediate key when an error is returned.
 + */
 +static int find_highest_key(struct ro_spine *s, dm_block_t block,
 +			    uint64_t *result_key, dm_block_t *next_block)
 +{
 +	int i, r;
 +	uint32_t flags;
 +
 +	do {
 +		r = ro_step(s, block);
 +		if (r < 0)
 +			return r;
 +
 +		flags = le32_to_cpu(ro_node(s)->header.flags);
 +		i = le32_to_cpu(ro_node(s)->header.nr_entries);
 +		if (!i)
 +			return -ENODATA;
 +		else
 +			i--;
 +
 +		/* i now indexes the last entry of the node */
 +		*result_key = le64_to_cpu(ro_node(s)->keys[i]);
 +		if (next_block || flags & INTERNAL_NODE)
 +			block = value64(ro_node(s), i);
 +
 +	} while (flags & INTERNAL_NODE);
 +
 +	if (next_block)
 +		*next_block = block;
 +	return 0;
 +}
 +
 +/*
 + * Finds the highest key at each level of the (possibly multi-level)
 + * btree rooted at root, storing them in result_keys[0 ..].
 + *
 + * The walk stops early, without error, at the first level whose tree
 + * turns out to be empty.
 + *
 + * Returns a non-zero error from find_highest_key() on failure, otherwise
 + * the number of levels for which a key was found (between 0 and
 + * info->levels).
 + */
 +int dm_btree_find_highest_key(struct dm_btree_info *info, dm_block_t root,
 +			      uint64_t *result_keys)
 +{
 +	int r = 0, count = 0, level;
 +	struct ro_spine spine;
 +
 +	init_ro_spine(&spine, info);
 +	for (level = 0; level < info->levels; level++) {
 +		/* the final level's values are not block locations */
 +		r = find_highest_key(&spine, root, result_keys + level,
 +				     level == info->levels - 1 ? NULL : &root);
 +		if (r == -ENODATA) {
 +			r = 0;
 +			break;
 +
 +		} else if (r)
 +			break;
 +
 +		count++;
 +	}
 +	exit_ro_spine(&spine);
 +
 +	return r ? r : count;
 +}
 +EXPORT_SYMBOL_GPL(dm_btree_find_highest_key);
diff --cc drivers/md/persistent-data/dm-space-map-checker.c
index bb44a937fe63,000000000000..50ed53bf4aa2
mode 100644,000000..100644
--- a/drivers/md/persistent-data/dm-space-map-checker.c
+++ b/drivers/md/persistent-data/dm-space-map-checker.c
@@@ -1,437 -1,0 +1,438 @@@
 +/*
 + * Copyright (C) 2011 Red Hat, Inc.
 + *
 + * This file is released under the GPL.
 + */
 +
 +#include "dm-space-map-checker.h"
 +
 +#include <linux/device-mapper.h>
++#include <linux/export.h>
 +
 +#ifdef CONFIG_DM_DEBUG_SPACE_MAPS
 +
 +#define DM_MSG_PREFIX "space map checker"
 +
 +/*----------------------------------------------------------------*/
 +
 +/*
 + * In-core array of per-block reference counts kept by the checker.
 + */
 +struct count_array {
 +	dm_block_t nr;		/* number of entries in counts[] */
 +	dm_block_t nr_free;	/* adjusted by ca_set_count() on zero <-> non-zero */
 +
 +	uint32_t *counts;	/* counts[b] is the count recorded for block b */
 +};
 +
 +/*
 + * Fetch the count recorded for block @b into *count.
 + * Returns 0, or -EINVAL if @b is outside the array.
 + */
 +static int ca_get_count(struct count_array *ca, dm_block_t b, uint32_t *count)
 +{
 +	if (b < ca->nr) {
 +		*count = ca->counts[b];
 +		return 0;
 +	}
 +
 +	return -EINVAL;
 +}
 +
 +/*
 + * Set *r to 1 if block @b has a recorded count above one, otherwise to 0.
 + * Returns 0, or -EINVAL if @b is outside the array.
 + */
 +static int ca_count_more_than_one(struct count_array *ca, dm_block_t b, int *r)
 +{
 +	if (b < ca->nr) {
 +		*r = (ca->counts[b] > 1) ? 1 : 0;
 +		return 0;
 +	}
 +
 +	return -EINVAL;
 +}
 +
 +/*
 + * Record @count for block @b, keeping nr_free in step when the count moves
 + * between zero and non-zero.
 + * Returns 0, or -EINVAL if @b is outside the array.
 + */
 +static int ca_set_count(struct count_array *ca, dm_block_t b, uint32_t count)
 +{
 +	uint32_t *slot;
 +
 +	if (b >= ca->nr)
 +		return -EINVAL;
 +
 +	slot = &ca->counts[b];
 +
 +	if (*slot && !count)
 +		ca->nr_free++;		/* non-zero -> zero */
 +	else if (!*slot && count)
 +		ca->nr_free--;		/* zero -> non-zero */
 +
 +	*slot = count;
 +
 +	return 0;
 +}
 +
 +/*
 + * Add one to the count recorded for block @b.
 + * Returns 0, or -EINVAL if @b is outside the array.
 + */
 +static int ca_inc_block(struct count_array *ca, dm_block_t b)
 +{
 +	if (b >= ca->nr)
 +		return -EINVAL;
 +
 +	/* Returns 0: ca_set_count() only fails its own range check on @b. */
 +	return ca_set_count(ca, b, ca->counts[b] + 1);
 +}
 +
 +/*
 + * Subtract one from the count recorded for block @b; a count that is
 + * already zero trips BUG_ON().
 + * Returns 0, or -EINVAL if @b is outside the array.
 + */
 +static int ca_dec_block(struct count_array *ca, dm_block_t b)
 +{
 +	if (b >= ca->nr)
 +		return -EINVAL;
 +
 +	BUG_ON(ca->counts[b] == 0);
 +
 +	/* Returns 0: ca_set_count() only fails its own range check on @b. */
 +	return ca_set_count(ca, b, ca->counts[b] - 1);
 +}
 +
 +/*
 + * Initialise @ca with one zeroed count per block reported by @sm.
 + * Returns 0, the error from dm_sm_get_nr_blocks(), or -ENOMEM.
 + */
 +static int ca_create(struct count_array *ca, struct dm_space_map *sm)
 +{
 +	dm_block_t nr_blocks;
 +	int r = dm_sm_get_nr_blocks(sm, &nr_blocks);
 +
 +	if (r)
 +		return r;
 +
 +	/* Every count starts at zero, so every block starts out free. */
 +	ca->nr = nr_blocks;
 +	ca->nr_free = nr_blocks;
 +	ca->counts = kzalloc(nr_blocks * sizeof(*ca->counts), GFP_KERNEL);
 +
 +	return ca->counts ? 0 : -ENOMEM;
 +}
 +
 +/*
 + * Fill @ca with the count of every block as reported by @sm.
 + * @ca must already be sized to match @sm (enforced with BUG_ON()).
 + * Returns 0, or the first error returned by @sm.
 + */
 +static int ca_load(struct count_array *ca, struct dm_space_map *sm)
 +{
 +	int r;
 +	uint32_t rc;
 +	dm_block_t total, b = 0;
 +
 +	r = dm_sm_get_nr_blocks(sm, &total);
 +	if (r)
 +		return r;
 +
 +	BUG_ON(ca->nr != total);
 +
 +	DMWARN("Loading debug space map from disk.  This may take some time");
 +	while (b < total) {
 +		r = dm_sm_get_count(sm, b, &rc);
 +		if (r) {
 +			DMERR("load failed");
 +			return r;
 +		}
 +
 +		ca_set_count(ca, b, rc);
 +		b++;
 +	}
 +	DMWARN("Load complete");
 +
 +	return 0;
 +}
 +
 +/*
 + * Grow @ca by @extra_blocks entries.  Existing counts are preserved and the
 + * new entries start at zero (and so are counted as free).
 + * Returns 0, or -ENOMEM leaving @ca untouched.
 + */
 +static int ca_extend(struct count_array *ca, dm_block_t extra_blocks)
 +{
 +	dm_block_t new_nr = ca->nr + extra_blocks;
 +	uint32_t *grown;
 +
 +	grown = kzalloc(sizeof(*grown) * new_nr, GFP_KERNEL);
 +	if (!grown)
 +		return -ENOMEM;
 +
 +	memcpy(grown, ca->counts, sizeof(*grown) * ca->nr);
 +	kfree(ca->counts);
 +
 +	ca->counts = grown;
 +	ca->nr_free += extra_blocks;
 +	ca->nr = new_nr;
 +
 +	return 0;
 +}
 +
 +/*
 + * Make @old an exact copy of @new.
 + *
 + * @old is grown first if @new has more entries; @new having fewer entries
 + * than @old is a bug.
 + *
 + * Returns 0 on success, or -ENOMEM if @old could not be grown (in which
 + * case @old is left unchanged).
 + */
 +static int ca_commit(struct count_array *old, struct count_array *new)
 +{
 +	int r;
 +
 +	if (old->nr != new->nr) {
 +		BUG_ON(old->nr > new->nr);
 +
 +		/*
 +		 * Propagate allocation failure to the caller rather than
 +		 * falling through to the size BUG_ON() below.
 +		 */
 +		r = ca_extend(old, new->nr - old->nr);
 +		if (r)
 +			return r;
 +	}
 +
 +	BUG_ON(old->nr != new->nr);
 +	old->nr_free = new->nr_free;
 +	memcpy(old->counts, new->counts, sizeof(*old->counts) * old->nr);
 +	return 0;
 +}
 +
 +/*
 + * Free the counts array owned by @ca; @ca itself is not freed.
 + */
 +static void ca_destroy(struct count_array *ca)
 +{
 +	kfree(ca->counts);
 +}
 +
 +/*----------------------------------------------------------------*/
 +
 +/*
 + * A space map that forwards every operation to real_sm while keeping its
 + * own in-core counts to cross-check the results.
 + */
 +struct sm_checker {
 +	struct dm_space_map sm;		/* interface handed out to callers */
 +
 +	struct count_array old_counts;	/* copy of counts taken at commit */
 +	struct count_array counts;	/* counts for the current transaction */
 +
 +	struct dm_space_map *real_sm;	/* the space map being checked */
 +};
 +
 +/*
 + * Destroy the wrapped space map, then free both count arrays and the
 + * checker itself.
 + */
 +static void sm_checker_destroy(struct dm_space_map *sm)
 +{
 +	struct sm_checker *smc = container_of(sm, struct sm_checker, sm);
 +
 +	dm_sm_destroy(smc->real_sm);
 +	ca_destroy(&smc->old_counts);
 +	ca_destroy(&smc->counts);
 +	kfree(smc);
 +}
 +
 +/*
 + * Forward to the real space map and assert that its block count matches
 + * the size of the committed count array.
 + */
 +static int sm_checker_get_nr_blocks(struct dm_space_map *sm, dm_block_t *count)
 +{
 +	struct sm_checker *smc = container_of(sm, struct sm_checker, sm);
 +	int r;
 +
 +	r = dm_sm_get_nr_blocks(smc->real_sm, count);
 +	if (r)
 +		return r;
 +
 +	BUG_ON(smc->old_counts.nr != *count);
 +	return 0;
 +}
 +
 +/*
 + * Forward to the real space map, then recount the free blocks from the
 + * checker's own arrays and log an error if the two answers differ.  A block
 + * is counted as free only if its count is zero in both the committed and
 + * the current array.
 + */
 +static int sm_checker_get_nr_free(struct dm_space_map *sm, dm_block_t *count)
 +{
 +	struct sm_checker *smc = container_of(sm, struct sm_checker, sm);
 +	dm_block_t b, nr_free = 0;
 +	int r;
 +
 +	r = dm_sm_get_nr_free(smc->real_sm, count);
 +	if (r)
 +		return r;
 +
 +	/*
 +	 * Slow, but we know it's correct.
 +	 */
 +	for (b = 0; b < smc->old_counts.nr; b++) {
 +		if (smc->old_counts.counts[b] || smc->counts.counts[b])
 +			continue;
 +
 +		nr_free++;
 +	}
 +
 +	if (nr_free != *count)
 +		DMERR("free block counts differ, checker %u, sm-disk:%u",
 +		      (unsigned) nr_free, (unsigned) *count);
 +
 +	return 0;
 +}
 +
 +/*
 + * Allocate a block from the real space map and assert that the checker
 + * also considers it free (in range, with a zero count in both the committed
 + * and the current array) before recording a count of one for it.
 + */
 +static int sm_checker_new_block(struct dm_space_map *sm, dm_block_t *b)
 +{
 +	struct sm_checker *smc = container_of(sm, struct sm_checker, sm);
 +	int r;
 +
 +	r = dm_sm_new_block(smc->real_sm, b);
 +	if (r)
 +		return r;
 +
 +	BUG_ON(*b >= smc->old_counts.nr);
 +	BUG_ON(smc->old_counts.counts[*b] != 0);
 +	BUG_ON(*b >= smc->counts.nr);
 +	BUG_ON(smc->counts.counts[*b] != 0);
 +	ca_set_count(&smc->counts, *b, 1);
 +
 +	return 0;
 +}
 +
 +/*
 + * Increment block @b in both the real space map and the checker's counts,
 + * asserting that the two agree on the outcome.
 + */
 +static int sm_checker_inc_block(struct dm_space_map *sm, dm_block_t b)
 +{
 +	struct sm_checker *smc = container_of(sm, struct sm_checker, sm);
 +	int real_r, check_r;
 +
 +	real_r = dm_sm_inc_block(smc->real_sm, b);
 +	check_r = ca_inc_block(&smc->counts, b);
 +	BUG_ON(real_r != check_r);
 +
 +	return real_r;
 +}
 +
 +/*
 + * Decrement block @b in both the real space map and the checker's counts,
 + * asserting that the two agree on the outcome.
 + */
 +static int sm_checker_dec_block(struct dm_space_map *sm, dm_block_t b)
 +{
 +	struct sm_checker *smc = container_of(sm, struct sm_checker, sm);
 +	int real_r, check_r;
 +
 +	real_r = dm_sm_dec_block(smc->real_sm, b);
 +	check_r = ca_dec_block(&smc->counts, b);
 +	BUG_ON(real_r != check_r);
 +
 +	return real_r;
 +}
 +
 +/*
 + * Look up the count of block @b in the real space map and assert that the
 + * checker's current array gives the same status and the same count.
 + */
 +static int sm_checker_get_count(struct dm_space_map *sm, dm_block_t b, uint32_t *result)
 +{
 +	struct sm_checker *smc = container_of(sm, struct sm_checker, sm);
 +	uint32_t expected = 0;
 +	int real_r, check_r;
 +
 +	real_r = dm_sm_get_count(smc->real_sm, b, result);
 +	check_r = ca_get_count(&smc->counts, b, &expected);
 +
 +	BUG_ON(real_r != check_r);
 +	if (real_r)
 +		return real_r;
 +
 +	BUG_ON(*result != expected);
 +	return 0;
 +}
 +
 +/*
 + * Ask the real space map whether block @b has a count above one.  The
 + * statuses must match, and the real map must not answer "no" when the
 + * checker's current array says "yes".
 + */
 +static int sm_checker_count_more_than_one(struct dm_space_map *sm, dm_block_t b, int *result)
 +{
 +	struct sm_checker *smc = container_of(sm, struct sm_checker, sm);
 +	int expected = 0;
 +	int real_r, check_r;
 +
 +	real_r = dm_sm_count_is_more_than_one(smc->real_sm, b, result);
 +	check_r = ca_count_more_than_one(&smc->counts, b, &expected);
 +
 +	BUG_ON(real_r != check_r);
 +	if (real_r)
 +		return real_r;
 +
 +	BUG_ON(expected && !*result);
 +	return 0;
 +}
 +
 +/*
 + * Set the count of block @b to @count in both the real space map and the
 + * checker's current array, asserting that @b is in range for the checker
 + * and that both updates report the same status.
 + *
 + * Returns the status of the real space map.
 + */
 +static int sm_checker_set_count(struct dm_space_map *sm, dm_block_t b, uint32_t count)
 +{
 +	struct sm_checker *smc = container_of(sm, struct sm_checker, sm);
 +	int r = dm_sm_set_count(smc->real_sm, b, count);
 +	int r2;
 +
 +	BUG_ON(b >= smc->counts.nr);
 +	r2 = ca_set_count(&smc->counts, b, count);
 +	BUG_ON(r != r2);
 +
 +	return r;
 +}
 +
 +/*
 + * Commit the real space map and, if that succeeds, copy the current counts
 + * over the committed ones.
 + */
 +static int sm_checker_commit(struct dm_space_map *sm)
 +{
 +	struct sm_checker *smc = container_of(sm, struct sm_checker, sm);
 +	int r = dm_sm_commit(smc->real_sm);
 +
 +	if (!r)
 +		r = ca_commit(&smc->old_counts, &smc->counts);
 +
 +	return r;
 +}
 +
 +/*
 + * Extend the real space map by @extra_blocks and, if that succeeds, grow
 + * the checker's current count array to match.
 + */
 +static int sm_checker_extend(struct dm_space_map *sm, dm_block_t extra_blocks)
 +{
 +	struct sm_checker *smc = container_of(sm, struct sm_checker, sm);
 +	int r = dm_sm_extend(smc->real_sm, extra_blocks);
 +
 +	if (!r)
 +		r = ca_extend(&smc->counts, extra_blocks);
 +
 +	return r;
 +}
 +
 +/*
 + * Pass-through: the checker keeps no root of its own, so report the real
 + * space map's root size.
 + */
 +static int sm_checker_root_size(struct dm_space_map *sm, size_t *result)
 +{
 +	struct sm_checker *smc = container_of(sm, struct sm_checker, sm);
 +	return dm_sm_root_size(smc->real_sm, result);
 +}
 +
 +/*
 + * Pass-through: copy the real space map's root into @copy_to_here_le.
 + */
 +static int sm_checker_copy_root(struct dm_space_map *sm, void *copy_to_here_le, size_t len)
 +{
 +	struct sm_checker *smc = container_of(sm, struct sm_checker, sm);
 +	return dm_sm_copy_root(smc->real_sm, copy_to_here_le, len);
 +}
 +
 +/*----------------------------------------------------------------*/
 +
 +/*
 + * Method table for the checker.  It is copied into the embedded 'sm' of
 + * each sm_checker by dm_sm_checker_create() and
 + * dm_sm_checker_create_fresh().
 + */
 +static struct dm_space_map ops_ = {
 +	.destroy = sm_checker_destroy,
 +	.get_nr_blocks = sm_checker_get_nr_blocks,
 +	.get_nr_free = sm_checker_get_nr_free,
 +	.inc_block = sm_checker_inc_block,
 +	.dec_block = sm_checker_dec_block,
 +	.new_block = sm_checker_new_block,
 +	.get_count = sm_checker_get_count,
 +	.count_is_more_than_one = sm_checker_count_more_than_one,
 +	.set_count = sm_checker_set_count,
 +	.commit = sm_checker_commit,
 +	.extend = sm_checker_extend,
 +	.root_size = sm_checker_root_size,
 +	.copy_root = sm_checker_copy_root
 +};
 +
 +/*
 + * Wrap the existing space map @sm in a checker whose counts are loaded
 + * from @sm and then committed.
 + *
 + * Returns the checker's space map, or NULL if @sm is NULL or any step
 + * fails.  @sm itself is not destroyed on failure.
 + */
 +struct dm_space_map *dm_sm_checker_create(struct dm_space_map *sm)
 +{
 +	struct sm_checker *smc;
 +
 +	if (!sm)
 +		return NULL;
 +
 +	smc = kmalloc(sizeof(*smc), GFP_KERNEL);
 +	if (!smc)
 +		return NULL;
 +
 +	memcpy(&smc->sm, &ops_, sizeof(smc->sm));
 +
 +	if (ca_create(&smc->old_counts, sm))
 +		goto fail_smc;
 +
 +	if (ca_create(&smc->counts, sm))
 +		goto fail_old_counts;
 +
 +	smc->real_sm = sm;
 +
 +	if (ca_load(&smc->counts, sm))
 +		goto fail_counts;
 +
 +	if (ca_commit(&smc->old_counts, &smc->counts))
 +		goto fail_counts;
 +
 +	return &smc->sm;
 +
 +fail_counts:
 +	ca_destroy(&smc->counts);
 +fail_old_counts:
 +	ca_destroy(&smc->old_counts);
 +fail_smc:
 +	kfree(smc);
 +	return NULL;
 +}
 +EXPORT_SYMBOL_GPL(dm_sm_checker_create);
 +
 +/*
 + * Wrap @sm in a checker whose counts all start at zero; nothing is loaded
 + * from @sm.
 + *
 + * Returns the checker's space map, or NULL if @sm is NULL or an allocation
 + * fails.  @sm itself is not destroyed on failure.
 + */
 +struct dm_space_map *dm_sm_checker_create_fresh(struct dm_space_map *sm)
 +{
 +	struct sm_checker *smc;
 +
 +	if (!sm)
 +		return NULL;
 +
 +	smc = kmalloc(sizeof(*smc), GFP_KERNEL);
 +	if (!smc)
 +		return NULL;
 +
 +	memcpy(&smc->sm, &ops_, sizeof(smc->sm));
 +
 +	if (ca_create(&smc->old_counts, sm))
 +		goto fail_smc;
 +
 +	if (ca_create(&smc->counts, sm))
 +		goto fail_old_counts;
 +
 +	smc->real_sm = sm;
 +	return &smc->sm;
 +
 +fail_old_counts:
 +	ca_destroy(&smc->old_counts);
 +fail_smc:
 +	kfree(smc);
 +	return NULL;
 +}
 +EXPORT_SYMBOL_GPL(dm_sm_checker_create_fresh);
 +
 +/*----------------------------------------------------------------*/
 +
 +#else
 +
 +/*
 + * Without CONFIG_DM_DEBUG_SPACE_MAPS there is no checker: hand back @sm
 + * unchanged.
 + */
 +struct dm_space_map *dm_sm_checker_create(struct dm_space_map *sm)
 +{
 +	return sm;
 +}
 +EXPORT_SYMBOL_GPL(dm_sm_checker_create);
 +
 +/*
 + * Without CONFIG_DM_DEBUG_SPACE_MAPS there is no checker: hand back @sm
 + * unchanged.
 + */
 +struct dm_space_map *dm_sm_checker_create_fresh(struct dm_space_map *sm)
 +{
 +	return sm;
 +}
 +EXPORT_SYMBOL_GPL(dm_sm_checker_create_fresh);
 +
 +/*----------------------------------------------------------------*/
 +
 +#endif
diff --cc drivers/md/persistent-data/dm-space-map-disk.c
index aeff7852cf79,000000000000..fc469ba9f627
mode 100644,000000..100644
--- a/drivers/md/persistent-data/dm-space-map-disk.c
+++ b/drivers/md/persistent-data/dm-space-map-disk.c
@@@ -1,335 -1,0 +1,335 @@@
 +/*
 + * Copyright (C) 2011 Red Hat, Inc.
 + *
 + * This file is released under the GPL.
 + */
 +
 +#include "dm-space-map-checker.h"
 +#include "dm-space-map-common.h"
 +#include "dm-space-map-disk.h"
 +#include "dm-space-map.h"
 +#include "dm-transaction-manager.h"
 +
 +#include <linux/list.h>
 +#include <linux/slab.h>
- #include <linux/module.h>
++#include <linux/export.h>
 +#include <linux/device-mapper.h>
 +
 +#define DM_MSG_PREFIX "space map disk"
 +
 +/*----------------------------------------------------------------*/
 +
 +/*
 + * Space map interface.
 + */
 +struct sm_disk {
 +	struct dm_space_map sm;		/* interface handed out to callers */
 +
 +	struct ll_disk ll;		/* state updated during the transaction */
 +	struct ll_disk old_ll;		/* copy of ll taken by sm_disk_commit() */
 +
 +	dm_block_t begin;		/* where sm_disk_new_block() starts searching */
 +	dm_block_t nr_allocated_this_transaction; /* reset to 0 on commit */
 +};
 +
 +/*
 + * Free the sm_disk that embeds @sm.
 + */
 +static void sm_disk_destroy(struct dm_space_map *sm)
 +{
 +	struct sm_disk *smd = container_of(sm, struct sm_disk, sm);
 +
 +	kfree(smd);
 +}
 +
 +/*
 + * Extend the current (uncommitted) low-level state by @extra_blocks.
 + */
 +static int sm_disk_extend(struct dm_space_map *sm, dm_block_t extra_blocks)
 +{
 +	struct sm_disk *smd = container_of(sm, struct sm_disk, sm);
 +
 +	return sm_ll_extend(&smd->ll, extra_blocks);
 +}
 +
 +/*
 + * Report the number of blocks as of the last commit (read from old_ll, so
 + * an extension made in the current transaction is not yet included).
 + * Always returns 0.
 + */
 +static int sm_disk_get_nr_blocks(struct dm_space_map *sm, dm_block_t *count)
 +{
 +	struct sm_disk *smd = container_of(sm, struct sm_disk, sm);
 +	*count = smd->old_ll.nr_blocks;
 +
 +	return 0;
 +}
 +
 +/*
 + * Report the number of free blocks: those free as of the last commit less
 + * the net number allocated during the current transaction.
 + * Always returns 0.
 + */
 +static int sm_disk_get_nr_free(struct dm_space_map *sm, dm_block_t *count)
 +{
 +	struct sm_disk *smd = container_of(sm, struct sm_disk, sm);
 +	dm_block_t free_at_commit;
 +
 +	free_at_commit = smd->old_ll.nr_blocks - smd->old_ll.nr_allocated;
 +	*count = free_at_commit - smd->nr_allocated_this_transaction;
 +
 +	return 0;
 +}
 +
 +/*
 + * Look up the count of block @b in the current (uncommitted) state.
 + */
 +static int sm_disk_get_count(struct dm_space_map *sm, dm_block_t b,
 +			     uint32_t *result)
 +{
 +	struct sm_disk *smd = container_of(sm, struct sm_disk, sm);
 +	return sm_ll_lookup(&smd->ll, b, result);
 +}
 +
 +/*
 + * Report whether block @b has a reference count greater than one.
 + *
 + * On success stores 1 or 0 in *result and returns 0.  On failure returns
 + * the error from the count lookup and leaves *result untouched.
 + */
 +static int sm_disk_count_is_more_than_one(struct dm_space_map *sm, dm_block_t b,
 +					  int *result)
 +{
 +	int r;
 +	uint32_t count;
 +
 +	r = sm_disk_get_count(sm, b, &count);
 +	if (r)
 +		return r;
 +
 +	/*
 +	 * The answer travels through *result; the return value carries only
 +	 * the status, as for the other space map methods.
 +	 */
 +	*result = count > 1;
 +
 +	return 0;
 +}
 +
 +/*
 + * Set the count of block @b to @count and keep
 + * nr_allocated_this_transaction in step with the resulting allocation
 + * event.
 + */
 +static int sm_disk_set_count(struct dm_space_map *sm, dm_block_t b,
 +			     uint32_t count)
 +{
 +	struct sm_disk *smd = container_of(sm, struct sm_disk, sm);
 +	enum allocation_event ev;
 +	uint32_t old_count;
 +	int r;
 +
 +	r = sm_ll_insert(&smd->ll, b, count, &ev);
 +	if (r)
 +		return r;
 +
 +	if (ev == SM_ALLOC) {
 +		/*
 +		 * This _must_ be free in the prior transaction
 +		 * otherwise we've lost atomicity.
 +		 */
 +		smd->nr_allocated_this_transaction++;
 +
 +	} else if (ev == SM_FREE) {
 +		/*
 +		 * It's only free if it's also free in the last
 +		 * transaction.
 +		 */
 +		r = sm_ll_lookup(&smd->old_ll, b, &old_count);
 +		if (r)
 +			return r;
 +
 +		if (!old_count)
 +			smd->nr_allocated_this_transaction--;
 +	}
 +
 +	/* SM_NONE: no allocation event, nothing to account for. */
 +	return r;
 +}
 +
 +/*
 + * Increment the count of block @b, counting it as allocated in this
 + * transaction if the increment produced an SM_ALLOC event.
 + */
 +static int sm_disk_inc_block(struct dm_space_map *sm, dm_block_t b)
 +{
 +	struct sm_disk *smd = container_of(sm, struct sm_disk, sm);
 +	enum allocation_event ev;
 +	int r;
 +
 +	r = sm_ll_inc(&smd->ll, b, &ev);
 +	if (r)
 +		return r;
 +
 +	/*
 +	 * This _must_ be free in the prior transaction
 +	 * otherwise we've lost atomicity.
 +	 */
 +	if (ev == SM_ALLOC)
 +		smd->nr_allocated_this_transaction++;
 +
 +	return 0;
 +}
 +
 +/*
 + * Decrement the count of block @b.  If that produced an SM_FREE event and
 + * the block was also unused at the last commit, it no longer counts as
 + * allocated in this transaction.
 + */
 +static int sm_disk_dec_block(struct dm_space_map *sm, dm_block_t b)
 +{
 +	struct sm_disk *smd = container_of(sm, struct sm_disk, sm);
 +	enum allocation_event ev;
 +	uint32_t old_count;
 +	int r;
 +
 +	r = sm_ll_dec(&smd->ll, b, &ev);
 +	if (r || ev != SM_FREE)
 +		return r;
 +
 +	/*
 +	 * It's only free if it's also free in the last
 +	 * transaction.
 +	 */
 +	r = sm_ll_lookup(&smd->old_ll, b, &old_count);
 +	if (r)
 +		return r;
 +
 +	if (!old_count)
 +		smd->nr_allocated_this_transaction--;
 +
 +	return r;
 +}
 +
 +/*
 + * Allocate a block: search the committed state (old_ll) for a free block
 + * from 'begin' onwards, then increment its count in the current state.
 + * The chosen block is returned in *b and the next search starts just
 + * after it.
 + */
 +static int sm_disk_new_block(struct dm_space_map *sm, dm_block_t *b)
 +{
 +	struct sm_disk *smd = container_of(sm, struct sm_disk, sm);
 +	enum allocation_event ev;
 +	int r;
 +
 +	/* FIXME: we should loop round a couple of times */
 +	r = sm_ll_find_free_block(&smd->old_ll, smd->begin, smd->old_ll.nr_blocks, b);
 +	if (r)
 +		return r;
 +
 +	smd->begin = *b + 1;
 +
 +	r = sm_ll_inc(&smd->ll, *b, &ev);
 +	if (r)
 +		return r;
 +
 +	BUG_ON(ev != SM_ALLOC);
 +	smd->nr_allocated_this_transaction++;
 +
 +	return 0;
 +}
 +
 +/*
 + * Commit the current low-level state, make it the new "old" state and
 + * reset the per-transaction allocation bookkeeping.
 + *
 + * The free count is queried before and after; the value is not used here.
 + */
 +static int sm_disk_commit(struct dm_space_map *sm)
 +{
 +	struct sm_disk *smd = container_of(sm, struct sm_disk, sm);
 +	dm_block_t nr_free;
 +	int r;
 +
 +	r = sm_disk_get_nr_free(sm, &nr_free);
 +	if (r)
 +		return r;
 +
 +	r = sm_ll_commit(&smd->ll);
 +	if (r)
 +		return r;
 +
 +	memcpy(&smd->old_ll, &smd->ll, sizeof(smd->old_ll));
 +	smd->nr_allocated_this_transaction = 0;
 +	smd->begin = 0;
 +
 +	return sm_disk_get_nr_free(sm, &nr_free);
 +}
 +
 +/*
 + * Report the size of the root written by sm_disk_copy_root().
 + * Always returns 0.
 + */
 +static int sm_disk_root_size(struct dm_space_map *sm, size_t *result)
 +{
 +	*result = sizeof(struct disk_sm_root);
 +
 +	return 0;
 +}
 +
 +/*
 + * Write the space map root, taken from the current low-level state and
 + * converted to little endian, into @where_le.
 + *
 + * Returns 0, or -ENOSPC if @max is smaller than the root.
 + */
 +static int sm_disk_copy_root(struct dm_space_map *sm, void *where_le, size_t max)
 +{
 +	struct sm_disk *smd = container_of(sm, struct sm_disk, sm);
 +	struct disk_sm_root root_le;
 +
 +	if (max < sizeof(root_le))
 +		return -ENOSPC;
 +
 +	root_le.nr_blocks = cpu_to_le64(smd->ll.nr_blocks);
 +	root_le.nr_allocated = cpu_to_le64(smd->ll.nr_allocated);
 +	root_le.bitmap_root = cpu_to_le64(smd->ll.bitmap_root);
 +	root_le.ref_count_root = cpu_to_le64(smd->ll.ref_count_root);
 +
 +	memcpy(where_le, &root_le, sizeof(root_le));
 +
 +	return 0;
 +}
 +
 +/*----------------------------------------------------------------*/
 +
 +/*
 + * Method table for the disk space map.  It is copied into the embedded
 + * 'sm' of each sm_disk by dm_sm_disk_create_real() and
 + * dm_sm_disk_open_real().
 + */
 +static struct dm_space_map ops = {
 +	.destroy = sm_disk_destroy,
 +	.extend = sm_disk_extend,
 +	.get_nr_blocks = sm_disk_get_nr_blocks,
 +	.get_nr_free = sm_disk_get_nr_free,
 +	.get_count = sm_disk_get_count,
 +	.count_is_more_than_one = sm_disk_count_is_more_than_one,
 +	.set_count = sm_disk_set_count,
 +	.inc_block = sm_disk_inc_block,
 +	.dec_block = sm_disk_dec_block,
 +	.new_block = sm_disk_new_block,
 +	.commit = sm_disk_commit,
 +	.root_size = sm_disk_root_size,
 +	.copy_root = sm_disk_copy_root
 +};
 +
 +/*
 + * Allocate a disk space map, create its low-level state on @tm, extend it
 + * to @nr_blocks and commit.
 + *
 + * Returns the new space map, or an ERR_PTR() on failure.
 + */
 +static struct dm_space_map *dm_sm_disk_create_real(
 +	struct dm_transaction_manager *tm,
 +	dm_block_t nr_blocks)
 +{
 +	struct sm_disk *smd = kmalloc(sizeof(*smd), GFP_KERNEL);
 +	int r;
 +
 +	if (!smd)
 +		return ERR_PTR(-ENOMEM);
 +
 +	memcpy(&smd->sm, &ops, sizeof(smd->sm));
 +	smd->nr_allocated_this_transaction = 0;
 +	smd->begin = 0;
 +
 +	/* Each step runs only if every earlier one succeeded. */
 +	r = sm_ll_new_disk(&smd->ll, tm);
 +	if (!r)
 +		r = sm_ll_extend(&smd->ll, nr_blocks);
 +	if (!r)
 +		r = sm_disk_commit(&smd->sm);
 +
 +	if (r) {
 +		kfree(smd);
 +		return ERR_PTR(r);
 +	}
 +
 +	return &smd->sm;
 +}
 +
 +/*
 + * Create a new on-disk space map covering @nr_blocks blocks and wrap it in
 + * the space map checker.
 + *
 + * Returns the space map to use, or an ERR_PTR() on failure.
 + */
 +struct dm_space_map *dm_sm_disk_create(struct dm_transaction_manager *tm,
 +				       dm_block_t nr_blocks)
 +{
 +	struct dm_space_map *sm = dm_sm_disk_create_real(tm, nr_blocks);
 +	struct dm_space_map *smc;
 +
 +	/*
 +	 * dm_sm_disk_create_real() reports failure with ERR_PTR().  That must
 +	 * not reach the checker, which would treat it as a valid space map.
 +	 */
 +	if (IS_ERR(sm))
 +		return sm;
 +
 +	smc = dm_sm_checker_create_fresh(sm);
 +	if (!smc) {
 +		/*
 +		 * The checker signals failure with NULL and does not destroy
 +		 * @sm.  Release it and keep to the ERR_PTR() convention.
 +		 */
 +		dm_sm_destroy(sm);
 +		return ERR_PTR(-ENOMEM);
 +	}
 +
 +	return smc;
 +}
 +EXPORT_SYMBOL_GPL(dm_sm_disk_create);
 +
 +/*
 + * Allocate a disk space map, open its low-level state from the root at
 + * @root_le (@len bytes) and commit.
 + *
 + * Returns the space map, or an ERR_PTR() on failure.
 + */
 +static struct dm_space_map *dm_sm_disk_open_real(
 +	struct dm_transaction_manager *tm,
 +	void *root_le, size_t len)
 +{
 +	struct sm_disk *smd = kmalloc(sizeof(*smd), GFP_KERNEL);
 +	int r;
 +
 +	if (!smd)
 +		return ERR_PTR(-ENOMEM);
 +
 +	memcpy(&smd->sm, &ops, sizeof(smd->sm));
 +	smd->nr_allocated_this_transaction = 0;
 +	smd->begin = 0;
 +
 +	r = sm_ll_open_disk(&smd->ll, tm, root_le, len);
 +	if (!r)
 +		r = sm_disk_commit(&smd->sm);
 +
 +	if (r) {
 +		kfree(smd);
 +		return ERR_PTR(r);
 +	}
 +
 +	return &smd->sm;
 +}
 +
 +/*
 + * Open an existing on-disk space map from the root at @root_le (@len
 + * bytes) and wrap it in the space map checker.
 + *
 + * Returns the space map to use, or an ERR_PTR() on failure.
 + */
 +struct dm_space_map *dm_sm_disk_open(struct dm_transaction_manager *tm,
 +				     void *root_le, size_t len)
 +{
 +	struct dm_space_map *sm = dm_sm_disk_open_real(tm, root_le, len);
 +	struct dm_space_map *smc;
 +
 +	/*
 +	 * dm_sm_disk_open_real() reports failure with ERR_PTR().  That must
 +	 * not reach the checker, which would treat it as a valid space map.
 +	 */
 +	if (IS_ERR(sm))
 +		return sm;
 +
 +	smc = dm_sm_checker_create(sm);
 +	if (!smc) {
 +		/*
 +		 * The checker signals failure with NULL and does not destroy
 +		 * @sm.  Release it and keep to the ERR_PTR() convention.
 +		 */
 +		dm_sm_destroy(sm);
 +		return ERR_PTR(-ENOMEM);
 +	}
 +
 +	return smc;
 +}
 +EXPORT_SYMBOL_GPL(dm_sm_disk_open);
 +
 +/*----------------------------------------------------------------*/
diff --cc drivers/md/persistent-data/dm-transaction-manager.c
index 728e89a3f978,000000000000..6f8d38747d7f
mode 100644,000000..100644
--- a/drivers/md/persistent-data/dm-transaction-manager.c
+++ b/drivers/md/persistent-data/dm-transaction-manager.c
@@@ -1,400 -1,0 +1,400 @@@
 +/*
 + * Copyright (C) 2011 Red Hat, Inc.
 + *
 + * This file is released under the GPL.
 + */
 +#include "dm-transaction-manager.h"
 +#include "dm-space-map.h"
 +#include "dm-space-map-checker.h"
 +#include "dm-space-map-disk.h"
 +#include "dm-space-map-metadata.h"
 +#include "dm-persistent-data-internal.h"
 +
- #include <linux/module.h>
++#include <linux/export.h>
 +#include <linux/slab.h>
 +#include <linux/device-mapper.h>
 +
 +#define DM_MSG_PREFIX "transaction manager"
 +
 +/*----------------------------------------------------------------*/
 +
 +/*
 + * One entry in the transaction manager's shadow table.
 + */
 +struct shadow_info {
 +	struct hlist_node hlist;	/* link in one of tm->buckets[] */
 +	dm_block_t where;		/* block recorded by insert_shadow() */
 +};
 +
 +/*
 + * It would be nice if we scaled with the size of transaction.
 + */
 +#define HASH_SIZE 256
 +#define HASH_MASK (HASH_SIZE - 1)
 +
 +/*
 + * dm_tm_create_non_blocking_clone() fills in only is_clone and real; the
 + * remaining fields are set up by dm_tm_create() alone.
 + */
 +struct dm_transaction_manager {
 +	int is_clone;				/* 1 for a non-blocking clone */
 +	struct dm_transaction_manager *real;	/* tm a clone was made from */
 +
 +	struct dm_block_manager *bm;
 +	struct dm_space_map *sm;
 +
 +	spinlock_t lock;			/* protects buckets[] */
 +	struct hlist_head buckets[HASH_SIZE];	/* shadow_info hash table */
 +};
 +
 +/*----------------------------------------------------------------*/
 +
 +/*
 + * Returns 1 if block @b is recorded in the shadow table, otherwise 0.
 + */
 +static int is_shadow(struct dm_transaction_manager *tm, dm_block_t b)
 +{
 +	int found = 0;
 +	unsigned bucket = dm_hash_block(b, HASH_MASK);
 +	struct shadow_info *si;
 +	struct hlist_node *pos;
 +
 +	spin_lock(&tm->lock);
 +	hlist_for_each_entry(si, pos, &tm->buckets[bucket], hlist) {
 +		if (si->where != b)
 +			continue;
 +
 +		found = 1;
 +		break;
 +	}
 +	spin_unlock(&tm->lock);
 +
 +	return found;
 +}
 +
 +/*
 + * Record block @b in the shadow table.
 + *
 + * This can silently fail if there's no memory.  We're ok with this since
 + * creating redundant shadows causes no harm.
 + */
 +static void insert_shadow(struct dm_transaction_manager *tm, dm_block_t b)
 +{
 +	struct shadow_info *si = kmalloc(sizeof(*si), GFP_NOIO);
 +	unsigned bucket;
 +
 +	if (!si)
 +		return;
 +
 +	si->where = b;
 +	bucket = dm_hash_block(b, HASH_MASK);
 +
 +	spin_lock(&tm->lock);
 +	hlist_add_head(&si->hlist, &tm->buckets[bucket]);
 +	spin_unlock(&tm->lock);
 +}
 +
 +/*
 + * Free every entry in the shadow table and leave all buckets empty.
 + */
 +static void wipe_shadow_table(struct dm_transaction_manager *tm)
 +{
 +	struct shadow_info *si;
 +	struct hlist_node *pos, *next;
 +	struct hlist_head *head;
 +	int i;
 +
 +	spin_lock(&tm->lock);
 +	for (i = 0; i < HASH_SIZE; i++) {
 +		head = &tm->buckets[i];
 +
 +		hlist_for_each_entry_safe(si, pos, next, head, hlist)
 +			kfree(si);
 +
 +		/* The entries were freed in place, so reset the head. */
 +		INIT_HLIST_HEAD(head);
 +	}
 +	spin_unlock(&tm->lock);
 +}
 +
 +/*----------------------------------------------------------------*/
 +
 +/*
 + * Allocate a (non-clone) transaction manager using @bm and @sm, with an
 + * empty shadow table.
 + *
 + * Returns the new transaction manager, or ERR_PTR(-ENOMEM).
 + */
 +static struct dm_transaction_manager *dm_tm_create(struct dm_block_manager *bm,
 +						   struct dm_space_map *sm)
 +{
 +	struct dm_transaction_manager *tm = kmalloc(sizeof(*tm), GFP_KERNEL);
 +	int i;
 +
 +	if (!tm)
 +		return ERR_PTR(-ENOMEM);
 +
 +	tm->is_clone = 0;
 +	tm->real = NULL;
 +	tm->bm = bm;
 +	tm->sm = sm;
 +
 +	spin_lock_init(&tm->lock);
 +	for (i = 0; i < HASH_SIZE; i++)
 +		INIT_HLIST_HEAD(&tm->buckets[i]);
 +
 +	return tm;
 +}
 +
 +/*
 + * Allocate a non-blocking clone of @real.  Only is_clone and real are
 + * initialised.
 + *
 + * Returns the clone, or NULL if the allocation fails.
 + */
 +struct dm_transaction_manager *dm_tm_create_non_blocking_clone(struct dm_transaction_manager *real)
 +{
 +	struct dm_transaction_manager *tm = kmalloc(sizeof(*tm), GFP_KERNEL);
 +
 +	if (!tm)
 +		return NULL;
 +
 +	tm->is_clone = 1;
 +	tm->real = real;
 +
 +	return tm;
 +}
 +EXPORT_SYMBOL_GPL(dm_tm_create_non_blocking_clone);
 +
 +/*
 + * Free a transaction manager, including any shadow table entries it still
 + * holds.  A NULL @tm is ignored.
 + */
 +void dm_tm_destroy(struct dm_transaction_manager *tm)
 +{
 +	if (!tm)
 +		return;
 +
 +	/*
 +	 * Shadow entries are otherwise only freed by dm_tm_commit(), so any
 +	 * recorded since the last commit would leak here.  A non-blocking
 +	 * clone never initialises its lock or buckets and must be skipped.
 +	 */
 +	if (!tm->is_clone)
 +		wipe_shadow_table(tm);
 +
 +	kfree(tm);
 +}
 +EXPORT_SYMBOL_GPL(dm_tm_destroy);
 +
 +/*
 + * Commit the space map ahead of dm_tm_commit().
 + *
 + * Returns 0, the negative error from the space map, or -EWOULDBLOCK when
 + * called on a non-blocking clone.
 + */
 +int dm_tm_pre_commit(struct dm_transaction_manager *tm)
 +{
 +	int r;
 +
 +	if (tm->is_clone)
 +		return -EWOULDBLOCK;
 +
 +	r = dm_sm_commit(tm->sm);
 +
 +	return (r < 0) ? r : 0;
 +}
 +EXPORT_SYMBOL_GPL(dm_tm_pre_commit);
 +
 +/*
 + * Empty the shadow table, then flush the block manager and unlock @root.
 + *
 + * Returns the result of dm_bm_flush_and_unlock(), or -EWOULDBLOCK when
 + * called on a non-blocking clone.
 + */
 +int dm_tm_commit(struct dm_transaction_manager *tm, struct dm_block *root)
 +{
 +	if (tm->is_clone)
 +		return -EWOULDBLOCK;
 +
 +	wipe_shadow_table(tm);
 +
 +	return dm_bm_flush_and_unlock(tm->bm, root);
 +}
 +EXPORT_SYMBOL_GPL(dm_tm_commit);
 +
 +/*
 + * Allocate a new block from the space map and return it zeroed and write
 + * locked in *result.  If the lock cannot be taken the block's reference is
 + * dropped again.
 + *
 + * Returns 0, a negative error, or -EWOULDBLOCK when called on a
 + * non-blocking clone.
 + */
 +int dm_tm_new_block(struct dm_transaction_manager *tm,
 +		    struct dm_block_validator *v,
 +		    struct dm_block **result)
 +{
 +	dm_block_t fresh;
 +	int r;
 +
 +	if (tm->is_clone)
 +		return -EWOULDBLOCK;
 +
 +	r = dm_sm_new_block(tm->sm, &fresh);
 +	if (r < 0)
 +		return r;
 +
 +	r = dm_bm_write_lock_zero(tm->bm, fresh, v, result);
 +	if (r >= 0) {
 +		/*
 +		 * New blocks count as shadows in that they don't need to be
 +		 * shadowed again.
 +		 */
 +		insert_shadow(tm, fresh);
 +		return 0;
 +	}
 +
 +	dm_sm_dec_block(tm->sm, fresh);
 +	return r;
 +}
 +
 +/*
 + * Allocate a new block, drop one reference on @orig, hand @orig's buffer
 + * to dm_bm_unlock_move() with the new location, and return the new block
 + * write locked in *result.
 + *
 + * Returns the result of the final write lock, or the first negative error
 + * encountered.
 + *
 + * NOTE(review): the early error returns do not release the newly allocated
 + * block or restore the reference dropped from @orig - confirm this is
 + * acceptable to callers.
 + */
 +static int __shadow_block(struct dm_transaction_manager *tm, dm_block_t orig,
 +			  struct dm_block_validator *v,
 +			  struct dm_block **result)
 +{
 +	int r;
 +	dm_block_t new;
 +	struct dm_block *orig_block;
 +
 +	r = dm_sm_new_block(tm->sm, &new);
 +	if (r < 0)
 +		return r;
 +
 +	r = dm_sm_dec_block(tm->sm, orig);
 +	if (r < 0)
 +		return r;
 +
 +	r = dm_bm_read_lock(tm->bm, orig, v, &orig_block);
 +	if (r < 0)
 +		return r;
 +
 +	/* presumably relocates the locked buffer to @new - TODO confirm */
 +	r = dm_bm_unlock_move(orig_block, new);
 +	if (r < 0) {
 +		/* The move failed, so the read lock is still held; drop it. */
 +		dm_bm_unlock(orig_block);
 +		return r;
 +	}
 +
 +	return dm_bm_write_lock(tm->bm, new, v, result);
 +}
 +
 +/*
 + * Obtain a write locked block in *result that may be modified in place of
 + * @orig.
 + *
 + * *inc_children is set from dm_sm_count_is_more_than_one() on @orig.  If
 + * @orig is already in the shadow table and *inc_children is zero, @orig is
 + * simply write locked.  Otherwise a shadow is made with __shadow_block()
 + * and recorded in the shadow table.
 + *
 + * Returns a negative error on failure, or -EWOULDBLOCK when called on a
 + * non-blocking clone.
 + */
 +int dm_tm_shadow_block(struct dm_transaction_manager *tm, dm_block_t orig,
 +		       struct dm_block_validator *v, struct dm_block **result,
 +		       int *inc_children)
 +{
 +	int r, reuse;
 +
 +	if (tm->is_clone)
 +		return -EWOULDBLOCK;
 +
 +	r = dm_sm_count_is_more_than_one(tm->sm, orig, inc_children);
 +	if (r < 0)
 +		return r;
 +
 +	reuse = is_shadow(tm, orig) && !*inc_children;
 +	if (reuse)
 +		return dm_bm_write_lock(tm->bm, orig, v, result);
 +
 +	r = __shadow_block(tm, orig, v, result);
 +	if (r >= 0)
 +		insert_shadow(tm, dm_block_location(*result));
 +
 +	return r;
 +}
 +
 +/*
 + * Read lock block @b into *blk.  A non-blocking clone uses the try-lock
 + * variant on the block manager of the transaction manager it was cloned
 + * from.
 + */
 +int dm_tm_read_lock(struct dm_transaction_manager *tm, dm_block_t b,
 +		    struct dm_block_validator *v,
 +		    struct dm_block **blk)
 +{
 +	if (!tm->is_clone)
 +		return dm_bm_read_lock(tm->bm, b, v, blk);
 +
 +	return dm_bm_read_try_lock(tm->real->bm, b, v, blk);
 +}
 +
 +/*
 + * Unlock block @b.  @tm is not used.
 + */
 +int dm_tm_unlock(struct dm_transaction_manager *tm, struct dm_block *b)
 +{
 +	return dm_bm_unlock(b);
 +}
 +EXPORT_SYMBOL_GPL(dm_tm_unlock);
 +
 +/*
 + * Increment the space map count of block @b.  Any error from the space map
 + * is discarded.
 + */
 +void dm_tm_inc(struct dm_transaction_manager *tm, dm_block_t b)
 +{
 +	/*
 +	 * The non-blocking clone doesn't support this.
 +	 */
 +	BUG_ON(tm->is_clone);
 +
 +	dm_sm_inc_block(tm->sm, b);
 +}
 +EXPORT_SYMBOL_GPL(dm_tm_inc);
 +
 +/*
 + * Decrement the space map count of block @b.  Any error from the space map
 + * is discarded.
 + */
 +void dm_tm_dec(struct dm_transaction_manager *tm, dm_block_t b)
 +{
 +	/*
 +	 * The non-blocking clone doesn't support this.
 +	 */
 +	BUG_ON(tm->is_clone);
 +
 +	dm_sm_dec_block(tm->sm, b);
 +}
 +EXPORT_SYMBOL_GPL(dm_tm_dec);
 +
 +/*
 + * Fetch the space map count of block @b into *result.
 + * Returns -EWOULDBLOCK when called on a non-blocking clone.
 + */
 +int dm_tm_ref(struct dm_transaction_manager *tm, dm_block_t b,
 +	      uint32_t *result)
 +{
 +	if (tm->is_clone)
 +		return -EWOULDBLOCK;
 +
 +	return dm_sm_get_count(tm->sm, b, result);
 +}
 +
 +/*
 + * Return the block manager stored in @tm.
 + *
 + * NOTE(review): dm_tm_create_non_blocking_clone() never sets tm->bm, so
 + * this looks unsafe to call on a clone - confirm no caller does.
 + */
 +struct dm_block_manager *dm_tm_get_bm(struct dm_transaction_manager *tm)
 +{
 +	return tm->bm;
 +}
 +
 +/*----------------------------------------------------------------*/
 +
 +/*
 + * Common implementation of dm_tm_create_with_sm() and dm_tm_open_with_sm().
 + *
 + * Builds a metadata space map and a transaction manager on top of @bm and
 + * write locks the superblock at @sb_location into *sblock.  With @create
 + * set, the superblock is zeroed and a fresh metadata space map is created.
 + * Otherwise the space map is opened from the superblock data at
 + * @root_offset (at most @root_max_len bytes).  Finally the space map is
 + * wrapped in the space map checker.
 + *
 + * On success returns 0 with *tm, *sm and *sblock set; the superblock stays
 + * locked.  On failure returns an error code after unlocking the superblock
 + * (if it was locked) and destroying the transaction manager and the space
 + * map.
 + */
 +static int dm_tm_create_internal(struct dm_block_manager *bm,
 +				 dm_block_t sb_location,
 +				 struct dm_block_validator *sb_validator,
 +				 size_t root_offset, size_t root_max_len,
 +				 struct dm_transaction_manager **tm,
 +				 struct dm_space_map **sm,
 +				 struct dm_block **sblock,
 +				 int create)
 +{
 +	int r;
 +	struct dm_space_map *inner;
 +
 +	inner = dm_sm_metadata_init();
 +	if (IS_ERR(inner))
 +		return PTR_ERR(inner);
 +
 +	*tm = dm_tm_create(bm, inner);
 +	if (IS_ERR(*tm)) {
 +		dm_sm_destroy(inner);
 +		return PTR_ERR(*tm);
 +	}
 +
 +	if (create) {
 +		r = dm_bm_write_lock_zero(dm_tm_get_bm(*tm), sb_location,
 +					  sb_validator, sblock);
 +		if (r < 0) {
 +			DMERR("couldn't lock superblock");
 +			goto bad1;
 +		}
 +
 +		r = dm_sm_metadata_create(inner, *tm, dm_bm_nr_blocks(bm),
 +					  sb_location);
 +		if (r) {
 +			DMERR("couldn't create metadata space map");
 +			goto bad2;
 +		}
 +
 +	} else {
 +		r = dm_bm_write_lock(dm_tm_get_bm(*tm), sb_location,
 +				     sb_validator, sblock);
 +		if (r < 0) {
 +			DMERR("couldn't lock superblock");
 +			goto bad1;
 +		}
 +
 +		r = dm_sm_metadata_open(inner, *tm,
 +					dm_block_data(*sblock) + root_offset,
 +					root_max_len);
 +		if (r) {
 +			DMERR("couldn't open metadata space map");
 +			goto bad2;
 +		}
 +	}
 +
 +	*sm = dm_sm_checker_create(inner);
 +	if (!*sm) {
 +		/*
 +		 * r is 0 at this point.  The checker reports failure only as
 +		 * NULL, so supply an error code; otherwise the caller would
 +		 * see success with *tm already destroyed.
 +		 */
 +		r = -ENOMEM;
 +		goto bad2;
 +	}
 +
 +	return 0;
 +
 +bad2:
 +	dm_tm_unlock(*tm, *sblock);
 +bad1:
 +	dm_tm_destroy(*tm);
 +	dm_sm_destroy(inner);
 +	return r;
 +}
 +
 +/*
 + * Create a transaction manager together with a new metadata space map.
 + * Calls dm_tm_create_internal() with create set and no root offset or
 + * length.
 + */
 +int dm_tm_create_with_sm(struct dm_block_manager *bm, dm_block_t sb_location,
 +			 struct dm_block_validator *sb_validator,
 +			 struct dm_transaction_manager **tm,
 +			 struct dm_space_map **sm, struct dm_block **sblock)
 +{
 +	return dm_tm_create_internal(bm, sb_location, sb_validator,
 +				     0, 0, tm, sm, sblock, 1);
 +}
 +EXPORT_SYMBOL_GPL(dm_tm_create_with_sm);
 +
 +/*
 + * Create a transaction manager and open the metadata space map whose root
 + * is stored in the superblock at @root_offset (at most @root_max_len
 + * bytes).  Calls dm_tm_create_internal() with create clear.
 + */
 +int dm_tm_open_with_sm(struct dm_block_manager *bm, dm_block_t sb_location,
 +		       struct dm_block_validator *sb_validator,
 +		       size_t root_offset, size_t root_max_len,
 +		       struct dm_transaction_manager **tm,
 +		       struct dm_space_map **sm, struct dm_block **sblock)
 +{
 +	return dm_tm_create_internal(bm, sb_location, sb_validator, root_offset,
 +				     root_max_len, tm, sm, sblock, 0);
 +}
 +EXPORT_SYMBOL_GPL(dm_tm_open_with_sm);
 +
 +/*----------------------------------------------------------------*/
diff --cc drivers/media/common/saa7146_core.c
index f5d53a202344,31e53b6a881a..d6b1cf66042d
--- a/drivers/media/common/saa7146_core.c
+++ b/drivers/media/common/saa7146_core.c
@@@ -18,9 -18,8 +18,10 @@@
      Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
  */
  
 +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
 +
  #include <media/saa7146.h>
+ #include <linux/module.h>
  
  LIST_HEAD(saa7146_devices);
  DEFINE_MUTEX(saa7146_devices_lock);
diff --cc drivers/media/common/saa7146_fops.c
index a92546144eaa,e4547afcfa88..71f8e018e564
--- a/drivers/media/common/saa7146_fops.c
+++ b/drivers/media/common/saa7146_fops.c
@@@ -1,6 -1,5 +1,7 @@@
 +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
 +
  #include <media/saa7146_vv.h>
+ #include <linux/module.h>
  
  /****************************************************************************/
  /* resource management functions, shamelessly stolen from saa7134 driver */
diff --cc drivers/media/common/saa7146_hlp.c
index 79ad73accb27,c9c6e9a6c31d..bc1f545c95cb
--- a/drivers/media/common/saa7146_hlp.c
+++ b/drivers/media/common/saa7146_hlp.c
@@@ -1,6 -1,5 +1,7 @@@
 +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
 +
  #include <linux/kernel.h>
+ #include <linux/export.h>
  #include <media/saa7146_vv.h>
  
  static void calculate_output_format_register(struct saa7146_dev* saa, u32 palette, u32* clip_format)
diff --cc drivers/media/common/saa7146_video.c
index 384b358d3037,3a00253fe1ee..ce30533fd972
--- a/drivers/media/common/saa7146_video.c
+++ b/drivers/media/common/saa7146_video.c
@@@ -1,7 -1,6 +1,8 @@@
 +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
 +
  #include <media/saa7146_vv.h>
  #include <media/v4l2-chip-ident.h>
+ #include <linux/module.h>
  
  static int max_memory = 32;
  
diff --cc drivers/media/dvb/frontends/dibx000_common.c
index 774d507b66cc,977211fec137..43be7238311e
--- a/drivers/media/dvb/frontends/dibx000_common.c
+++ b/drivers/media/dvb/frontends/dibx000_common.c
@@@ -1,5 -1,5 +1,6 @@@
  #include <linux/i2c.h>
 +#include <linux/mutex.h>
+ #include <linux/module.h>
  
  #include "dibx000_common.h"
  
diff --cc drivers/media/video/adp1653.c
index 5914390211ff,c2594948ca3f..12eedf4d515a
--- a/drivers/media/video/adp1653.c
+++ b/drivers/media/video/adp1653.c
@@@ -31,8 -31,8 +31,9 @@@
   */
  
  #include <linux/delay.h>
+ #include <linux/module.h>
  #include <linux/i2c.h>
 +#include <linux/module.h>
  #include <linux/slab.h>
  #include <linux/version.h>
  #include <media/adp1653.h>
diff --cc drivers/media/video/imx074.c
index 8775e262bb6e,3319b40c87a4..eec75bb57203
--- a/drivers/media/video/imx074.c
+++ b/drivers/media/video/imx074.c
@@@ -12,11 -12,12 +12,12 @@@
  
  #include <linux/delay.h>
  #include <linux/i2c.h>
 +#include <linux/v4l2-mediabus.h>
  #include <linux/slab.h>
  #include <linux/videodev2.h>
+ #include <linux/module.h>
  
  #include <media/soc_camera.h>
 -#include <media/soc_mediabus.h>
  #include <media/v4l2-subdev.h>
  #include <media/v4l2-chip-ident.h>
  
diff --cc drivers/media/video/mt9m001.c
index 63ae5c61c9bf,6866a9ef3f60..e2b1029b16cd
--- a/drivers/media/video/mt9m001.c
+++ b/drivers/media/video/mt9m001.c
@@@ -12,12 -12,11 +12,13 @@@
  #include <linux/slab.h>
  #include <linux/i2c.h>
  #include <linux/log2.h>
+ #include <linux/module.h>
  
 +#include <media/soc_camera.h>
 +#include <media/soc_mediabus.h>
  #include <media/v4l2-subdev.h>
  #include <media/v4l2-chip-ident.h>
 -#include <media/soc_camera.h>
 +#include <media/v4l2-ctrls.h>
  
  /*
   * mt9m001 i2c address 0x5d
diff --cc drivers/media/video/mt9m111.c
index f023cc092c2b,66e3c3c2e606..cf2c0fb95f2f
--- a/drivers/media/video/mt9m111.c
+++ b/drivers/media/video/mt9m111.c
@@@ -13,12 -13,11 +13,13 @@@
  #include <linux/log2.h>
  #include <linux/gpio.h>
  #include <linux/delay.h>
 +#include <linux/v4l2-mediabus.h>
+ #include <linux/module.h>
  
 +#include <media/soc_camera.h>
  #include <media/v4l2-common.h>
 +#include <media/v4l2-ctrls.h>
  #include <media/v4l2-chip-ident.h>
 -#include <media/soc_camera.h>
  
  /*
   * MT9M111, MT9M112 and MT9M131:
diff --cc drivers/media/video/mt9t031.c
index 7ee84cc578b9,e6e0238eca16..0e78477452ff
--- a/drivers/media/video/mt9t031.c
+++ b/drivers/media/video/mt9t031.c
@@@ -13,8 -13,8 +13,9 @@@
  #include <linux/log2.h>
  #include <linux/pm.h>
  #include <linux/slab.h>
 +#include <linux/v4l2-mediabus.h>
  #include <linux/videodev2.h>
+ #include <linux/module.h>
  
  #include <media/soc_camera.h>
  #include <media/v4l2-chip-ident.h>
diff --cc drivers/media/video/mt9v022.c
index b6a29f7de82c,c74d6604598e..690ee0d42eeb
--- a/drivers/media/video/mt9v022.c
+++ b/drivers/media/video/mt9v022.c
@@@ -13,12 -13,11 +13,13 @@@
  #include <linux/i2c.h>
  #include <linux/delay.h>
  #include <linux/log2.h>
+ #include <linux/module.h>
  
 +#include <media/soc_camera.h>
 +#include <media/soc_mediabus.h>
  #include <media/v4l2-subdev.h>
  #include <media/v4l2-chip-ident.h>
 -#include <media/soc_camera.h>
 +#include <media/v4l2-ctrls.h>
  
  /*
   * mt9v022 i2c address 0x48, 0x4c, 0x58, 0x5c
diff --cc drivers/media/video/ov6650.c
index d5b057207a7b,2e1680631f0f..9f2d26b1d4cb
--- a/drivers/media/video/ov6650.c
+++ b/drivers/media/video/ov6650.c
@@@ -28,7 -28,7 +28,8 @@@
  #include <linux/delay.h>
  #include <linux/i2c.h>
  #include <linux/slab.h>
 +#include <linux/v4l2-mediabus.h>
+ #include <linux/module.h>
  
  #include <media/soc_camera.h>
  #include <media/v4l2-chip-ident.h>
diff --cc drivers/media/video/rj54n1cb0c.c
index 6afc61689549,985965f744ff..9937386a3bae
--- a/drivers/media/video/rj54n1cb0c.c
+++ b/drivers/media/video/rj54n1cb0c.c
@@@ -11,8 -11,8 +11,9 @@@
  #include <linux/delay.h>
  #include <linux/i2c.h>
  #include <linux/slab.h>
 +#include <linux/v4l2-mediabus.h>
  #include <linux/videodev2.h>
+ #include <linux/module.h>
  
  #include <media/rj54n1cb0c.h>
  #include <media/soc_camera.h>
diff --cc drivers/media/video/v4l2-device.c
index 9fc0ae8a526a,c742b1f5e73e..0edd618b9ddf
--- a/drivers/media/video/v4l2-device.c
+++ b/drivers/media/video/v4l2-device.c
@@@ -20,8 -20,8 +20,9 @@@
  
  #include <linux/types.h>
  #include <linux/ioctl.h>
+ #include <linux/module.h>
  #include <linux/i2c.h>
 +#include <linux/slab.h>
  #if defined(CONFIG_SPI)
  #include <linux/spi/spi.h>
  #endif
diff --cc drivers/mfd/max8997.c
index dc58750bb71b,50ad93bb49dd..5be53ae9b61c
--- a/drivers/mfd/max8997.c
+++ b/drivers/mfd/max8997.c
@@@ -23,8 -23,8 +23,9 @@@
  
  #include <linux/slab.h>
  #include <linux/i2c.h>
 +#include <linux/interrupt.h>
  #include <linux/pm_runtime.h>
+ #include <linux/module.h>
  #include <linux/mutex.h>
  #include <linux/mfd/core.h>
  #include <linux/mfd/max8997.h>
diff --cc drivers/s390/char/vmur.c
index d291a54acfad,b95cbdccc11a..85f4a9a5d12e
--- a/drivers/s390/char/vmur.c
+++ b/drivers/s390/char/vmur.c
@@@ -11,8 -11,10 +11,9 @@@
  #define KMSG_COMPONENT "vmur"
  #define pr_fmt(fmt) KMSG_COMPONENT ": " fmt
  
 -#include <linux/kernel_stat.h>
  #include <linux/cdev.h>
  #include <linux/slab.h>
+ #include <linux/module.h>
  
  #include <asm/uaccess.h>
  #include <asm/cio.h>
diff --cc drivers/tty/hvc/hvc_opal.c
index 7b38512d6c41,000000000000..ced26c8ccd57
mode 100644,000000..100644
--- a/drivers/tty/hvc/hvc_opal.c
+++ b/drivers/tty/hvc/hvc_opal.c
@@@ -1,424 -1,0 +1,425 @@@
 +/*
 + * opal driver interface to hvc_console.c
 + *
 + * Copyright 2011 Benjamin Herrenschmidt <benh@kernel.crashing.org>, IBM Corp.
 + *
 + * This program is free software; you can redistribute it and/or modify
 + * it under the terms of the GNU General Public License as published by
 + * the Free Software Foundation; either version 2 of the License, or
 + * (at your option) any later version.
 + *
 + * This program is distributed in the hope that it will be useful,
 + * but WITHOUT ANY WARRANTY; without even the implied warranty of
 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 + * GNU General Public License for more details.
 + *
 + * You should have received a copy of the GNU General Public License
 + * along with this program; if not, write to the Free Software
 + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307 USA
 + *
 + */
 +
 +#undef DEBUG
 +
 +#include <linux/types.h>
 +#include <linux/init.h>
 +#include <linux/delay.h>
 +#include <linux/slab.h>
 +#include <linux/console.h>
 +#include <linux/of.h>
 +#include <linux/of_platform.h>
++#include <linux/export.h>
 +
 +#include <asm/hvconsole.h>
 +#include <asm/prom.h>
 +#include <asm/firmware.h>
 +#include <asm/hvsi.h>
 +#include <asm/udbg.h>
 +#include <asm/opal.h>
 +
 +#include "hvc_console.h"
 +
 +/* Name under which the platform driver below registers itself. */
 +static const char hvc_opal_name[] = "hvc_opal";
 +
 +/*
 + * Device-tree match table: "serial" nodes carrying one of the two OPAL
 + * console compatible strings (raw or HVSI).  The empty entry terminates
 + * the table.
 + */
 +static struct of_device_id hvc_opal_match[] __devinitdata = {
 +	{ .name = "serial", .compatible = "ibm,opal-console-raw" },
 +	{ .name = "serial", .compatible = "ibm,opal-console-hvsi" },
 +	{ },
 +};
 +
 +/* Protocol spoken on a given OPAL console. */
 +typedef enum hv_protocol {
 +	HV_PROTOCOL_RAW,
 +	HV_PROTOCOL_HVSI
 +} hv_protocol_t;
 +
 +/* Per-terminal driver state. */
 +struct hvc_opal_priv {
 +	hv_protocol_t		proto;	/* Raw data or HVSI packets */
 +	struct hvsi_priv	hvsi;	/* HVSI specific data */
 +};
 +/* Private data of each terminal, indexed by terminal number (vtermno). */
 +static struct hvc_opal_priv *hvc_opal_privs[MAX_NR_HVC_CONSOLES];
 +
 +/*
 + * For early boot console: statically allocated private data and the
 + * terminal number the udbg callbacks below talk to.
 + */
 +static struct hvc_opal_priv hvc_opal_boot_priv;
 +static u32 hvc_opal_boot_termno;
 +
 +/*
 + * hv_ops for raw-protocol consoles: characters go directly through
 + * opal_get_chars()/opal_put_chars() and the notifier hooks are the
 + * notifier_*_irq helpers, with no per-terminal state involved.
 + */
 +static const struct hv_ops hvc_opal_raw_ops = {
 +	.get_chars = opal_get_chars,
 +	.put_chars = opal_put_chars,
 +	.notifier_add = notifier_add_irq,
 +	.notifier_del = notifier_del_irq,
 +	.notifier_hangup = notifier_hangup_irq,
 +};
 +
 +/*
 + * hv_ops .get_chars for HVSI consoles.
 + *
 + * Looks up the private data registered for vtermno and passes the read on
 + * to hvsilib_get_chars().  Warns and returns -ENODEV when no private data
 + * is registered for that terminal.
 + */
 +static int hvc_opal_hvsi_get_chars(uint32_t vtermno, char *buf, int count)
 +{
 +	struct hvc_opal_priv *priv = hvc_opal_privs[vtermno];
 +
 +	if (!WARN_ON(!priv))
 +		return hvsilib_get_chars(&priv->hvsi, buf, count);
 +
 +	return -ENODEV;
 +}
 +
 +/*
 + * hv_ops .put_chars for HVSI consoles.
 + *
 + * Looks up the private data registered for vtermno and passes the write on
 + * to hvsilib_put_chars().  Warns and returns -ENODEV when no private data
 + * is registered for that terminal.
 + */
 +static int hvc_opal_hvsi_put_chars(uint32_t vtermno, const char *buf, int count)
 +{
 +	struct hvc_opal_priv *priv = hvc_opal_privs[vtermno];
 +
 +	if (!WARN_ON(!priv))
 +		return hvsilib_put_chars(&priv->hvsi, buf, count);
 +
 +	return -ENODEV;
 +}
 +
 +/*
 + * hv_ops .notifier_add for HVSI consoles.
 + *
 + * Calls notifier_add_irq() first and returns its error if it fails;
 + * otherwise returns the result of hvsilib_open() for this terminal.
 + */
 +static int hvc_opal_hvsi_open(struct hvc_struct *hp, int data)
 +{
 +	struct hvc_opal_priv *priv = hvc_opal_privs[hp->vtermno];
 +	int err;
 +
 +	pr_devel("HVSI@%x: do open !\n", hp->vtermno);
 +
 +	err = notifier_add_irq(hp, data);
 +
 +	return err ? err : hvsilib_open(&priv->hvsi, hp);
 +}
 +
 +/*
 + * hv_ops .notifier_del for HVSI consoles: close the HVSI side of this
 + * terminal via hvsilib_close(), then call notifier_del_irq().
 + */
 +static void hvc_opal_hvsi_close(struct hvc_struct *hp, int data)
 +{
 +	struct hvsi_priv *hvsi = &hvc_opal_privs[hp->vtermno]->hvsi;
 +
 +	pr_devel("HVSI@%x: do close !\n", hp->vtermno);
 +
 +	hvsilib_close(hvsi, hp);
 +	notifier_del_irq(hp, data);
 +}
 +
 +/*
 + * hv_ops .notifier_hangup for HVSI consoles: close the HVSI side of this
 + * terminal via hvsilib_close(), then call notifier_hangup_irq().
 + *
 + * File-local like the sibling open/close callbacks; within this file it is
 + * only referenced through hvc_opal_hvsi_ops.
 + */
 +static void hvc_opal_hvsi_hangup(struct hvc_struct *hp, int data)
 +{
 +	struct hvc_opal_priv *pv = hvc_opal_privs[hp->vtermno];
 +
 +	pr_devel("HVSI@%x: do hangup !\n", hp->vtermno);
 +
 +	hvsilib_close(&pv->hvsi, hp);
 +
 +	notifier_hangup_irq(hp, data);
 +}
 +
 +/*
 + * hv_ops .tiocmget for HVSI consoles: return the mctrl value stored in the
 + * terminal's HVSI state, or -EINVAL when the terminal has no private data.
 + */
 +static int hvc_opal_hvsi_tiocmget(struct hvc_struct *hp)
 +{
 +	struct hvc_opal_priv *priv = hvc_opal_privs[hp->vtermno];
 +
 +	return priv ? priv->hvsi.mctrl : -EINVAL;
 +}
 +
 +/*
 + * hv_ops .tiocmset for HVSI consoles.
 + *
 + * Only DTR is acted upon: it is raised when present in @set, otherwise
 + * lowered when present in @clear.  All other bits are ignored.
 + *
 + * Returns 0, or -EINVAL when the terminal has no private data (the same
 + * answer hvc_opal_hvsi_tiocmget() gives in that case).
 + */
 +static int hvc_opal_hvsi_tiocmset(struct hvc_struct *hp, unsigned int set,
 +				unsigned int clear)
 +{
 +	struct hvc_opal_priv *pv = hvc_opal_privs[hp->vtermno];
 +
 +	pr_devel("HVSI@%x: Set modem control, set=%x,clr=%x\n",
 +		 hp->vtermno, set, clear);
 +
 +	/* Don't dereference a terminal slot that was never populated */
 +	if (!pv)
 +		return -EINVAL;
 +
 +	if (set & TIOCM_DTR)
 +		hvsilib_write_mctrl(&pv->hvsi, 1);
 +	else if (clear & TIOCM_DTR)
 +		hvsilib_write_mctrl(&pv->hvsi, 0);
 +
 +	return 0;
 +}
 +
 +/*
 + * hv_ops for HVSI-protocol consoles: every hook goes through the
 + * hvc_opal_hvsi_* wrappers above, which look up the per-terminal HVSI
 + * state before calling into the HVSI library.
 + */
 +static const struct hv_ops hvc_opal_hvsi_ops = {
 +	.get_chars = hvc_opal_hvsi_get_chars,
 +	.put_chars = hvc_opal_hvsi_put_chars,
 +	.notifier_add = hvc_opal_hvsi_open,
 +	.notifier_del = hvc_opal_hvsi_close,
 +	.notifier_hangup = hvc_opal_hvsi_hangup,
 +	.tiocmget = hvc_opal_hvsi_tiocmget,
 +	.tiocmset = hvc_opal_hvsi_tiocmset,
 +};
 +
 +/*
 + * Bind one OPAL console device-tree node.
 + *
 + * Picks the protocol from the node's compatible string, reads the terminal
 + * number from its "reg" property (0 when absent) and either adopts the
 + * early boot console state already registered for that number or allocates
 + * fresh per-terminal state.  Finally allocates the hvc_struct and stores it
 + * as driver data.
 + *
 + * Returns 0 on success, -ENXIO for an unknown protocol or an out-of-range
 + * or duplicate terminal number, -ENOMEM when the private data cannot be
 + * allocated, or the error carried by the pointer hvc_alloc() returns.
 + */
 +static int __devinit hvc_opal_probe(struct platform_device *dev)
 +{
 +	const struct hv_ops *ops;
 +	struct hvc_struct *hp;
 +	struct hvc_opal_priv *pv;
 +	hv_protocol_t proto;
 +	unsigned int termno, boot = 0;
 +	const __be32 *reg;
 +
 +	if (of_device_is_compatible(dev->dev.of_node, "ibm,opal-console-raw")) {
 +		proto = HV_PROTOCOL_RAW;
 +		ops = &hvc_opal_raw_ops;
 +	} else if (of_device_is_compatible(dev->dev.of_node,
 +					   "ibm,opal-console-hvsi")) {
 +		proto = HV_PROTOCOL_HVSI;
 +		ops = &hvc_opal_hvsi_ops;
 +	} else {
 +		pr_err("hvc_opal: Unkown protocol for %s\n",
 +		       dev->dev.of_node->full_name);
 +		return -ENXIO;
 +	}
 +
 +	reg = of_get_property(dev->dev.of_node, "reg", NULL);
 +	termno = reg ? be32_to_cpup(reg) : 0;
 +
 +	/* termno indexes hvc_opal_privs[], so reject anything past its end */
 +	if (termno >= MAX_NR_HVC_CONSOLES) {
 +		pr_err("hvc_opal: Device %s has out of range terminal number #%u\n",
 +		       dev->dev.of_node->full_name, termno);
 +		return -ENXIO;
 +	}
 +
 +	/* Is it our boot one ? */
 +	if (hvc_opal_privs[termno] == &hvc_opal_boot_priv) {
 +		pv = hvc_opal_privs[termno];
 +		boot = 1;
 +	} else if (hvc_opal_privs[termno] == NULL) {
 +		pv = kzalloc(sizeof(struct hvc_opal_priv), GFP_KERNEL);
 +		if (!pv)
 +			return -ENOMEM;
 +		pv->proto = proto;
 +		hvc_opal_privs[termno] = pv;
 +		if (proto == HV_PROTOCOL_HVSI)
 +			hvsilib_init(&pv->hvsi, opal_get_chars, opal_put_chars,
 +				     termno, 0);
 +
 +		/* Instantiate now to establish a mapping index==vtermno */
 +		hvc_instantiate(termno, termno, ops);
 +	} else {
 +		pr_err("hvc_opal: Device %s has duplicate terminal number #%d\n",
 +		       dev->dev.of_node->full_name, termno);
 +		return -ENXIO;
 +	}
 +
 +	pr_info("hvc%d: %s protocol on %s%s\n", termno,
 +		proto == HV_PROTOCOL_RAW ? "raw" : "hvsi",
 +		dev->dev.of_node->full_name,
 +		boot ? " (boot console)" : "");
 +
 +	/* We don't do IRQ yet */
 +	hp = hvc_alloc(termno, 0, ops, MAX_VIO_PUT_CHARS);
 +	if (IS_ERR(hp))
 +		return PTR_ERR(hp);
 +	dev_set_drvdata(&dev->dev, hp);
 +
 +	return 0;
 +}
 +
 +/*
 + * Unbind a console device.
 + *
 + * Removes the hvc_struct stored as driver data.  When hvc_remove() reports
 + * success, the terminal's private data is freed (unless it is the static
 + * boot console state) and its slot in hvc_opal_privs[] is cleared.
 + * Returns whatever hvc_remove() returned.
 + */
 +static int __devexit hvc_opal_remove(struct platform_device *dev)
 +{
 +	struct hvc_struct *hp = dev_get_drvdata(&dev->dev);
 +	int termno = hp->vtermno;	/* read before hp is removed */
 +	int rc = hvc_remove(hp);
 +
 +	if (rc != 0)
 +		return rc;
 +
 +	if (hvc_opal_privs[termno] != &hvc_opal_boot_priv)
 +		kfree(hvc_opal_privs[termno]);
 +	hvc_opal_privs[termno] = NULL;
 +
 +	return 0;
 +}
 +
 +/*
 + * Platform driver glue: probe/remove callbacks plus the device-tree match
 + * table that selects the OPAL console nodes.
 + */
 +static struct platform_driver hvc_opal_driver = {
 +	.probe		= hvc_opal_probe,
 +	.remove		= __devexit_p(hvc_opal_remove),
 +	.driver		= {
 +		.name	= hvc_opal_name,
 +		.owner	= THIS_MODULE,
 +		.of_match_table	= hvc_opal_match,
 +	}
 +};
 +
 +/*
 + * Module init: register the platform driver, but only when the firmware
 + * reports the OPAL feature.  Returns -ENODEV otherwise.
 + */
 +static int __init hvc_opal_init(void)
 +{
 +	if (firmware_has_feature(FW_FEATURE_OPAL))
 +		return platform_driver_register(&hvc_opal_driver);
 +
 +	return -ENODEV;
 +}
 +module_init(hvc_opal_init);
 +
 +/* Module exit: unregister the platform driver registered at init time. */
 +static void __exit hvc_opal_exit(void)
 +{
 +	platform_driver_unregister(&hvc_opal_driver);
 +}
 +module_exit(hvc_opal_exit);
 +
 +/*
 + * udbg putc hook: write one character to the boot console.
 + *
 + * A '\n' is preceded by a '\r'.  The write is retried for as long as the
 + * underlying put_chars routine returns 0 or -EAGAIN; any other result ends
 + * the loop.
 + */
 +static void udbg_opal_putc(char c)
 +{
 +	unsigned int termno = hvc_opal_boot_termno;
 +	int written = -1;
 +
 +	if (c == '\n')
 +		udbg_opal_putc('\r');
 +
 +	do {
 +		if (hvc_opal_boot_priv.proto == HV_PROTOCOL_RAW)
 +			written = opal_put_chars(termno, &c, 1);
 +		else if (hvc_opal_boot_priv.proto == HV_PROTOCOL_HVSI)
 +			written = hvc_opal_hvsi_put_chars(termno, &c, 1);
 +	} while (written == 0 || written == -EAGAIN);
 +}
 +
 +/*
 + * udbg getc_poll hook: try to read one character from the boot console
 + * without waiting.
 + *
 + * Returns the character read, or -1 when nothing was read.  A negative
 + * result from the underlying get_chars routine (hvc_opal_hvsi_get_chars()
 + * can return -ENODEV) is also reported as -1, because in that case 'c' was
 + * never written and must not be handed back to the caller.
 + */
 +static int udbg_opal_getc_poll(void)
 +{
 +	unsigned int termno = hvc_opal_boot_termno;
 +	int rc = 0;
 +	char c;
 +
 +	switch(hvc_opal_boot_priv.proto) {
 +	case HV_PROTOCOL_RAW:
 +		rc = opal_get_chars(termno, &c, 1);
 +		break;
 +	case HV_PROTOCOL_HVSI:
 +		rc = hvc_opal_hvsi_get_chars(termno, &c, 1);
 +		break;
 +	}
 +	/* Nothing read, or an error: 'c' holds no valid data */
 +	if (rc <= 0)
 +		return -1;
 +	return c;
 +}
 +
 +/*
 + * udbg getc hook: poll the boot console until a character is available
 + * and return it, busy-waiting between unsuccessful polls.
 + */
 +static int udbg_opal_getc(void)
 +{
 +	int ch;
 +
 +	while ((ch = udbg_opal_getc_poll()) == -1) {
 +		/* This shouldn't be needed...but... */
 +		volatile unsigned long spin;
 +
 +		for (spin = 0; spin < 2000000; spin++)
 +			;
 +	}
 +
 +	return ch;
 +}
 +
 +/*
 + * Point the udbg character hooks at the OPAL implementations above.
 + * Shared by the device-tree based early console setup and by the
 + * CONFIG_PPC_EARLY_DEBUG_OPAL_* entry points.
 + */
 +static void udbg_init_opal_common(void)
 +{
 +	udbg_putc = udbg_opal_putc;
 +	udbg_getc = udbg_opal_getc;
 +	udbg_getc_poll = udbg_opal_getc_poll;
 +	tb_ticks_per_usec = 0x200; /* Make udelay not suck */
 +}
 +
 +/*
 + * Set up the boot console and the udbg hooks from the device tree.
 + *
 + * The console node is taken from the "linux,stdout-path" property of
 + * /chosen when present; otherwise the first "serial" child of
 + * /ibm,opal/consoles (or, failing that, of /ibm,opal) is used.  The node's
 + * "reg" property gives the terminal number (0 when absent).  For a
 + * supported protocol the static boot private data is registered for that
 + * terminal, the udbg hooks are installed and the console is instantiated
 + * as the preferred "hvc" console.
 + *
 + * Returns silently when no usable console node is found, when the terminal
 + * number is out of range or when the protocol is not recognised.
 + */
 +void __init hvc_opal_init_early(void)
 +{
 +	struct device_node *stdout_node = NULL;
 +	const __be32 *termno;
 +	const char *name = NULL;
 +	const struct hv_ops *ops;
 +	u32 index;
 +
 +	/* find the boot console from /chosen/stdout */
 +	if (of_chosen)
 +		name = of_get_property(of_chosen, "linux,stdout-path", NULL);
 +	if (name) {
 +		stdout_node = of_find_node_by_path(name);
 +		if (!stdout_node) {
 +			pr_err("hvc_opal: Failed to locate default console!\n");
 +			return;
 +		}
 +	} else {
 +		struct device_node *opal, *np;
 +
 +		/* Current OPAL takeover doesn't provide the stdout
 +		 * path, so we hard wire it
 +		 */
 +		opal = of_find_node_by_path("/ibm,opal/consoles");
 +		if (opal)
 +			pr_devel("hvc_opal: Found consoles in new location\n");
 +		if (!opal) {
 +			opal = of_find_node_by_path("/ibm,opal");
 +			if (opal)
 +				pr_devel("hvc_opal: "
 +					 "Found consoles in old location\n");
 +		}
 +		if (!opal)
 +			return;
 +		for_each_child_of_node(opal, np) {
 +			if (!strcmp(np->name, "serial")) {
 +				stdout_node = np;
 +				break;
 +			}
 +		}
 +		of_node_put(opal);
 +	}
 +	if (!stdout_node)
 +		return;
 +	termno = of_get_property(stdout_node, "reg", NULL);
 +	/* Convert from big-endian, the same way hvc_opal_probe() reads "reg" */
 +	index = termno ? be32_to_cpup(termno) : 0;
 +	/* Leave through 'out' so the stdout_node reference is dropped */
 +	if (index >= MAX_NR_HVC_CONSOLES)
 +		goto out;
 +	hvc_opal_privs[index] = &hvc_opal_boot_priv;
 +
 +	/* Check the protocol */
 +	if (of_device_is_compatible(stdout_node, "ibm,opal-console-raw")) {
 +		hvc_opal_boot_priv.proto = HV_PROTOCOL_RAW;
 +		ops = &hvc_opal_raw_ops;
 +		pr_devel("hvc_opal: Found RAW console\n");
 +	}
 +	else if (of_device_is_compatible(stdout_node,"ibm,opal-console-hvsi")) {
 +		hvc_opal_boot_priv.proto = HV_PROTOCOL_HVSI;
 +		ops = &hvc_opal_hvsi_ops;
 +		hvsilib_init(&hvc_opal_boot_priv.hvsi, opal_get_chars,
 +			     opal_put_chars, index, 1);
 +		/* HVSI, perform the handshake now */
 +		hvsilib_establish(&hvc_opal_boot_priv.hvsi);
 +		pr_devel("hvc_opal: Found HVSI console\n");
 +	} else
 +		goto out;
 +	hvc_opal_boot_termno = index;
 +	udbg_init_opal_common();
 +	add_preferred_console("hvc", index, NULL);
 +	hvc_instantiate(index, index, ops);
 +out:
 +	of_node_put(stdout_node);
 +}
 +
 +#ifdef CONFIG_PPC_EARLY_DEBUG_OPAL_RAW
 +/*
 + * Early debug console using the raw protocol on the terminal number fixed
 + * at build time (CONFIG_PPC_EARLY_DEBUG_OPAL_VTERMNO): register the static
 + * boot private data for it and install the udbg hooks.
 + */
 +void __init udbg_init_debug_opal(void)
 +{
 +	u32 index = CONFIG_PPC_EARLY_DEBUG_OPAL_VTERMNO;
 +	hvc_opal_privs[index] = &hvc_opal_boot_priv;
 +	hvc_opal_boot_priv.proto = HV_PROTOCOL_RAW;
 +	hvc_opal_boot_termno = index;
 +	udbg_init_opal_common();
 +}
 +#endif /* CONFIG_PPC_EARLY_DEBUG_OPAL_RAW */
 +
 +#ifdef CONFIG_PPC_EARLY_DEBUG_OPAL_HVSI
 +/*
 + * Early debug console using the HVSI protocol on the terminal number fixed
 + * at build time (CONFIG_PPC_EARLY_DEBUG_OPAL_VTERMNO): register the static
 + * boot private data for it, install the udbg hooks, then initialise the
 + * HVSI state and perform the handshake.
 + */
 +void __init udbg_init_debug_opal_hvsi(void)
 +{
 +	u32 index = CONFIG_PPC_EARLY_DEBUG_OPAL_VTERMNO;
 +	hvc_opal_privs[index] = &hvc_opal_boot_priv;
 +	/*
 +	 * The udbg putc/getc hooks switch on proto; left at its zero
 +	 * default it would read as HV_PROTOCOL_RAW and bypass HVSI.
 +	 */
 +	hvc_opal_boot_priv.proto = HV_PROTOCOL_HVSI;
 +	hvc_opal_boot_termno = index;
 +	udbg_init_opal_common();
 +	hvsilib_init(&hvc_opal_boot_priv.hvsi, opal_get_chars, opal_put_chars,
 +		     index, 1);
 +	hvsilib_establish(&hvc_opal_boot_priv.hvsi);
 +}
 +#endif /* CONFIG_PPC_EARLY_DEBUG_OPAL_HVSI */
diff --cc include/linux/dmaengine.h
index ace51af4369f,1ceff5ae9d31..75f53f874b24
--- a/include/linux/dmaengine.h
+++ b/include/linux/dmaengine.h
@@@ -24,8 -24,11 +24,10 @@@
  #include <linux/device.h>
  #include <linux/uio.h>
  #include <linux/dma-direction.h>
 +#include <linux/scatterlist.h>
+ #include <linux/bitmap.h>
+ #include <asm/page.h>
  
 -struct scatterlist;
 -
  /**
   * typedef dma_cookie_t - an opaque DMA cookie
   *
diff --cc net/8021q/vlan_core.c
index 163397f1fd5a,1f64cc9da1b0..f5ffc02729d6
--- a/net/8021q/vlan_core.c
+++ b/net/8021q/vlan_core.c
@@@ -2,9 -2,10 +2,10 @@@
  #include <linux/netdevice.h>
  #include <linux/if_vlan.h>
  #include <linux/netpoll.h>
+ #include <linux/export.h>
  #include "vlan.h"
  
 -bool vlan_do_receive(struct sk_buff **skbp)
 +bool vlan_do_receive(struct sk_buff **skbp, bool last_handler)
  {
  	struct sk_buff *skb = *skbp;
  	u16 vlan_id = skb->vlan_tci & VLAN_VID_MASK;